From c4c818b843611b0578d25c4978a9d58eb9b49307 Mon Sep 17 00:00:00 2001
From: Ethan
Date: Tue, 12 Jul 2022 17:12:13 -0700
Subject: [PATCH 01/55] parser_mht

---
 hls4ml/converters/keras/multiheadattention.py | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 hls4ml/converters/keras/multiheadattention.py

diff --git a/hls4ml/converters/keras/multiheadattention.py b/hls4ml/converters/keras/multiheadattention.py
new file mode 100644
index 0000000000..a3aed6dfa0
--- /dev/null
+++ b/hls4ml/converters/keras/multiheadattention.py
@@ -0,0 +1,41 @@
+from hls4ml.converters.keras_to_hls import parse_default_keras_layer
+from hls4ml.converters.keras_to_hls import keras_handler
+
+@keras_handler('MultiHeadAttention')
+def parse_conv1d_layer(keras_layer, input_names, input_shapes, data_reader, config):
+    # assume input_shapes is: [[None, seq, dim]]
+    assert('MultiHeadAttention' in keras_layer['class_name'])
+    assert(input_shapes[0]==keras_layer['config']['query_shape'])
+
+    layer = parse_default_keras_layer(keras_layer, input_names)
+
+    layer['num_heads'] = keras_layer['config']['num_heads']
+    layer['head_dim_key'] = keras_layer['config']['key_dim']
+    layer['head_dim_value'] = keras_layer['config']['value_dim']
+    layer['query_shape'] = keras_layer['config']['query_shape']
+    layer['key_shape'] = keras_layer['config']['key_shape']
+    layer['value_shape'] = keras_layer['config']['value_shape']
+    layer['feature_dim'] = layer['query_shape'][-1]
+    # seq_length is not included, because it varies each time.
+    # the maximum of seq_length is not defined in the config file.
+
+    if keras_layer['config']['output_shape']:
+        # out_shape = keras_layer['config']['output_shape']
+        # out_shape = (layer['query_shape'][:2]).extend(out_shape)
+        raise Exception('hls4ml does not support a defined output shape, the output shape must equal the query shape')
+    else: # by default output_shape in the config is False, so the output shape is set equal to the query shape
+        out_shape = layer['query_shape']
+
+    output_shape = layer['output_shape']
+
+    layer['attention_axes'] = keras_layer['config']['attention_axes'] if (keras_layer['config']['attention_axes'][0]==1) else False
+    if layer['attention_axes'] is False:
+        raise Exception('assigning the attention_axes is not currently supported by hls4ml')
+
+    if not (len(layer['query_shape']) == 3 and len(layer['key_shape']) == 3 and len(layer['value_shape']) == 3):
+        raise Exception('multi-dimensional feature dims are not currently supported by hls4ml')
+
+    attn_scores_rank = 4
+    layer['softmax_axis'] = tuple(range(attn_scores_rank - len(layer['attention_axes']), attn_scores_rank ))
+
+    return layer, output_shape
\ No newline at end of file

From 3ee64d1708fb0bde696588a9e638382b18973426 Mon Sep 17 00:00:00 2001
From: Ethan
Date: Tue, 12 Jul 2022 21:12:13 -0700
Subject: [PATCH 02/55] change parser and modify keras_to_hls

---
 hls4ml/converters/keras/multiheadattention.py |  8 +++-----
 hls4ml/converters/keras_to_hls.py             | 14 ++++++++++++--
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/hls4ml/converters/keras/multiheadattention.py b/hls4ml/converters/keras/multiheadattention.py
index a3aed6dfa0..51e041e9be 100644
--- a/hls4ml/converters/keras/multiheadattention.py
+++ b/hls4ml/converters/keras/multiheadattention.py
@@ -20,13 +20,11 @@ def parse_conv1d_layer(keras_layer, input_names, input_shapes, data_reader, conf
     # the maximum of seq_length is not defined in the config file.
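The handler above pulls every size it needs out of the layer's serialized config. For orientation, a minimal sketch of the dict it expects, shaped like the output of Keras's MultiHeadAttention.get_config() (the exact key set varies across TensorFlow versions, so treat the keys here as an assumption):

    keras_layer = {
        'class_name': 'MultiHeadAttention',
        'config': {
            'num_heads': 4,
            'key_dim': 16,                   # -> layer['head_dim_key']
            'value_dim': 16,                 # -> layer['head_dim_value']
            'query_shape': [None, 100, 64],  # -> feature_dim = 64
            'key_shape': [None, 100, 64],
            'value_shape': [None, 100, 64],
            'output_shape': None,            # falsy: output shape follows the query shape
            'attention_axes': [1],           # only attention over axis 1 is supported
        },
    }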
     if keras_layer['config']['output_shape']:
-        # out_shape = keras_layer['config']['output_shape']
-        # out_shape = (layer['query_shape'][:2]).extend(out_shape)
+        # output_shape = keras_layer['config']['output_shape']
+        # output_shape = (layer['query_shape'][:2]).extend(out_shape)
         raise Exception('hls4ml does not support a defined output shape, the output shape must equal the query shape')
     else: # by default output_shape in the config is False, so the output shape is set equal to the query shape
-        out_shape = layer['query_shape']
-
-    output_shape = layer['output_shape']
+        output_shape = layer['query_shape']
 
     layer['attention_axes'] = keras_layer['config']['attention_axes'] if (keras_layer['config']['attention_axes'][0]==1) else False
     if layer['attention_axes'] is False:
diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py
index f1150be15e..433d095598 100644
--- a/hls4ml/converters/keras_to_hls.py
+++ b/hls4ml/converters/keras_to_hls.py
@@ -283,7 +283,12 @@ def parse_keras_model(model_arch, reader):
 
         # Extract inbound nodes
         if 'inbound_nodes' in keras_layer and len(keras_layer['inbound_nodes']) > 0:
-            input_names = [inputs_map.get(inp[0], inp[0]) for inp in keras_layer['inbound_nodes'][0]]
+            input_names = [ inputs_map.get(inp[0], inp[0]) for inp in keras_layer['inbound_nodes'][0] ] # why using inputs_map.get?
+            if keras_layer['inbound_nodes'][0][0][-1]: # multi_head_attention has inbound: [[['input_3', 0, 0, {'value': ['dense_3', 0, 0]}]]]
+                inputname2 = list(keras_layer['inbound_nodes'][0][0][-1].values())
+                input_names+=[inp[0] for inp in inputname2]
+                # print("input_names: ", input_names)
+
         else:
             input_names = None
 
@@ -334,5 +339,10 @@ def keras_to_hls(config):
     model_arch, reader = get_model_arch(config)
     layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader)
     print('Creating HLS model')
-    hls_model = ModelGraph(config, layer_list, input_layers, output_layers)
+    # config, a dict, has key: 'HLSConfig' and 'KerasModel'
+    # reader can read all the weight and bias
+    # layer_list a list of dict, each element is one layer output from parser
+    # input_layers, a list
+    # output layers, a list
+    hls_model = ModelGraph(config, reader, layer_list, input_layers, output_layers)
     return hls_model

From 5626a1a5a4719a407e354cfa560e680d4651d25e Mon Sep 17 00:00:00 2001
From: Ethan
Date: Thu, 14 Jul 2022 13:46:03 -0700
Subject: [PATCH 03/55] IR_multihead_attention

---
 hls4ml/converters/keras/multiheadattention.py |  2 +-
 hls4ml/model/layers.py                        | 77 ++++++++++++++++++-
 2 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/hls4ml/converters/keras/multiheadattention.py b/hls4ml/converters/keras/multiheadattention.py
index 51e041e9be..67ba16895f 100644
--- a/hls4ml/converters/keras/multiheadattention.py
+++ b/hls4ml/converters/keras/multiheadattention.py
@@ -34,6 +34,6 @@ def parse_conv1d_layer(keras_layer, input_names, input_shapes, data_reader, conf
         raise Exception('multi-dimensional feature dims are not currently supported by hls4ml')
 
     attn_scores_rank = 4
-    layer['softmax_axis'] = tuple(range(attn_scores_rank - len(layer['attention_axes']), attn_scores_rank ))
+    layer['softmax_axis'] = list(range(attn_scores_rank - len(layer['attention_axes']), attn_scores_rank ))
 
     return layer, output_shape
\ No newline at end of file
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index d8d1fb9c8f..d54bc7c498 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -1071,7 +1071,7 @@ def initialize(self):
 class SimpleRNN(Layer):
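The inbound-node handling added in patch 02 above is easiest to see on the structure quoted in its comment. A self-contained sketch in plain Python:

    inbound_nodes = [[['input_3', 0, 0, {'value': ['dense_3', 0, 0]}]]]

    input_names = [inp[0] for inp in inbound_nodes[0]]           # ['input_3']
    call_kwargs = inbound_nodes[0][0][-1]                        # {'value': ['dense_3', 0, 0]}
    if call_kwargs:
        input_names += [inp[0] for inp in call_kwargs.values()]  # ['input_3', 'dense_3']

For ordinary layers the trailing element is typically an empty dict of call arguments, so the truthiness test skips them; an explicit isinstance(call_kwargs, dict) guard would arguably be more robust.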
_expected_attributes = [ Attribute('n_out'), - Attribute('activation', value_type=str), + Attribute('activation', value_type=str), Attribute('return_sequences', value_type=bool, default=False), Attribute('return_state', value_type=bool, default=False), ChoiceAttribute('direction', ['forward', 'backward'], default='forward'), @@ -1118,11 +1118,11 @@ def initialize(self): class LSTM(Layer): _expected_attributes = [ Attribute('n_out'), - Attribute('activation', value_type=str), + Attribute('activation', value_type=str), ## not defined in the paser, when do we need this data Attribute('recurrent_activation', value_type=str), Attribute('return_sequences', value_type=bool, default=False), Attribute('return_state', value_type=bool, default=False), - ChoiceAttribute('direction', ['forward', 'backward'], default='forward'), + ChoiceAttribute('direction', ['forward', 'backward'], default='forward'), ## not defined in the paser, how can we know when to be backward Attribute('time_major', value_type=bool, default=False), WeightAttribute('weight'), WeightAttribute('bias'), @@ -1413,6 +1413,75 @@ def initialize(self): self.add_output_variable([len(self.get_attr('expression'))], [f'N_OUTPUTS_{self.index}'], var_name='y') +class MultiHeadAttention(Layer): + _expected_attributes = [ + # does Attribute define the i/o for the template? + Attribute('num_heads'), + Attribute('head_dim_key'), + Attribute('head_dim_value'), + # Attribute('query_shape'), // I guess we do not need shape here? + # Attribute('key_shape'), + # Attribute('value_shape'), + Attribute('feature_dim'), # 'feature_dim' is n_out and n_in + # Attribute('attention_axes'), + # Attribute('softmax_axis'), + + WeightAttribute('attention_output_weight'), + WeightAttribute('attention_output_bias'), + WeightAttribute('key_weight'), + WeightAttribute('key_bias'), + WeightAttribute('query_weight'), + WeightAttribute('query_bias'), + WeightAttribute('value_weight'), + WeightAttribute('value_bias'), + + TypeAttribute('attention_output_weight'), + TypeAttribute('attention_output_bias'), + TypeAttribute('key_weight'), + TypeAttribute('key_bias'), + TypeAttribute('query_weight'), + TypeAttribute('query_bias'), + TypeAttribute('value_weight'), + TypeAttribute('value_bias'), + ] + + def initialize(self): + + # is these two needed? + # self.add_weights() + # self.add_bias() + + recurrent_weight = self.model.get_weights_data(self.name, 'recurrent_kernel') + self.add_weights_variable(name='recurrent_weight', var_name='wr{index}', data=recurrent_weight) + + weights_source = [ + ('attention_output', 'kernel'), + ('attention_output', 'bias'), + ('key', 'kernel'), + ('key', 'bias'), + ('query', 'kernel'), + ('query', 'bias'), + ('value', 'kernel'), + ('value', 'bias'), + ] + + for lname, wtype in weights_source: ##/ + data = self.model.get_weights_data(self.name, '{name}/{lname}_{wtype}:0'.format(name=self.name, lname=lname, wtype=wtype)) + if wtype == 'kernel': + # data = data.transpose((1, 0)) # reshaping need a discussion + vtype = 'weights' + else: + vtype = 'biases' + + name = '{}_{}'.format(lname, vtype) + var_name = '{}_{}{{index}}'.format(lname, vtype) + + self._add_variable(name, var_name, data, frac_width=10, quantize=False) # how to decide frac_width + + shape = self.attributes['query_shape'][1:] ## how to deal with the case that seq_len is undefined? 
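The weights_source loop above requests eight tensors from the reader; expanding the format string by hand shows the names involved (the '{name}/{lname}_{wtype}:0' layout mirrors how Keras names the MHA sublayer variables, and is revised to '{lname}/{wtype}' in the next patch):

    layer_name = 'multi_head_attention'
    weights_source = [
        ('attention_output', 'kernel'), ('attention_output', 'bias'),
        ('key', 'kernel'), ('key', 'bias'),
        ('query', 'kernel'), ('query', 'bias'),
        ('value', 'kernel'), ('value', 'bias'),
    ]
    for lname, wtype in weights_source:
        print('{name}/{lname}_{wtype}:0'.format(name=layer_name, lname=lname, wtype=wtype))
    # multi_head_attention/attention_output_kernel:0
    # multi_head_attention/attention_output_bias:0
    # ... and likewise for key, query and value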
+ dims = ['seq_out_{}'.format(self.index), 'feature_out_{}'.format(self.index)] + self.add_output_variable(shape, dims) + layer_map = { 'Input': Input, 'InputLayer': Input, @@ -1473,6 +1542,8 @@ def initialize(self): 'GarNetStack': GarNetStack, 'LayerGroup': LayerGroup, 'SymbolicExpression': SymbolicExpression, + 'MultiHeadAttention' : MultiHeadAttention, + # TensorFlow-specific layers: 'BiasAdd': BiasAdd, } From d51f8a9e755b3f89cf91a71a4ba0a43f38a3bfa8 Mon Sep 17 00:00:00 2001 From: Ethan Date: Fri, 15 Jul 2022 14:07:33 -0700 Subject: [PATCH 04/55] IR done --- hls4ml/model/layers.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index d54bc7c498..3e9a6de98d 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1451,9 +1451,6 @@ def initialize(self): # self.add_weights() # self.add_bias() - recurrent_weight = self.model.get_weights_data(self.name, 'recurrent_kernel') - self.add_weights_variable(name='recurrent_weight', var_name='wr{index}', data=recurrent_weight) - weights_source = [ ('attention_output', 'kernel'), ('attention_output', 'bias'), @@ -1465,20 +1462,18 @@ def initialize(self): ('value', 'bias'), ] - for lname, wtype in weights_source: ##/ - data = self.model.get_weights_data(self.name, '{name}/{lname}_{wtype}:0'.format(name=self.name, lname=lname, wtype=wtype)) + for lname, wtype in weights_source: + data = self.model.get_weights_data(self.name, '{lname}/{wtype}'.format(lname=lname, wtype=wtype)) if wtype == 'kernel': - # data = data.transpose((1, 0)) # reshaping need a discussion - vtype = 'weights' + vtype = 'weight' else: - vtype = 'biases' + vtype = 'bias' name = '{}_{}'.format(lname, vtype) var_name = '{}_{}{{index}}'.format(lname, vtype) - - self._add_variable(name, var_name, data, frac_width=10, quantize=False) # how to decide frac_width + self.add_weights_variable(name=name, var_name=var_name, data=data) - shape = self.attributes['query_shape'][1:] ## how to deal with the case that seq_len is undefined? + shape = self.attributes['query_shape'][1:] dims = ['seq_out_{}'.format(self.index), 'feature_out_{}'.format(self.index)] self.add_output_variable(shape, dims) From 89025a2be63f5ee49f455a8c09cb2e8b2f788c13 Mon Sep 17 00:00:00 2001 From: Ethan Date: Mon, 18 Jul 2022 17:36:17 -0700 Subject: [PATCH 05/55] create mha file in template --- hls4ml/model/graph.py | 3 +- hls4ml/model/layers.py | 5 -- .../nnet_utils/nnet_multiheadattention.h | 63 +++++++++++++++++++ 3 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index d0a1fdf7fc..8138d91bb3 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -588,7 +588,8 @@ def remove_node(self, node, rewire=True): next_node.inputs[i] = prev_node.outputs[0] break else: - if not node.outputs[0] in self.outputs: + # if not node.outputs[0] in self.outputs: ## would this be the key output_vars? 
because the self.outputs is the model final output + if not node.outputs[0] in self.output_vars.keys(): ## my change raise Exception('Cannot rewire a node without child') else: raise Exception('Cannot rewire a node without a parent') diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 3e9a6de98d..aff84cb86d 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1419,12 +1419,7 @@ class MultiHeadAttention(Layer): Attribute('num_heads'), Attribute('head_dim_key'), Attribute('head_dim_value'), - # Attribute('query_shape'), // I guess we do not need shape here? - # Attribute('key_shape'), - # Attribute('value_shape'), Attribute('feature_dim'), # 'feature_dim' is n_out and n_in - # Attribute('attention_axes'), - # Attribute('softmax_axis'), WeightAttribute('attention_output_weight'), WeightAttribute('attention_output_bias'), diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h new file mode 100644 index 0000000000..2e13aab1a9 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -0,0 +1,63 @@ +#ifndef NNET_MHT_H_ +#define NNET_MHT_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include "nnet_dense.h" +#include "hls_stream.h" +#include + +namespace nnet { + +struct multiheadattention_config +{ + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; // where this type will be used + + // Layer Sizes + // static const unsigned n_in = 10; + // static const unsigned n_out = 10; + static const unsigned num_heads = 10; + static const unsigned head_dim_key = 10; + static const unsigned head_dim_value = 10; + static const unsigned feature_dim = 10; + + // Resource reuse info // not sure how to write this part + static const unsigned io_type = io_parallel; + static const unsigned strategy = latency; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; // where is defined? meaning? + static const bool use_static = true; // where is defined? meaning? 
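Before the kernel below takes shape it helps to pin down the flat array sizes this configuration implies. A quick plain-Python check with small example values (the (dim, num_heads, head_size) weight layout follows the comments added to the function signature in the next patch):

    num_heads, head_dim_key, head_dim_value, feature_dim, seq_len = 2, 3, 3, 8, 10

    qk_weight = feature_dim * num_heads * head_dim_key    # query/key kernels
    v_weight = feature_dim * num_heads * head_dim_value   # value kernel
    o_weight = num_heads * head_dim_value * feature_dim   # output projection
    qk_bias = num_heads * head_dim_key                    # query/key biases
    io_size = seq_len * feature_dim                       # flattened input/output
    print(qk_weight, v_weight, o_weight, qk_bias, io_size)  # 48 48 48 6 80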
+
+    template<class x_T, class y_T>
+    using product = nnet::product::mult<x_T, y_T>;
+};
+
+template<class data_T, class res_T, typename CONFIG_T>
+void multiheadattention(
+    data_T    data_q[CONFIG_T::n_in],
+    data_T    data_vk[CONFIG_T::n_in],
+    res_T     res[CONFIG_T::n_out],
+    typename CONFIG_T::weight_t  attention_output_weight[CONFIG_T::xxx*CONFIG_T::xxx],
+    typename CONFIG_T::bias_t    attention_output_bias[CONFIG_T::xxx],
+    typename CONFIG_T::weight_t  key_weight[CONFIG_T::xxx*CONFIG_T::xxx],
+    typename CONFIG_T::bias_t    key_bias[CONFIG_T::xxx],
+    typename CONFIG_T::weight_t  query_weight[CONFIG_T::xxx*CONFIG_T::xxx],
+    typename CONFIG_T::bias_t    query_bias[CONFIG_T::xxx],
+    typename CONFIG_T::weight_t  value_weight[CONFIG_T::xxx*CONFIG_T::xxx],
+    typename CONFIG_T::bias_t    value_bias[CONFIG_T::xxx])
+{
+    #pragma HLS inline
+    if (CONFIG_T::strategy == nnet::latency) {
+        dense_latency<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else {
+        dense_resource<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    }
+}
+
+}
+
+#endif

From d76cf60efc967bbc61b6cc4baec5d75b2355bb86 Mon Sep 17 00:00:00 2001
From: Ethan
Date: Tue, 19 Jul 2022 12:24:23 -0700
Subject: [PATCH 06/55] mha .h file dummy algo

---
 hls4ml/converters/keras/multiheadattention.py |  3 +-
 hls4ml/model/layers.py                        |  3 +-
 .../nnet_utils/nnet_multiheadattention.h      | 37 +++++++++----------
 3 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/hls4ml/converters/keras/multiheadattention.py b/hls4ml/converters/keras/multiheadattention.py
index 67ba16895f..bc2120ae0f 100644
--- a/hls4ml/converters/keras/multiheadattention.py
+++ b/hls4ml/converters/keras/multiheadattention.py
@@ -16,8 +16,7 @@ def parse_conv1d_layer(keras_layer, input_names, input_shapes, data_reader, conf
     layer['key_shape'] = keras_layer['config']['key_shape']
     layer['value_shape'] = keras_layer['config']['value_shape']
     layer['feature_dim'] = layer['query_shape'][-1]
-    # seq_length is not included, because it varies each time.
-    # the maximum of seq_length is not defined in the config file.
+    layer['seq_len'] = layer['query_shape'][-2]
 
     if keras_layer['config']['output_shape']:
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index aff84cb86d..5ea544f13b 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -1419,7 +1419,8 @@ class MultiHeadAttention(Layer):
         Attribute('num_heads'),
         Attribute('head_dim_key'),
         Attribute('head_dim_value'),
-        Attribute('feature_dim'), # 'feature_dim' is n_out and n_in
+        Attribute('feature_dim'),
+        Attribute('seq_len'),
 
         WeightAttribute('attention_output_weight'),
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
index 2e13aab1a9..2ad7b6a3a4 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
@@ -29,8 +29,8 @@ struct multiheadattention_config
     static const unsigned strategy = latency;
     static const unsigned reuse_factor = 1;
     static const bool store_weights_in_bram = false;
-    static const unsigned n_zeros = 0; // where is defined? meaning?
-    static const bool use_static = true; // where is defined? meaning?
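Fixing the sequence length at compile time is what unblocks concrete array bounds: seq_len comes from the static query shape recorded by the parser, so the placeholder CONFIG_T::xxx dimensions can become products of compile-time constants. Restated in a couple of lines of Python:

    query_shape = [None, 100, 64]  # (batch, seq, features) as stored by the parser
    seq_len, feature_dim = query_shape[-2], query_shape[-1]
    assert seq_len is not None, 'HLS arrays need a static sequence length'

One caution on the dummy kernel this patch adds below: its loop bound i <= (CONFIG_T::seq_len * CONFIG_T::feature_dim) steps one element past the arrays; the strict < bound adopted for the loops later in the series is the safe form.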
template using product = nnet::product::mult; @@ -38,24 +38,23 @@ struct multiheadattention_config template void multiheadattention( - data_T data_q[CONFIG_T::n_in], - data_T data_vk[CONFIG_T::n_in], - res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t attention_output_weight[CONFIG_T::xxx*CONFIG_T::xxx], - typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::xxx], - typename CONFIG_T::weight_t key_weight[CONFIG_T::xxx*CONFIG_T::xxx], - typename CONFIG_T::bias_t key_bias[CONFIG_T::xxx], - typename CONFIG_T::weight_t query_weight[CONFIG_T::xxx*CONFIG_T::xxx], - typename CONFIG_T::bias_t query_bias[CONFIG_T::xxx], - typename CONFIG_T::weight_t value_weight[CONFIG_T::xxx*CONFIG_T::xxx], - typename CONFIG_T::bias_t value_bias[CONFIG_T::xxx]) + data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], + data_T data_vk[CONFIG_T::seq_len * CONFIG_T::feature_dim], + res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim], + typename CONFIG_T::weight_t attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value * CONFIG_T::feature_dim], // num_heads,head_size_v,dim + typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim], + typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], // (dim,num_head,head_size) + typename CONFIG_T::bias_t key_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key], + typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], //same shape as key + typename CONFIG_T::bias_t query_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key], + typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_value], + typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { - #pragma HLS inline - if (CONFIG_T::strategy == nnet::latency) { - dense_latency(data, res, weights, biases); - } else { - dense_resource(data, res, weights, biases); - } + // a dummy algo + for (i=0; i <= (CONFIG_T::seq_len * CONFIG_T::feature_dim); i++) + { + res[i] = data_q[i] + data_vk[i]; + } } } From 56811de54e4c4b1c1f49694886d8935ad03ecad2 Mon Sep 17 00:00:00 2001 From: Ethan Date: Wed, 20 Jul 2022 18:09:26 -0700 Subject: [PATCH 07/55] config of mha --- .../vivado/passes/transformer_templates.py | 73 +++++++++++++++++++ hls4ml/converters/keras/multiheadattention.py | 2 +- .../nnet_utils/nnet_multiheadattention.h | 5 +- 3 files changed, 75 insertions(+), 5 deletions(-) create mode 100644 hls4ml/backends/vivado/passes/transformer_templates.py diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py new file mode 100644 index 0000000000..06304013b7 --- /dev/null +++ b/hls4ml/backends/vivado/passes/transformer_templates.py @@ -0,0 +1,73 @@ + +from hls4ml.backends.backend import get_backend +from hls4ml.model.layers import MultiHeadAttention +from hls4ml.backends.template import LayerConfigTemplate, FunctionCallTemplate + +mha_config_template = """struct config{index} : nnet::multiHeadAttention_config {{ + typedef {accum_t.name} accum_t; + typedef {attention_output_bias_t.name} bias_t; + typedef {attention_output_weight_t.name} weight_t; + + static const unsigned num_heads = {num_heads}; + static const unsigned head_dim_key = {head_dim_key}; + static const unsigned head_dim_value = {head_dim_value}; + static const unsigned feature_dim = {feature_dim}; + static const unsigned feature_dim = {seq_len}; + + static const unsigned io_type = 
nnet::{iotype}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; + +}};\n""" +#/////////////////////////////////// +mha_function_template = 'nnet::multiheadattention<{input_t}, {output_t}, {config}>({input_q}, {input_kv}, {output}, {w_o}, {b_o}, {w_k}, {b_k}, {w_q}, {b_q}, {w_v}, {b_v});' + +mha_include_list = ['nnet_utils/nnet_multiheadattention.h'] + +class MhaConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(MultiHeadAttention) + self.template = mha_config_template + # self.mult1_template = recr_mult_config_template + + def format(self, node): + + params = self._default_config_params(node) + + params['num_heads'] = node.get_attr('num_heads') + params['head_dim_key'] = node.get_attr('head_dim_key') + params['head_dim_value'] = node.get_attr('head_dim_value') + params['feature_dim'] = node.get_attr('feature_dim') + params['seq_len'] = node.get_attr('seq_len') + + mht_config = self.template.format(**params) + + return mht_config + +class RecurrentFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(MultiHeadAttention, include_header=mha_include_list) + self.template = mha_function_template + + def format(self, node): + params = {} + params.update(node.attributes) + params['config'] = 'config{}'.format(node.index) + params['input_t'] = node.get_input_variable().type.name + params['output_t'] = node.get_output_variable().type.name + + params['input_q'] = node.model.get_layer_output_variable(self.inputs[0]).name + params['input_kv'] = node.model.get_layer_output_variable(self.inputs[1]).name + params['output'] = node.get_output_variable().name + params['w_o'] = node.get_weights('attention_output_weight').name + params['b_o'] = node.get_weights('attention_output_bias').name + params['w_k'] = node.get_weights('key_weight').name + params['b_k'] = node.get_weights('key_bias').name + params['w_q'] = node.get_weights('query_weight').name + params['b_q'] = node.get_weights('query_bias').name + params['w_v'] = node.get_weights('value_weight').name + params['b_v'] = node.get_weights('value_bias').name + + return self.template.format(**params) + diff --git a/hls4ml/converters/keras/multiheadattention.py b/hls4ml/converters/keras/multiheadattention.py index bc2120ae0f..2b1e6322ce 100644 --- a/hls4ml/converters/keras/multiheadattention.py +++ b/hls4ml/converters/keras/multiheadattention.py @@ -2,7 +2,7 @@ from hls4ml.converters.keras_to_hls import keras_handler @keras_handler('MultiHeadAttention') -def parse_conv1d_layer(keras_layer, input_names, input_shapes, data_reader, config): +def parse_mutiheadattention_layer(keras_layer, input_names, input_shapes, data_reader, config): # assume input_shapes is: [[None, seq, dim]] assert('MultiHeadAttention' in keras_layer['class_name']) assert(input_shapes[0]==keras_layer['config']['query_shape']) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index 2ad7b6a3a4..1c8de94a3f 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -17,20 +17,17 @@ struct multiheadattention_config typedef float accum_t; // where this type will be used // Layer Sizes - // static const unsigned n_in = 10; - // static const unsigned n_out = 10; static const unsigned num_heads = 10; static const unsigned head_dim_key = 10; static const unsigned 
head_dim_value = 10; static const unsigned feature_dim = 10; + static const unsigned seq_len = 500; // Resource reuse info // not sure how to write this part static const unsigned io_type = io_parallel; static const unsigned strategy = latency; static const unsigned reuse_factor = 1; static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; // where is it defined? meaning? - static const bool use_static = true; // where is it defined? meaning? template using product = nnet::product::mult; From 45cd493edcac06b1414cc309157c255f23b45c4a Mon Sep 17 00:00:00 2001 From: Ethan Date: Wed, 20 Jul 2022 18:31:14 -0700 Subject: [PATCH 08/55] update mha config --- hls4ml/backends/vivado/passes/transformer_templates.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py index 06304013b7..8dcb0c44b0 100644 --- a/hls4ml/backends/vivado/passes/transformer_templates.py +++ b/hls4ml/backends/vivado/passes/transformer_templates.py @@ -15,7 +15,6 @@ static const unsigned feature_dim = {seq_len}; static const unsigned io_type = nnet::{iotype}; - static const unsigned strategy = nnet::{strategy}; static const unsigned reuse_factor = {reuse}; static const bool store_weights_in_bram = false; @@ -57,8 +56,8 @@ def format(self, node): params['input_t'] = node.get_input_variable().type.name params['output_t'] = node.get_output_variable().type.name - params['input_q'] = node.model.get_layer_output_variable(self.inputs[0]).name - params['input_kv'] = node.model.get_layer_output_variable(self.inputs[1]).name + params['input_q'] = node.model.get_layer_output_variable(node.inputs[0]).name + params['input_kv'] = node.model.get_layer_output_variable(node.inputs[1]).name params['output'] = node.get_output_variable().name params['w_o'] = node.get_weights('attention_output_weight').name params['b_o'] = node.get_weights('attention_output_bias').name From 1402f48cd532c8bdbdc87135e44f987a1f466cdb Mon Sep 17 00:00:00 2001 From: Ethan Date: Wed, 20 Jul 2022 21:39:08 -0700 Subject: [PATCH 09/55] dummy mha --- hls4ml/backends/vivado/passes/transformer_templates.py | 7 +++---- .../templates/vivado/nnet_utils/nnet_multiheadattention.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py index 8dcb0c44b0..34eaaf1214 100644 --- a/hls4ml/backends/vivado/passes/transformer_templates.py +++ b/hls4ml/backends/vivado/passes/transformer_templates.py @@ -3,7 +3,7 @@ from hls4ml.model.layers import MultiHeadAttention from hls4ml.backends.template import LayerConfigTemplate, FunctionCallTemplate -mha_config_template = """struct config{index} : nnet::multiHeadAttention_config {{ +mha_config_template = """struct config{index} : nnet::multiheadattention_config {{ typedef {accum_t.name} accum_t; typedef {attention_output_bias_t.name} bias_t; typedef {attention_output_weight_t.name} weight_t; @@ -12,14 +12,13 @@ static const unsigned head_dim_key = {head_dim_key}; static const unsigned head_dim_value = {head_dim_value}; static const unsigned feature_dim = {feature_dim}; - static const unsigned feature_dim = {seq_len}; + static const unsigned seq_len = {seq_len}; static const unsigned io_type = nnet::{iotype}; static const unsigned reuse_factor = {reuse}; static const bool store_weights_in_bram = false; - }};\n""" -#/////////////////////////////////// + 
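These code-generation strings are rendered with plain str.format, which is why every literal C++ brace in them is doubled. A stripped-down sketch of what gets emitted for a layer with index 3 (parameter values made up for illustration):

    template = (
        'struct config{index} : nnet::multiheadattention_config {{\n'
        '    static const unsigned num_heads = {num_heads};\n'
        '    static const unsigned seq_len = {seq_len};\n'
        '    static const unsigned reuse_factor = {reuse};\n'
        '}};\n'
    )
    print(template.format(index=3, num_heads=4, seq_len=100, reuse=1))
    # struct config3 : nnet::multiheadattention_config {
    #     static const unsigned num_heads = 4;
    #     static const unsigned seq_len = 100;
    #     static const unsigned reuse_factor = 1;
    # };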
mha_function_template = 'nnet::multiheadattention<{input_t}, {output_t}, {config}>({input_q}, {input_kv}, {output}, {w_o}, {b_o}, {w_k}, {b_k}, {w_q}, {b_q}, {w_v}, {b_v});' mha_include_list = ['nnet_utils/nnet_multiheadattention.h'] diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index 1c8de94a3f..275e5156e6 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -48,7 +48,7 @@ void multiheadattention( typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { // a dummy algo - for (i=0; i <= (CONFIG_T::seq_len * CONFIG_T::feature_dim); i++) + for (int i=0; i <= (CONFIG_T::seq_len * CONFIG_T::feature_dim); i++) { res[i] = data_q[i] + data_vk[i]; } From 430b9eac96cbb3546f2818ebe000f143f9e1ff6f Mon Sep 17 00:00:00 2001 From: Ethan Date: Sat, 23 Jul 2022 11:26:36 -0700 Subject: [PATCH 10/55] add transpose into mha --- hls4ml/model/layers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 5ea544f13b..cc09feefc7 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1415,7 +1415,6 @@ def initialize(self): class MultiHeadAttention(Layer): _expected_attributes = [ - # does Attribute define the i/o for the template? Attribute('num_heads'), Attribute('head_dim_key'), Attribute('head_dim_value'), @@ -1462,6 +1461,8 @@ def initialize(self): data = self.model.get_weights_data(self.name, '{lname}/{wtype}'.format(lname=lname, wtype=wtype)) if wtype == 'kernel': vtype = 'weight' + if lname in ['key', 'query', 'value']: + data = data.transpose((1, 0, 2)) else: vtype = 'bias' From 97f3e8dfde6f1bdb0d914ee3deca121a92d28a88 Mon Sep 17 00:00:00 2001 From: Ethan Date: Wed, 27 Jul 2022 12:53:56 -0700 Subject: [PATCH 11/55] projection_of_qkv_in_mha --- .../vivado/passes/transformer_templates.py | 36 +++++++++++++++++-- .../nnet_utils/nnet_multiheadattention.h | 18 +++++++--- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py index 34eaaf1214..2d836c37e6 100644 --- a/hls4ml/backends/vivado/passes/transformer_templates.py +++ b/hls4ml/backends/vivado/passes/transformer_templates.py @@ -7,6 +7,7 @@ typedef {accum_t.name} accum_t; typedef {attention_output_bias_t.name} bias_t; typedef {attention_output_weight_t.name} weight_t; + typedef {config_mult_t1} mult_config1; static const unsigned num_heads = {num_heads}; static const unsigned head_dim_key = {head_dim_key}; @@ -19,6 +20,24 @@ static const bool store_weights_in_bram = false; }};\n""" +# qkv projection dense layer template +qkv_projection_config_template = """struct config{index} : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned n_nonzeros = {nonzeros}; + static const bool store_weights_in_bram = false; + typedef {accum_t.name} accum_t; + typedef {bias_t.name} bias_t; + typedef {weight_t.name} weight_t; + typedef ap_{index_t} index_t; + template + using product = nnet::product::{product_type}; +}};\n""" + + mha_function_template = 'nnet::multiheadattention<{input_t}, {output_t}, {config}>({input_q}, {input_kv}, {output}, {w_o}, 
{b_o}, {w_k}, {b_k}, {w_q}, {b_q}, {w_v}, {b_v});' mha_include_list = ['nnet_utils/nnet_multiheadattention.h'] @@ -27,7 +46,7 @@ class MhaConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(MultiHeadAttention) self.template = mha_config_template - # self.mult1_template = recr_mult_config_template + self.mult1_template = qkv_projection_config_template def format(self, node): @@ -38,10 +57,21 @@ def format(self, node): params['head_dim_value'] = node.get_attr('head_dim_value') params['feature_dim'] = node.get_attr('feature_dim') params['seq_len'] = node.get_attr('seq_len') - + params['config_mult_t1'] = 'config{}_1'.format(node.index) mht_config = self.template.format(**params) - return mht_config + mult_params1 = self._default_config_params(node) + mult_params1['n_in'] = node.get_attr('feature_dim') + mult_params1['n_out'] = node.get_attr('head_dim_key') + mult_params1['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('query_weight').type.precision) + mult_params1['reuse'] = params['reuse'] + mult_params1['index'] = str(node.index) + mult_params1['nzeros'] = 0 + mult_params1['nonzeros'] = node.get_weights('query_weight').nonzeros + + mult_config1 = self.mult1_template.format(**mult_params1) + + return mult_config1 + '\n' +mht_config class RecurrentFunctionTemplate(FunctionCallTemplate): def __init__(self): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index 275e5156e6..f118d4750d 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -20,7 +20,7 @@ struct multiheadattention_config static const unsigned num_heads = 10; static const unsigned head_dim_key = 10; static const unsigned head_dim_value = 10; - static const unsigned feature_dim = 10; + static const unsigned feature_dim = 20; static const unsigned seq_len = 500; // Resource reuse info // not sure how to write this part @@ -47,13 +47,21 @@ void multiheadattention( typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_value], typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { - // a dummy algo - for (int i=0; i <= (CONFIG_T::seq_len * CONFIG_T::feature_dim); i++) + typename CONFIG_T::accum_t q_proj[CONFIG_T::num_heads][CONFIG_T::seq_len * CONFIG_T::head_dim_key]; + typename CONFIG_T::accum_t v_proj[CONFIG_T::num_heads][CONFIG_T::seq_len * CONFIG_T::head_dim_value]; + typename CONFIG_T::accum_t k_proj[CONFIG_T::num_heads][CONFIG_T::seq_len * CONFIG_T::head_dim_key]; + + // linear projection + for (int i=0; i <= CONFIG_T::num_heads; i++) { - res[i] = data_q[i] + data_vk[i]; + for (int j=0; j <=CONFIG_T::seq_len; j++) + { + dense(data_q, q_proj[i]+(CONFIG_T::head_dim_key*j), query_weight+(CONFIG_T::head_dim_key * CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); + dense(data_vk, v_proj[i]+(CONFIG_T::head_dim_value*j), value_weight+(CONFIG_T::head_dim_value * CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); + dense(data_vk, k_proj[i]+(CONFIG_T::head_dim_key*j), key_weight+(CONFIG_T::head_dim_key * CONFIG_T::feature_dim*i), key_bias+(CONFIG_T::head_dim_key*i)); + } } } - } #endif From 52cc7e84760b4527c7a10bfa771202a3941a58ce Mon Sep 17 00:00:00 2001 From: Ethan Date: Wed, 3 Aug 2022 19:45:53 -0700 Subject: [PATCH 12/55] mha_first_draft --- 
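This patch is the first complete kernel: per-head Q/K/V projections, scaled dot-product scores with a row softmax, scores times values, and a final output projection. As a reference for what those stages compute, a NumPy model for testbench comparison (shapes follow the comments in the diff below; this is a sketch, not hls4ml code):

    import numpy as np

    def mha_reference(q, kv, Wq, bq, Wk, bk, Wv, bv, Wo, bo):
        # q, kv: (seq_len, feature_dim); Wq, Wk: (heads, feature_dim, head_dim_key)
        # Wv: (heads, feature_dim, head_dim_value); Wo: (heads * head_dim_value, feature_dim)
        ctx = []
        for h in range(Wq.shape[0]):
            Q = q @ Wq[h] + bq[h]                  # (seq, head_dim_key)
            K = kv @ Wk[h] + bk[h]
            V = kv @ Wv[h] + bv[h]                 # (seq, head_dim_value)
            S = (Q @ K.T) / np.sqrt(Q.shape[-1])   # (seq, seq) attention scores
            A = np.exp(S) / np.exp(S).sum(axis=-1, keepdims=True)  # row softmax
            ctx.append(A @ V)                      # (seq, head_dim_value)
        return np.concatenate(ctx, axis=-1) @ Wo + bo  # (seq, feature_dim)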
.../vivado/passes/transformer_templates.py | 83 ++++++++---- hls4ml/backends/vivado/vivado_backend.py | 15 ++ hls4ml/model/layers.py | 4 +- .../nnet_utils/nnet_multiheadattention.h | 128 ++++++++++++++++-- 4 files changed, 192 insertions(+), 38 deletions(-) diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py index 2d836c37e6..3ab86a56e9 100644 --- a/hls4ml/backends/vivado/passes/transformer_templates.py +++ b/hls4ml/backends/vivado/passes/transformer_templates.py @@ -2,13 +2,44 @@ from hls4ml.backends.backend import get_backend from hls4ml.model.layers import MultiHeadAttention from hls4ml.backends.template import LayerConfigTemplate, FunctionCallTemplate + +#dense layer template +mult_config_template = """struct config{index}_{mNum} : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned n_nonzeros = {nonzeros}; + static const bool store_weights_in_bram = false; + typedef {accum_t.name} accum_t; + typedef {attention_output_bias_t.name} bias_t; + typedef {attention_output_weight_t.name} weight_t; + typedef ap_{index_t} index_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +#activation template +softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; + typedef ap_{table_t} exp_table_t; + typedef ap_{table_t} inv_table_t; +}};\n""" + mha_config_template = """struct config{index} : nnet::multiheadattention_config {{ typedef {accum_t.name} accum_t; typedef {attention_output_bias_t.name} bias_t; typedef {attention_output_weight_t.name} weight_t; - typedef {config_mult_t1} mult_config1; - + typedef {config_mult_t1} config_mult1; + typedef {config_mult_t2} config_mult2; + typedef {config_activ_t1} softmax_config1; + static const unsigned num_heads = {num_heads}; static const unsigned head_dim_key = {head_dim_key}; static const unsigned head_dim_value = {head_dim_value}; @@ -20,23 +51,6 @@ static const bool store_weights_in_bram = false; }};\n""" -# qkv projection dense layer template -qkv_projection_config_template = """struct config{index} : nnet::dense_config {{ - static const unsigned n_in = {n_in}; - static const unsigned n_out = {n_out}; - static const unsigned strategy = nnet::{strategy}; - static const unsigned reuse_factor = {reuse}; - static const unsigned n_zeros = {nzeros}; - static const unsigned n_nonzeros = {nonzeros}; - static const bool store_weights_in_bram = false; - typedef {accum_t.name} accum_t; - typedef {bias_t.name} bias_t; - typedef {weight_t.name} weight_t; - typedef ap_{index_t} index_t; - template - using product = nnet::product::{product_type}; -}};\n""" - mha_function_template = 'nnet::multiheadattention<{input_t}, {output_t}, {config}>({input_q}, {input_kv}, {output}, {w_o}, {b_o}, {w_k}, {b_k}, {w_q}, {b_q}, {w_v}, {b_v});' @@ -46,21 +60,26 @@ class MhaConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(MultiHeadAttention) self.template = mha_config_template - self.mult1_template = qkv_projection_config_template + 
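The format() method below stitches the generated structs together by name; the top-level MHA config typedefs the two dense configs and the softmax config so the HLS kernel can reach them as, e.g., CONFIG_T::config_mult1. The naming scheme, restated in Python (index value arbitrary):

    index = 5
    sub_configs = {
        'config_mult_t1': 'config{}_1'.format(index),   # QKV projection dense
        'config_mult_t2': 'config{}_2'.format(index),   # output projection dense
        'config_activ_t1': '{}_config{}'.format('softmax', index),
    }
    print(sub_configs)  # {'config_mult_t1': 'config5_1', ...}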
self.mult1_template = mult_config_template + self.mult2_template = mult_config_template + self.activ1_template = softmax_config_template def format(self, node): params = self._default_config_params(node) - params['num_heads'] = node.get_attr('num_heads') params['head_dim_key'] = node.get_attr('head_dim_key') params['head_dim_value'] = node.get_attr('head_dim_value') params['feature_dim'] = node.get_attr('feature_dim') params['seq_len'] = node.get_attr('seq_len') params['config_mult_t1'] = 'config{}_1'.format(node.index) - mht_config = self.template.format(**params) + params['config_mult_t2'] = 'config{}_2'.format(node.index) + params['config_activ_t1'] = '{}_config{}'.format("softmax", node.index) ## not sure + params['strategy'] = node.get_attr('strategy') + mha_config = self.template.format(**params) mult_params1 = self._default_config_params(node) + mult_params1['mNum'] = '1' mult_params1['n_in'] = node.get_attr('feature_dim') mult_params1['n_out'] = node.get_attr('head_dim_key') mult_params1['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('query_weight').type.precision) @@ -68,10 +87,26 @@ def format(self, node): mult_params1['index'] = str(node.index) mult_params1['nzeros'] = 0 mult_params1['nonzeros'] = node.get_weights('query_weight').nonzeros - mult_config1 = self.mult1_template.format(**mult_params1) - return mult_config1 + '\n' +mht_config + mult_params2 = self._default_config_params(node) + mult_params2['mNum'] = '2' + mult_params2['n_in'] = node.get_attr('head_dim_value') * node.get_attr('num_heads') + mult_params2['n_out'] = node.get_attr('feature_dim') + mult_params2['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('attention_output_weight').type.precision) + mult_params2['reuse'] = params['reuse'] + mult_params2['index'] = str(node.index) + mult_params2['nzeros'] = 0 + mult_params2['nonzeros'] = node.get_weights('attention_output_weight').nonzeros + mult_config2 = self.mult2_template.format(**mult_params2) + + act_params = self._default_config_params(node) + act_params['n_in'] = node.get_attr('head_dim_key') + act_params['type'] = 'softmax' + act_params['implementation'] = 'latency' + act_config = self.activ1_template.format(**act_params) + + return mult_config1 + '\n' + mult_config2 + '\n' + act_config + '\n' + mha_config class RecurrentFunctionTemplate(FunctionCallTemplate): def __init__(self): diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index ffd9d84e43..bea6af9935 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -562,3 +562,18 @@ def init_garnet(self, layer): @layer_optimizer(GarNetStack) def init_garnet_stack(self, layer): self.init_garnet(layer) + + @layer_optimizer(MultiHeadAttention) + def init_mha(self, layer): + # TODO Allow getting recurrent reuse factor from the config + reuse_factor = layer.model.config.get_reuse_factor(layer) + layer.set_attr('reuse_factor', reuse_factor) + index_t = IntegerPrecisionType(width=1, signed=False) + layer.set_attr('index_t', index_t) + if 'table_t' not in layer.attributes: + layer.set_attr('table_t', FixedPrecisionType(width=18, integer=8)) + if 'table_size' not in layer.attributes: + layer.set_attr('table_size', 1024) + layer.set_attr('strategy', 'latency') + + diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index cc09feefc7..09f469ce7e 100644 --- a/hls4ml/model/layers.py +++ 
b/hls4ml/model/layers.py @@ -1118,11 +1118,11 @@ def initialize(self): class LSTM(Layer): _expected_attributes = [ Attribute('n_out'), - Attribute('activation', value_type=str), ## not defined in the paser, when do we need this data + Attribute('activation', value_type=str), Attribute('recurrent_activation', value_type=str), Attribute('return_sequences', value_type=bool, default=False), Attribute('return_state', value_type=bool, default=False), - ChoiceAttribute('direction', ['forward', 'backward'], default='forward'), ## not defined in the paser, how can we know when to be backward + ChoiceAttribute('direction', ['forward', 'backward'], default='forward'), Attribute('time_major', value_type=bool, default=False), WeightAttribute('weight'), WeightAttribute('bias'), diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index f118d4750d..4bec0330f8 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -4,6 +4,7 @@ #include "nnet_common.h" #include "nnet_mult.h" #include "nnet_dense.h" +#include "nnet_activation.h" #include "hls_stream.h" #include @@ -33,6 +34,76 @@ struct multiheadattention_config using product = nnet::product::mult; }; +template +void matrixmul_transpose( + data_T Q[CONFIG_T::seq_len][CONFIG_T::head_dim_key], + data_T K[CONFIG_T::seq_len][CONFIG_T::head_dim_key], + res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K +{ + // #pragma HLS ARRAY RESHAPE variable=Q complete dim=2 + // #pragma HLS ARRAY RESHAPE variable=K complete dim=1 + const data_T dk = sqrt(CONFIG_T::head_dim_key); + + // for each row and column of AB + row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { + col: for(int j = 0; j < CONFIG_T::seq_len; ++j) { + #pragma HLS PIPELINE II=1 + // compute (QK)i,j + data_T QKij = 0; + product: for(int k = 0; k < CONFIG_T::head_dim_key; ++k) { + QKij += Q[i][k] * K[j][k]; + } + QK[i][j] = QKij / dk; + } + softmax(QK[i], QK[i]); // can this two parameter be the same? 
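A NumPy cross-check for matrixmul_transpose is handy during C simulation, and it bears on the inline question just above: passing the same array as softmax input and output is not safe in general, since some implementations (the legacy one, for instance) interleave reads of every input element with writes of the outputs; the next revision in this series does buffer the raw scores in a separate array. The reference (the sqrt(head_dim_key) scaling matches the code above):

    import numpy as np

    def scores_reference(Q, K):
        S = (Q @ K.T) / np.sqrt(Q.shape[-1])           # scaled dot product
        E = np.exp(S - S.max(axis=-1, keepdims=True))  # numerically stabilized
        return E / E.sum(axis=-1, keepdims=True)       # softmax per query row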
+ } +} + +// template +// void matrixmul( +// data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], +// data_T V[CONFIG_T::seq_len][CONFIG_T::head_dim_value], +// res_T S[CONFIG_T::seq_len][CONFIG_T::head_dim_value]) // S: attention score +// { +// #pragma HLS ARRAY RESHAPE variable=Q complete dim=2 +// #pragma HLS ARRAY RESHAPE variable=K complete dim=1 +// // for each row and column of AB +// row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { +// col: for(int j = 0; j < CONFIG_T::head_dim_value; ++j) { +// #pragma HLS PIPELINE II=1 +// // compute (S)i,j +// data_T Sij = 0; +// product: for(int k = 0; k < CONFIG_T::seq_len; ++k) { +// Sij += Q[i][k] * K[k][j]; +// } +// S[i][j] = Sij; +// } +// } +// } + +template +void matrixmul( + data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], + data_T V[CONFIG_T::seq_len][CONFIG_T::head_dim_value], + res_T S[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value], + T head) // S: attention score +{ + #pragma HLS ARRAY RESHAPE variable=Q complete dim=2 + #pragma HLS ARRAY RESHAPE variable=K complete dim=1 + // for each row and column of AB + row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { + col: for(int j = 0; j < CONFIG_T::head_dim_value; ++j) { + #pragma HLS PIPELINE II=1 + // compute (S)i,j + data_T Sij = 0; + product: for(int k = 0; k < CONFIG_T::seq_len; ++k) { + Sij += QK[i][k] * V[k][j]; + } + S[i][CONFIG_T::head_dim_value*head+j] = Sij; // double check + } + } +} + template void multiheadattention( data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], @@ -40,28 +111,61 @@ void multiheadattention( res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim], typename CONFIG_T::weight_t attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value * CONFIG_T::feature_dim], // num_heads,head_size_v,dim typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim], - typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], // (dim,num_head,head_size) + typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], // n_head,dim,head_dim typename CONFIG_T::bias_t key_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key], typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], //same shape as key typename CONFIG_T::bias_t query_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key], typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_value], typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { - typename CONFIG_T::accum_t q_proj[CONFIG_T::num_heads][CONFIG_T::seq_len * CONFIG_T::head_dim_key]; - typename CONFIG_T::accum_t v_proj[CONFIG_T::num_heads][CONFIG_T::seq_len * CONFIG_T::head_dim_value]; - typename CONFIG_T::accum_t k_proj[CONFIG_T::num_heads][CONFIG_T::seq_len * CONFIG_T::head_dim_key]; + #pragma HLS ARRAY_PARTITION variable=res complete + #pragma HLS ARRAY_PARTITION variable=attention_output_weight complete + #pragma HLS ARRAY_PARTITION variable=attention_output_bias complete + #pragma HLS ARRAY_PARTITION variable=key_weight complete + #pragma HLS ARRAY_PARTITION variable=key_bias complete + #pragma HLS ARRAY_PARTITION variable=query_weight complete + #pragma HLS ARRAY_PARTITION variable=query_bias complete + #pragma HLS ARRAY_PARTITION variable=value_weight complete + #pragma HLS ARRAY_PARTITION variable=value_bias complete + + data_T 
q_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + data_T v_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_value]; + data_T k_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; + + #pragma HLS ARRAY_PARTITION variable=q_proj type=complete dim=3 + #pragma HLS ARRAY_PARTITION variable=v_proj type=complete dim=3 + #pragma HLS ARRAY_PARTITION variable=k_proj type=complete dim=3 + #pragma HLS ARRAY_PARTITION variable=qk_mul type=complete dim=3 // linear projection - for (int i=0; i <= CONFIG_T::num_heads; i++) - { - for (int j=0; j <=CONFIG_T::seq_len; j++) - { - dense(data_q, q_proj[i]+(CONFIG_T::head_dim_key*j), query_weight+(CONFIG_T::head_dim_key * CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); - dense(data_vk, v_proj[i]+(CONFIG_T::head_dim_value*j), value_weight+(CONFIG_T::head_dim_value * CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); - dense(data_vk, k_proj[i]+(CONFIG_T::head_dim_key*j), key_weight+(CONFIG_T::head_dim_key * CONFIG_T::feature_dim*i), key_bias+(CONFIG_T::head_dim_key*i)); - } + seq: for (int j=0; j <=CONFIG_T::seq_len; ++j){ + dense_for_each_head: for (int i=0; i <= CONFIG_T::num_heads; ++i){ + #pragma HLS UNROLL + dense(data_q +(CONFIG_T::feature_dim*j), q_proj[i][j], query_weight+(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); + dense(data_vk+(CONFIG_T::feature_dim*j), v_proj[i][j], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); + dense(data_vk+(CONFIG_T::feature_dim*j), k_proj[i][j], key_weight +(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), key_bias +(CONFIG_T::head_dim_key*i)); } + } + + data_T dense_in[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value]; + // matrix_mult: for (int i=0; i <= CONFIG_T::num_heads; ++i){ + // #pragma HLS UNROLL + // nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); + // nnet::matrixmul(qk_mul[i], v_proj[i], q_proj[i]); // reusing q_proj, storing attention score + // } + matrix_mult: for (int i=0; i <= CONFIG_T::num_heads; ++i){ + #pragma HLS UNROLL + nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); + nnet::matrixmul(qk_mul[i], v_proj[i], dense_in, i); + } + + output_dense: for (int j=0; j <=CONFIG_T::seq_len; ++j){ + dense(dense_in[j], res+(CONFIG_T::feature_dim*j), attention_output_weight, attention_output_bias); + } } + + } #endif From 3961f97d044ee4131ed8763ded7e824b6d6df908 Mon Sep 17 00:00:00 2001 From: Ethan Date: Wed, 10 Aug 2022 18:48:48 -0700 Subject: [PATCH 13/55] able to predict model correct --- .../vivado/passes/transformer_templates.py | 12 ++-- hls4ml/backends/vivado/vivado_backend.py | 4 +- .../nnet_utils/nnet_multiheadattention.h | 66 +++++++------------ 3 files changed, 33 insertions(+), 49 deletions(-) diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py index 3ab86a56e9..078f2e3e20 100644 --- a/hls4ml/backends/vivado/passes/transformer_templates.py +++ b/hls4ml/backends/vivado/passes/transformer_templates.py @@ -74,7 +74,7 @@ def format(self, node): params['seq_len'] = node.get_attr('seq_len') params['config_mult_t1'] = 'config{}_1'.format(node.index) params['config_mult_t2'] = 'config{}_2'.format(node.index) - params['config_activ_t1'] = '{}_config{}'.format("softmax", node.index) ## not sure + params['config_activ_t1'] = 
'{}_config{}'.format("softmax", node.index) params['strategy'] = node.get_attr('strategy') mha_config = self.template.format(**params) @@ -86,7 +86,7 @@ def format(self, node): mult_params1['reuse'] = params['reuse'] mult_params1['index'] = str(node.index) mult_params1['nzeros'] = 0 - mult_params1['nonzeros'] = node.get_weights('query_weight').nonzeros + mult_params1['nonzeros'] = params['feature_dim']*params['num_heads']*params['head_dim_key'] mult_config1 = self.mult1_template.format(**mult_params1) mult_params2 = self._default_config_params(node) @@ -97,18 +97,18 @@ def format(self, node): mult_params2['reuse'] = params['reuse'] mult_params2['index'] = str(node.index) mult_params2['nzeros'] = 0 - mult_params2['nonzeros'] = node.get_weights('attention_output_weight').nonzeros + mult_params2['nonzeros'] = params['feature_dim']*params['num_heads']*params['head_dim_key'] mult_config2 = self.mult2_template.format(**mult_params2) act_params = self._default_config_params(node) - act_params['n_in'] = node.get_attr('head_dim_key') + act_params['n_in'] = node.get_attr('seq_len') act_params['type'] = 'softmax' - act_params['implementation'] = 'latency' + act_params['implementation'] = 'legacy' #latency,stable not work act_config = self.activ1_template.format(**act_params) return mult_config1 + '\n' + mult_config2 + '\n' + act_config + '\n' + mha_config -class RecurrentFunctionTemplate(FunctionCallTemplate): +class MhaFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(MultiHeadAttention, include_header=mha_include_list) self.template = mha_function_template diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index bea6af9935..e7a7d552f5 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -571,9 +571,9 @@ def init_mha(self, layer): index_t = IntegerPrecisionType(width=1, signed=False) layer.set_attr('index_t', index_t) if 'table_t' not in layer.attributes: - layer.set_attr('table_t', FixedPrecisionType(width=18, integer=8)) + layer.set_attr('table_t', FixedPrecisionType(width=32, integer=5)) if 'table_size' not in layer.attributes: - layer.set_attr('table_size', 1024) + layer.set_attr('table_size', 4096) layer.set_attr('strategy', 'latency') diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index 4bec0330f8..d099ce436f 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -6,6 +6,7 @@ #include "nnet_dense.h" #include "nnet_activation.h" #include "hls_stream.h" +#include #include namespace nnet { @@ -15,7 +16,7 @@ struct multiheadattention_config // Internal data type definitions typedef float bias_t; typedef float weight_t; - typedef float accum_t; // where this type will be used + typedef float accum_t; // Layer Sizes static const unsigned num_heads = 10; @@ -24,7 +25,7 @@ struct multiheadattention_config static const unsigned feature_dim = 20; static const unsigned seq_len = 500; - // Resource reuse info // not sure how to write this part + // Resource reuse info static const unsigned io_type = io_parallel; static const unsigned strategy = latency; static const unsigned reuse_factor = 1; @@ -44,6 +45,8 @@ void matrixmul_transpose( // #pragma HLS ARRAY RESHAPE variable=K complete dim=1 const data_T dk = sqrt(CONFIG_T::head_dim_key); + data_T Product[CONFIG_T::seq_len][CONFIG_T::seq_len]; + // for each 
row and column of AB row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { col: for(int j = 0; j < CONFIG_T::seq_len; ++j) { @@ -51,35 +54,19 @@ void matrixmul_transpose( // compute (QK)i,j data_T QKij = 0; product: for(int k = 0; k < CONFIG_T::head_dim_key; ++k) { - QKij += Q[i][k] * K[j][k]; + QKij += Q[i][k]* K[j][k]; } - QK[i][j] = QKij / dk; + Product[i][j] = QKij / dk ; } - softmax(QK[i], QK[i]); // can this two parameter be the same? + softmax(Product[i], QK[i]); // can this two parameters be the same? + // test: for (int k = 0; k < CONFIG_T::seq_len-1; ++k) { + // QK[i][k]=QK[i][1]; + // } + // nnet::print_result(Product[i], std::cout); + // nnet::print_result(QK[i], std::cout); } } -// template -// void matrixmul( -// data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], -// data_T V[CONFIG_T::seq_len][CONFIG_T::head_dim_value], -// res_T S[CONFIG_T::seq_len][CONFIG_T::head_dim_value]) // S: attention score -// { -// #pragma HLS ARRAY RESHAPE variable=Q complete dim=2 -// #pragma HLS ARRAY RESHAPE variable=K complete dim=1 -// // for each row and column of AB -// row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { -// col: for(int j = 0; j < CONFIG_T::head_dim_value; ++j) { -// #pragma HLS PIPELINE II=1 -// // compute (S)i,j -// data_T Sij = 0; -// product: for(int k = 0; k < CONFIG_T::seq_len; ++k) { -// Sij += Q[i][k] * K[k][j]; -// } -// S[i][j] = Sij; -// } -// } -// } template void matrixmul( @@ -88,8 +75,8 @@ void matrixmul( res_T S[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value], T head) // S: attention score { - #pragma HLS ARRAY RESHAPE variable=Q complete dim=2 - #pragma HLS ARRAY RESHAPE variable=K complete dim=1 + #pragma HLS ARRAY RESHAPE variable=QK complete dim=2 + #pragma HLS ARRAY RESHAPE variable=V complete dim=1 // for each row and column of AB row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { col: for(int j = 0; j < CONFIG_T::head_dim_value; ++j) { @@ -133,14 +120,14 @@ void multiheadattention( data_T k_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; - #pragma HLS ARRAY_PARTITION variable=q_proj type=complete dim=3 - #pragma HLS ARRAY_PARTITION variable=v_proj type=complete dim=3 - #pragma HLS ARRAY_PARTITION variable=k_proj type=complete dim=3 - #pragma HLS ARRAY_PARTITION variable=qk_mul type=complete dim=3 + // #pragma HLS ARRAY_PARTITION variable=q_proj type=complete dim=3 + // #pragma HLS ARRAY_PARTITION variable=v_proj type=complete dim=3 + // #pragma HLS ARRAY_PARTITION variable=k_proj type=complete dim=3 + // #pragma HLS ARRAY_PARTITION variable=qk_mul type=complete dim=3 // linear projection - seq: for (int j=0; j <=CONFIG_T::seq_len; ++j){ - dense_for_each_head: for (int i=0; i <= CONFIG_T::num_heads; ++i){ + seq: for (int j=0; j (data_q +(CONFIG_T::feature_dim*j), q_proj[i][j], query_weight+(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); dense(data_vk+(CONFIG_T::feature_dim*j), v_proj[i][j], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); @@ -149,18 +136,15 @@ void multiheadattention( } data_T dense_in[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value]; - // matrix_mult: for (int i=0; i <= CONFIG_T::num_heads; ++i){ - // #pragma HLS UNROLL - // nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); - // nnet::matrixmul(qk_mul[i], v_proj[i], q_proj[i]); // reusing q_proj, storing attention score - // } - matrix_mult: for (int 
i=0; i <= CONFIG_T::num_heads; ++i){
+
+    matrix_mult: for (int i=0; i < CONFIG_T::num_heads; ++i){
 #pragma HLS UNROLL
 nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]);
 nnet::matrixmul(qk_mul[i], v_proj[i], dense_in, i);
 }
 
-    output_dense: for (int j=0; j <=CONFIG_T::seq_len; ++j){
+    output_dense: for (int j=0; j (dense_in[j], std::cout);
 dense(dense_in[j], res+(CONFIG_T::feature_dim*j), attention_output_weight, attention_output_bias);
 }
 }

From 3533999d8bb019bf5bcd36693c54c502c8a1c7fe Mon Sep 17 00:00:00 2001
From: Ethan 
Date: Thu, 11 Aug 2022 09:29:23 -0700
Subject: [PATCH 14/55] delete some unnecessary comments

---
 hls4ml/converters/keras_to_hls.py | 4 +---
 hls4ml/model/layers.py            | 7 +------
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py
index 433d095598..a4ba32f5e2 100644
--- a/hls4ml/converters/keras_to_hls.py
+++ b/hls4ml/converters/keras_to_hls.py
@@ -283,12 +283,10 @@ def parse_keras_model(model_arch, reader):
 
         # Extract inbound nodes
         if 'inbound_nodes' in keras_layer and len(keras_layer['inbound_nodes']) > 0:
-            input_names = [ inputs_map.get(inp[0], inp[0]) for inp in keras_layer['inbound_nodes'][0] ] # why using inputs_map.get?
+            input_names = [ inputs_map.get(inp[0], inp[0]) for inp in keras_layer['inbound_nodes'][0] ]
             if keras_layer['inbound_nodes'][0][0][-1]: # multi_head_attention has inbound: [[['input_3', 0, 0, {'value': ['dense_3', 0, 0]}]]]
                 inputname2 = list(keras_layer['inbound_nodes'][0][0][-1].values())
                 input_names+=[inp[0] for inp in inputname2]
-            # print("input_names: ", input_names)
-
         else:
             input_names = None
 
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index 09f469ce7e..ff297088a9 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -1071,7 +1071,7 @@ def initialize(self):
 class SimpleRNN(Layer):
     _expected_attributes = [
         Attribute('n_out'),
-        Attribute('activation', value_type=str),        
+        Attribute('activation', value_type=str),
         Attribute('return_sequences', value_type=bool, default=False),
         Attribute('return_state', value_type=bool, default=False),
         ChoiceAttribute('direction', ['forward', 'backward'], default='forward'),
@@ -1441,11 +1441,6 @@ class MultiHeadAttention(Layer):
     ]
 
     def initialize(self):
-
-        # is these two needed?
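        # (Answering the removed question above: most likely not -- initialize()
        # registers every projection kernel and bias explicitly from the
        # weights_source list below, so the generic add_weights()/add_bias()
        # helpers would be redundant for MultiHeadAttention.)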
- # self.add_weights() - # self.add_bias() - weights_source = [ ('attention_output', 'kernel'), ('attention_output', 'bias'), From d2f0df6cc613d0a47b2ded81c35039d2b2c9b198 Mon Sep 17 00:00:00 2001 From: Ethan Date: Thu, 11 Aug 2022 09:31:52 -0700 Subject: [PATCH 15/55] delete comments --- hls4ml/converters/keras_to_hls.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index a4ba32f5e2..c501a31d95 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -337,10 +337,5 @@ def keras_to_hls(config): model_arch, reader = get_model_arch(config) layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader) print('Creating HLS model') - # config, a dict, has key: 'HLSConfig' and 'KerasModel' - # reader can read all the weight and bias - # layer_list a list of dict, each element is one layer output from parser - # input_layers, a list - # output layers, a list hls_model = ModelGraph(config, reader, layer_list, input_layers, output_layers) return hls_model From 6aaa5ede9eef39750ffb5119df396a768ed43b1f Mon Sep 17 00:00:00 2001 From: Ethan Date: Fri, 16 Sep 2022 11:47:12 -0700 Subject: [PATCH 16/55] resource strategy of transformer --- .../vivado/passes/transformer_templates.py | 2 +- hls4ml/backends/vivado/vivado_backend.py | 16 ++- hls4ml/model/layers.py | 3 + .../vivado/nnet_utils/nnet_activation.h | 2 +- .../vivado/nnet_utils/nnet_dense_latency.h | 2 +- .../nnet_utils/nnet_multiheadattention.h | 101 +++++++++++------- 6 files changed, 80 insertions(+), 46 deletions(-) diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py index 078f2e3e20..6ec0baefe0 100644 --- a/hls4ml/backends/vivado/passes/transformer_templates.py +++ b/hls4ml/backends/vivado/passes/transformer_templates.py @@ -103,7 +103,7 @@ def format(self, node): act_params = self._default_config_params(node) act_params['n_in'] = node.get_attr('seq_len') act_params['type'] = 'softmax' - act_params['implementation'] = 'legacy' #latency,stable not work + act_params['implementation'] = 'legacy' #latency,stable not work, legacy works act_config = self.activ1_template.format(**act_params) return mult_config1 + '\n' + mult_config2 + '\n' + act_config + '\n' + mha_config diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index e7a7d552f5..a2f96ecb3e 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -493,6 +493,18 @@ def init_pooling2d(self, layer): @layer_optimizer(Softmax) def init_softmax(self, layer): + if 'exp_table_t' not in layer.attributes: + layer.set_attr('exp_table_t', layer.get_attr('table_t')) + if 'inv_table_t' not in layer.attributes: + layer.set_attr('inv_table_t', layer.get_attr('table_t')) + if layer.model.config.is_resource_strategy(layer): + # 'resource' strategy = 'latency' for Softmax + layer.set_attr('implementation', 'legacy') # latency legacy stable + # layer.set_attr('implementation', 'latency') + else: + # layer.set_attr('implementation', layer.model.config.get_strategy(layer).lower()) + layer.set_attr('implementation', 'legacy') # latency legacy stable + if layer.model.config.get_config_value('IOType') == 'io_parallel': assert ( len(layer.get_input_variable().shape) == 1 @@ -573,7 +585,7 @@ def init_mha(self, layer): if 'table_t' not in layer.attributes: layer.set_attr('table_t', FixedPrecisionType(width=32, integer=5)) if 
'table_size' not in layer.attributes: - layer.set_attr('table_size', 4096) - layer.set_attr('strategy', 'latency') + layer.set_attr('table_size', 2048) + layer.set_attr('strategy', 'resource') #latency diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index ff297088a9..6ad68f13b7 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1458,6 +1458,9 @@ def initialize(self): vtype = 'weight' if lname in ['key', 'query', 'value']: data = data.transpose((1, 0, 2)) + data = data.transpose((0, 2, 1)) ### + if lname in ['attention_output']: + data = data.transpose((2,0,1)) ### else: vtype = 'bias' diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index da13998e38..15c9706410 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -161,7 +161,7 @@ void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_ // The template data_T is the data type used to address the table for (unsigned i = 0; i < CONFIG_T::table_size; i++) { float x = softmax_real_val_from_idx(i); - typename CONFIG_T::inv_table_t inv_x = 1 / x; + typename CONFIG_T::inv_table_t inv_x = 1.0 / x; table_out[i] = inv_x; } } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h index 02802c45a9..3215eeb4c5 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h @@ -25,7 +25,7 @@ void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], // - if we have an unroll factor, limit number of multipliers #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes #pragma HLS ARRAY_PARTITION variable=biases complete #pragma HLS ARRAY_PARTITION variable=mult complete #pragma HLS ARRAY_PARTITION variable=acc complete diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index d099ce436f..cd3c7ca027 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -41,29 +41,31 @@ void matrixmul_transpose( data_T K[CONFIG_T::seq_len][CONFIG_T::head_dim_key], res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K { - // #pragma HLS ARRAY RESHAPE variable=Q complete dim=2 - // #pragma HLS ARRAY RESHAPE variable=K complete dim=1 const data_T dk = sqrt(CONFIG_T::head_dim_key); - - data_T Product[CONFIG_T::seq_len][CONFIG_T::seq_len]; - + data_T QKij; + data_T Product[CONFIG_T::seq_len]; +#pragma HLS ARRAY_PARTITION variable=K complete dim=2 +#pragma HLS ARRAY_PARTITION variable=Q complete dim=2 +#pragma HLS ARRAY_PARTITION variable=Product complete // for each row and column of AB row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { + #pragma HLS unroll factor=1 col: for(int j = 0; j < CONFIG_T::seq_len; ++j) { - #pragma HLS PIPELINE II=1 +#pragma HLS PIPELINE // compute (QK)i,j - data_T QKij = 0; + QKij = 0; product: for(int k = 0; k < CONFIG_T::head_dim_key; ++k) { QKij += Q[i][k]* K[j][k]; } - Product[i][j] = QKij / dk ; + Product[j] = QKij/dk; } - softmax(Product[i], QK[i]); // can this two parameters be the same? 
- // test: for (int k = 0; k < CONFIG_T::seq_len-1; ++k) { - // QK[i][k]=QK[i][1]; - // } - // nnet::print_result(Product[i], std::cout); - // nnet::print_result(QK[i], std::cout); + // std::cout << "input to softmax: " << std::endl; + // nnet::print_result(Product, std::cout); + + softmax(Product, QK[i]); + + // std::cout << "output from softmax: " << std::endl; + // nnet::print_result( QK[i], std::cout); } } @@ -75,18 +77,20 @@ void matrixmul( res_T S[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value], T head) // S: attention score { - #pragma HLS ARRAY RESHAPE variable=QK complete dim=2 + #pragma HLS ARRAY RESHAPE variable=QK complete dim=2 #pragma HLS ARRAY RESHAPE variable=V complete dim=1 // for each row and column of AB + data_T Sij; row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { + #pragma HLS unroll factor=1 col: for(int j = 0; j < CONFIG_T::head_dim_value; ++j) { - #pragma HLS PIPELINE II=1 +#pragma HLS PIPELINE // compute (S)i,j - data_T Sij = 0; + Sij = 0; product: for(int k = 0; k < CONFIG_T::seq_len; ++k) { Sij += QK[i][k] * V[k][j]; } - S[i][CONFIG_T::head_dim_value*head+j] = Sij; // double check + S[i][CONFIG_T::head_dim_value*head+j] = Sij; } } } @@ -105,51 +109,66 @@ void multiheadattention( typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_value], typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { - #pragma HLS ARRAY_PARTITION variable=res complete - #pragma HLS ARRAY_PARTITION variable=attention_output_weight complete - #pragma HLS ARRAY_PARTITION variable=attention_output_bias complete - #pragma HLS ARRAY_PARTITION variable=key_weight complete - #pragma HLS ARRAY_PARTITION variable=key_bias complete - #pragma HLS ARRAY_PARTITION variable=query_weight complete - #pragma HLS ARRAY_PARTITION variable=query_bias complete - #pragma HLS ARRAY_PARTITION variable=value_weight complete - #pragma HLS ARRAY_PARTITION variable=value_bias complete data_T q_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; data_T v_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_value]; data_T k_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; + data_T qk_mul_sm[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; + + data_T dense_in[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value]; +#pragma HLS ARRAY_PARTITION variable=dense_in complete dim=0 + +#pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 // partition the 1-dim +//#pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 +#pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 +//#pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 - // #pragma HLS ARRAY_PARTITION variable=q_proj type=complete dim=3 - // #pragma HLS ARRAY_PARTITION variable=v_proj type=complete dim=3 - // #pragma HLS ARRAY_PARTITION variable=k_proj type=complete dim=3 - // #pragma HLS ARRAY_PARTITION variable=qk_mul type=complete dim=3 +// #pragma HLS ARRAY_RESHAPE variable=q_proj complete dim=3 +// #pragma HLS ARRAY_RESHAPE variable=v_proj complete dim=3 +// #pragma HLS ARRAY_RESHAPE variable=k_proj complete dim=3 +// #pragma HLS ARRAY_RESHAPE variable=qk_mul complete dim=3 // linear projection - seq: for (int j=0; j (data_q +(CONFIG_T::feature_dim*j), q_proj[i][j], query_weight+(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); + // 
dense(data_vk+(CONFIG_T::feature_dim*j), v_proj[i][j], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); + // dense(data_vk+(CONFIG_T::feature_dim*j), k_proj[i][j], key_weight +(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), key_bias +(CONFIG_T::head_dim_key*i)); + // } + // std::cout << "input to q of head "<< i << std::endl; + seq1: for (int j=0; j (data_q +(CONFIG_T::feature_dim*j), std::cout); dense(data_q +(CONFIG_T::feature_dim*j), q_proj[i][j], query_weight+(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); + } + + seq2: for (int j=0; j (data_vk+(CONFIG_T::feature_dim*j), v_proj[i][j], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); - dense(data_vk+(CONFIG_T::feature_dim*j), k_proj[i][j], key_weight +(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), key_bias +(CONFIG_T::head_dim_key*i)); } - } - data_T dense_in[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value]; + seq3: for (int j=0; j (data_vk+(CONFIG_T::feature_dim*j), k_proj[i][j], key_weight +(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), key_bias +(CONFIG_T::head_dim_key*i)); + } - matrix_mult: for (int i=0; i < CONFIG_T::num_heads; ++i){ - #pragma HLS UNROLL nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); nnet::matrixmul(qk_mul[i], v_proj[i], dense_in, i); } + + +// matrix_mult: for (int i=0; i < CONFIG_T::num_heads; ++i){ +// // #pragma HLS DEPENDENCE variable=qk_mul inter false +// nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); +// nnet::matrixmul(qk_mul[i], v_proj[i], dense_in, i); +// } + output_dense: for (int j=0; j (dense_in[j], std::cout); dense(dense_in[j], res+(CONFIG_T::feature_dim*j), attention_output_weight, attention_output_bias); } } - - } #endif From 3b7a288f2827e7a3b3411f6b4a2fd4ac4fe59d16 Mon Sep 17 00:00:00 2001 From: Ethan Date: Sat, 1 Oct 2022 16:46:01 -0700 Subject: [PATCH 17/55] change sm lagacy --- hls4ml/backends/vivado/vivado_backend.py | 3 +- .../vivado/nnet_utils/nnet_activation.h | 86 ++++++++++++++++++- .../nnet_utils/nnet_multiheadattention.h | 54 +++++------- 3 files changed, 107 insertions(+), 36 deletions(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index a2f96ecb3e..298218734f 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -499,8 +499,9 @@ def init_softmax(self, layer): layer.set_attr('inv_table_t', layer.get_attr('table_t')) if layer.model.config.is_resource_strategy(layer): # 'resource' strategy = 'latency' for Softmax - layer.set_attr('implementation', 'legacy') # latency legacy stable # layer.set_attr('implementation', 'latency') + layer.set_attr('implementation', 'legacy') # latency legacy stable + else: # layer.set_attr('implementation', layer.model.config.get_strategy(layer).lower()) layer.set_attr('implementation', 'legacy') # latency legacy stable diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index 15c9706410..9426f826b0 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -294,6 +294,88 @@ template void init_invert_table_legacy(typename } } +// template +// void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) +// { +// int exp_range = 4; +// int inv_range = 16; +// // Initialize the lookup table +// 
#ifdef __HLS_SYN__ +// bool initialized = false; +// typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; +// typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +// #else +// static bool initialized = false; +// static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; +// static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; +// #endif +// if (!initialized) { +// init_exp_table_legacy(exp_table); +// init_invert_table_legacy(invert_table); +// initialized = true; +// } + + + +// // Index into the lookup table based on data for exponentials +// typename CONFIG_T::table_t exp_res[CONFIG_T::n_in];// different, independent, fixed point precision +// typename CONFIG_T::table_t exp_diff_res;// different, independent, fixed point precision +// data_T data_cache[CONFIG_T::n_in]; +// int data_round; +// int index; +// // for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; +// // exp_diff_res = exp_table[index]; +// // } +// // exp_res[ii] += exp_diff_res; +// // } +// // } + +// // data_T one = 1.0; +// // //Second loop to invert +// // for (int ii=0; ii CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1; +// // //typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; +// // res[ii] = (res_T) invert_table[exp_res_index]; +// // } + +// for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; +// exp_diff_res = exp_table[index]; +// } +// exp_res[ii] += exp_diff_res; +// } + +// int exp_res_index = exp_res[ii]*CONFIG_T::table_size/inv_range; +// if (exp_res_index < 0) exp_res_index = 0; +// if (exp_res_index > CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1; +// res[ii] = (res_T) invert_table[exp_res_index]; +// } + +// } + template void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { // Initialize the lookup table @@ -311,9 +393,7 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { init_invert_table_legacy(invert_table); initialized = true; } - - #pragma HLS PIPELINE - + // Index into the lookup table based on data for exponentials typename CONFIG_T::table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision typename CONFIG_T::table_t exp_diff_res; // different, independent, fixed point precision diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index cd3c7ca027..d695b8fd28 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -41,29 +41,35 @@ void matrixmul_transpose( data_T K[CONFIG_T::seq_len][CONFIG_T::head_dim_key], res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K { - const data_T dk = sqrt(CONFIG_T::head_dim_key); + const data_T dk = 1.0/sqrt(CONFIG_T::head_dim_key); data_T QKij; data_T Product[CONFIG_T::seq_len]; -#pragma HLS ARRAY_PARTITION variable=K complete dim=2 -#pragma HLS ARRAY_PARTITION variable=Q complete dim=2 +#pragma HLS ARRAY_RESHAPE variable=K cyclic factor=2 dim=1 +#pragma HLS ARRAY_RESHAPE variable=Q cyclic factor=2 dim=1 +#pragma HLS ARRAY_RESHAPE variable=K complete dim=2 +#pragma HLS ARRAY_RESHAPE variable=Q complete dim=2 #pragma HLS ARRAY_PARTITION variable=Product complete + +#pragma HLS ARRAY_Partition variable=QK complete dim=2 + // for each row and column of AB row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { - #pragma HLS unroll factor=1 col: for(int j = 0; j < 
CONFIG_T::seq_len; ++j) { #pragma HLS PIPELINE +#pragma HLS UNROLL factor=4 // compute (QK)i,j QKij = 0; product: for(int k = 0; k < CONFIG_T::head_dim_key; ++k) { - QKij += Q[i][k]* K[j][k]; + QKij += Q[i][k] * K[j][k]; } - Product[j] = QKij/dk; + Product[j] = QKij * dk; +// QK[i][j] = QKij * dk; } // std::cout << "input to softmax: " << std::endl; // nnet::print_result(Product, std::cout); softmax(Product, QK[i]); - + // std::cout << "output from softmax: " << std::endl; // nnet::print_result( QK[i], std::cout); } @@ -77,12 +83,11 @@ void matrixmul( res_T S[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value], T head) // S: attention score { - #pragma HLS ARRAY RESHAPE variable=QK complete dim=2 - #pragma HLS ARRAY RESHAPE variable=V complete dim=1 + #pragma HLS ARRAY_Partition variable=QK complete dim=2 + #pragma HLS ARRAY_RESHAPE variable=V complete dim=1 // for each row and column of AB data_T Sij; row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { - #pragma HLS unroll factor=1 col: for(int j = 0; j < CONFIG_T::head_dim_value; ++j) { #pragma HLS PIPELINE // compute (S)i,j @@ -120,50 +125,35 @@ void multiheadattention( #pragma HLS ARRAY_PARTITION variable=dense_in complete dim=0 #pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 // partition the 1-dim -//#pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 +#pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 #pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 //#pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 -// #pragma HLS ARRAY_RESHAPE variable=q_proj complete dim=3 -// #pragma HLS ARRAY_RESHAPE variable=v_proj complete dim=3 -// #pragma HLS ARRAY_RESHAPE variable=k_proj complete dim=3 -// #pragma HLS ARRAY_RESHAPE variable=qk_mul complete dim=3 + // linear projection dense_for_each_head: for (int i=0; i < CONFIG_T::num_heads; ++i){ #pragma HLS DATAFLOW // or #pragma HLS unroll // less BRAM slower - // seq: for (int j=0; j (data_q +(CONFIG_T::feature_dim*j), q_proj[i][j], query_weight+(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); - // dense(data_vk+(CONFIG_T::feature_dim*j), v_proj[i][j], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); - // dense(data_vk+(CONFIG_T::feature_dim*j), k_proj[i][j], key_weight +(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), key_bias +(CONFIG_T::head_dim_key*i)); - // } - // std::cout << "input to q of head "<< i << std::endl; + seq1: for (int j=0; j (data_q +(CONFIG_T::feature_dim*j), std::cout); dense(data_q +(CONFIG_T::feature_dim*j), q_proj[i][j], query_weight+(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); } seq2: for (int j=0; j (data_vk+(CONFIG_T::feature_dim*j), v_proj[i][j], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); + dense(data_vk+(CONFIG_T::feature_dim*j), k_proj[i][j], key_weight +(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), key_bias +(CONFIG_T::head_dim_key*i)); } + nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); + seq3: for (int j=0; j (data_vk+(CONFIG_T::feature_dim*j), k_proj[i][j], key_weight +(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), key_bias +(CONFIG_T::head_dim_key*i)); + dense(data_vk+(CONFIG_T::feature_dim*j), v_proj[i][j], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); } - nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); 
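        // Together, these two calls implement scaled dot-product attention for head i:
        //   qk_mul[i] = softmax(q_proj[i] * k_proj[i]^T / sqrt(head_dim_key))      (seq_len x seq_len)
        //   dense_in[:, i*head_dim_value .. (i+1)*head_dim_value) = qk_mul[i] * v_proj[i]
        // matrixmul_transpose fuses the Q*K^T product with the row-wise softmax;
        // matrixmul writes head i's slice of the concatenated head outputs.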
nnet::matrixmul(qk_mul[i], v_proj[i], dense_in, i); } - - -// matrix_mult: for (int i=0; i < CONFIG_T::num_heads; ++i){ -// // #pragma HLS DEPENDENCE variable=qk_mul inter false -// nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); -// nnet::matrixmul(qk_mul[i], v_proj[i], dense_in, i); -// } - output_dense: for (int j=0; j (dense_in[j], std::cout); dense(dense_in[j], res+(CONFIG_T::feature_dim*j), attention_output_weight, attention_output_bias); From 130092dae08163000540fad5b412449ac9acab7c Mon Sep 17 00:00:00 2001 From: Ethan Date: Tue, 11 Oct 2022 22:08:41 -0700 Subject: [PATCH 18/55] update MHA, optimized --- .../nnet_utils/nnet_multiheadattention.h | 52 +++++++------------ 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index d695b8fd28..31b9d399ed 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -50,7 +50,7 @@ void matrixmul_transpose( #pragma HLS ARRAY_RESHAPE variable=Q complete dim=2 #pragma HLS ARRAY_PARTITION variable=Product complete -#pragma HLS ARRAY_Partition variable=QK complete dim=2 +#pragma HLS ARRAY_PARTITION variable=QK complete dim=2 // for each row and column of AB row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { @@ -63,15 +63,8 @@ void matrixmul_transpose( QKij += Q[i][k] * K[j][k]; } Product[j] = QKij * dk; -// QK[i][j] = QKij * dk; } - // std::cout << "input to softmax: " << std::endl; - // nnet::print_result(Product, std::cout); - softmax(Product, QK[i]); - - // std::cout << "output from softmax: " << std::endl; - // nnet::print_result( QK[i], std::cout); } } @@ -115,47 +108,40 @@ void multiheadattention( typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { - data_T q_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; - data_T v_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_value]; - data_T k_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; - data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; - data_T qk_mul_sm[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; + data_T q_proj[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + data_T v_proj[CONFIG_T::seq_len][CONFIG_T::head_dim_value]; + data_T v_row[CONFIG_T::head_dim_value]; + data_T k_proj[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + data_T qk_mul[CONFIG_T::seq_len][CONFIG_T::seq_len]; data_T dense_in[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value]; -#pragma HLS ARRAY_PARTITION variable=dense_in complete dim=0 - -#pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 // partition the 1-dim -#pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 -#pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 -//#pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 - - +#pragma HLS ARRAY_PARTITION variable=v_row complete dim=0 +#pragma HLS ARRAY_PARTITION variable=dense_in complete dim=2 +#pragma HLS ARRAY_RESHAPE variable=v_proj2 complete dim=1 // linear projection dense_for_each_head: for (int i=0; i < CONFIG_T::num_heads; ++i){ #pragma HLS DATAFLOW -// or #pragma HLS unroll // less BRAM slower + seq3: for (int j=0; j (data_vk+(CONFIG_T::feature_dim*j), v_row, value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); + for (int k=0; k (data_q +(CONFIG_T::feature_dim*j), 
std::cout); - dense(data_q +(CONFIG_T::feature_dim*j), q_proj[i][j], query_weight+(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); + dense(data_q +(CONFIG_T::feature_dim*j), q_proj[j], query_weight+(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); } seq2: for (int j=0; j (data_vk+(CONFIG_T::feature_dim*j), k_proj[i][j], key_weight +(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), key_bias +(CONFIG_T::head_dim_key*i)); - } - - nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); - - seq3: for (int j=0; j (data_vk+(CONFIG_T::feature_dim*j), v_proj[i][j], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); + dense(data_vk+(CONFIG_T::feature_dim*j), k_proj[j], key_weight +(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), key_bias +(CONFIG_T::head_dim_key*i)); } - nnet::matrixmul(qk_mul[i], v_proj[i], dense_in, i); + nnet::matrixmul_transpose(q_proj, k_proj, qk_mul); + nnet::matrixmul(qk_mul, v_proj, dense_in, i); } output_dense: for (int j=0; j (dense_in[j], std::cout); dense(dense_in[j], res+(CONFIG_T::feature_dim*j), attention_output_weight, attention_output_bias); } } From 09b0ba0681d2236a9f01c7921b57c3b987d0b44e Mon Sep 17 00:00:00 2001 From: Ethan Date: Sun, 23 Oct 2022 13:09:07 -0700 Subject: [PATCH 19/55] support resource --- hls4ml/backends/vivado/passes/resource_strategy.py | 8 ++++++-- hls4ml/model/layers.py | 3 --- .../templates/vivado/nnet_utils/nnet_multiheadattention.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/hls4ml/backends/vivado/passes/resource_strategy.py b/hls4ml/backends/vivado/passes/resource_strategy.py index 63e6e0b4db..ab8a7a361b 100644 --- a/hls4ml/backends/vivado/passes/resource_strategy.py +++ b/hls4ml/backends/vivado/passes/resource_strategy.py @@ -1,9 +1,8 @@ import numpy as np -from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, SeparableConv1D, SeparableConv2D +from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, SeparableConv1D, SeparableConv2D, MultiHeadAttention from hls4ml.model.optimizer import OptimizerPass - class ApplyResourceStrategy(OptimizerPass): '''Transposes the weights to use the dense_resource matrix multiply routine''' @@ -40,6 +39,11 @@ def transform(self, model, node): elif isinstance(node, (LSTM, GRU)): node.weights['weight'].data = np.transpose(node.weights['weight'].data) node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) + elif isinstance(node, (MultiHeadAttention)): + node.weights['key_weight'].data = np.transpose(node.weights['key_weight'].data, axes=[0, 2, 1]) + node.weights['query_weight'].data = np.transpose(node.weights['query_weight'].data, axes=[0, 2, 1]) + node.weights['value_weight'].data = np.transpose(node.weights['value_weight'].data, axes=[0, 2, 1]) + node.weights['attention_output_weight'].data = np.transpose(node.weights['attention_output_weight'].data, axes=[2, 0, 1]) else: raise Exception(f'Unexpected layer {node.class_name} with resource strategy') diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 6ad68f13b7..ff297088a9 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1458,9 +1458,6 @@ def initialize(self): vtype = 'weight' if lname in ['key', 'query', 'value']: data = data.transpose((1, 0, 2)) - data = data.transpose((0, 2, 1)) ### - if lname in ['attention_output']: - data = data.transpose((2,0,1)) ### else: vtype = 'bias' diff --git 
a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index 31b9d399ed..6bb70220de 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -117,7 +117,7 @@ void multiheadattention( data_T dense_in[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value]; #pragma HLS ARRAY_PARTITION variable=v_row complete dim=0 #pragma HLS ARRAY_PARTITION variable=dense_in complete dim=2 -#pragma HLS ARRAY_RESHAPE variable=v_proj2 complete dim=1 +#pragma HLS ARRAY_RESHAPE variable=v_proj complete dim=1 // linear projection dense_for_each_head: for (int i=0; i < CONFIG_T::num_heads; ++i){ From b49fffd82eda733f937976f054b3c3e72d4ed8ee Mon Sep 17 00:00:00 2001 From: Ethan Date: Sat, 26 Nov 2022 18:29:58 -0800 Subject: [PATCH 20/55] update --- .../vivado/passes/resource_strategy.py | 3 +- hls4ml/model/layers.py | 3 + .../vivado/nnet_utils/nnet_activation.h | 81 ------------------- .../nnet_utils/nnet_multiheadattention.h | 7 ++ hs_err_pid6927.log | 17 ++++ 5 files changed, 29 insertions(+), 82 deletions(-) create mode 100644 hs_err_pid6927.log diff --git a/hls4ml/backends/vivado/passes/resource_strategy.py b/hls4ml/backends/vivado/passes/resource_strategy.py index ab8a7a361b..b1c28bcfbc 100644 --- a/hls4ml/backends/vivado/passes/resource_strategy.py +++ b/hls4ml/backends/vivado/passes/resource_strategy.py @@ -7,7 +7,8 @@ class ApplyResourceStrategy(OptimizerPass): '''Transposes the weights to use the dense_resource matrix multiply routine''' def match(self, node): - node_matches = isinstance(node, (Dense, Conv1D, SeparableConv1D, Conv2D, SeparableConv2D, LSTM, GRU)) + + node_matches = isinstance(node, (Dense, Conv1D, SeparableConv1D, Conv2D, SeparableConv2D, LSTM, GRU, MultiHeadAttention)) is_resource_strategy = node.get_attr('strategy', '').lower() == 'resource' already_transformed = node.get_attr('_weights_transposed', False) is True diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index ff297088a9..817a3256be 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1458,6 +1458,9 @@ def initialize(self): vtype = 'weight' if lname in ['key', 'query', 'value']: data = data.transpose((1, 0, 2)) + # data = data.transpose((0, 2, 1)) ### + # if lname in ['attention_output']: + # data = data.transpose((2,0,1)) ### else: vtype = 'bias' diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index 9426f826b0..968ad07c97 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -294,87 +294,6 @@ template void init_invert_table_legacy(typename } } -// template -// void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) -// { -// int exp_range = 4; -// int inv_range = 16; -// // Initialize the lookup table -// #ifdef __HLS_SYN__ -// bool initialized = false; -// typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; -// typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; -// #else -// static bool initialized = false; -// static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size]; -// static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size]; -// #endif -// if (!initialized) { -// init_exp_table_legacy(exp_table); -// init_invert_table_legacy(invert_table); -// initialized = true; -// } - - - -// // Index into the lookup table based on data for 
exponentials -// typename CONFIG_T::table_t exp_res[CONFIG_T::n_in];// different, independent, fixed point precision -// typename CONFIG_T::table_t exp_diff_res;// different, independent, fixed point precision -// data_T data_cache[CONFIG_T::n_in]; -// int data_round; -// int index; -// // for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; -// // exp_diff_res = exp_table[index]; -// // } -// // exp_res[ii] += exp_diff_res; -// // } -// // } - -// // data_T one = 1.0; -// // //Second loop to invert -// // for (int ii=0; ii CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1; -// // //typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index]; -// // res[ii] = (res_T) invert_table[exp_res_index]; -// // } - -// for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; -// exp_diff_res = exp_table[index]; -// } -// exp_res[ii] += exp_diff_res; -// } - -// int exp_res_index = exp_res[ii]*CONFIG_T::table_size/inv_range; -// if (exp_res_index < 0) exp_res_index = 0; -// if (exp_res_index > CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1; -// res[ii] = (res_T) invert_table[exp_res_index]; -// } - -// } template void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index 6bb70220de..a715441111 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -119,6 +119,10 @@ void multiheadattention( #pragma HLS ARRAY_PARTITION variable=dense_in complete dim=2 #pragma HLS ARRAY_RESHAPE variable=v_proj complete dim=1 + // std::cout << "input to MHA: " << std::endl; + // nnet::print_result(data_q, std::cout); + // std::cout << " " << std::endl; + // linear projection dense_for_each_head: for (int i=0; i < CONFIG_T::num_heads; ++i){ #pragma HLS DATAFLOW @@ -140,10 +144,13 @@ void multiheadattention( nnet::matrixmul_transpose(q_proj, k_proj, qk_mul); nnet::matrixmul(qk_mul, v_proj, dense_in, i); } + // std::cout << "output from MHA: " << std::endl; output_dense: for (int j=0; j (dense_in[j], res+(CONFIG_T::feature_dim*j), attention_output_weight, attention_output_bias); + // nnet::print_result( res+(CONFIG_T::feature_dim*j), std::cout); } + // std::cout << " " << std::endl; } } diff --git a/hs_err_pid6927.log b/hs_err_pid6927.log new file mode 100644 index 0000000000..108140956b --- /dev/null +++ b/hs_err_pid6927.log @@ -0,0 +1,17 @@ +# +# An unexpected error has occurred (11) +# +Stack: +/lib/x86_64-linux-gnu/libc.so.6(+0x3ef10) [0x7fe2289e5f10] +/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(+0xb9519) [0x7fe22428e519] +/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(Tcl_ResetResult+0x10) [0x7fe22428ef20] +/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(+0x34429) [0x7fe224209429] +/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(Tcl_EvalEx+0x13) [0x7fe22420a0a3] +/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(Tcl_Eval+0x15) [0x7fe22420a0c5] +/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libhls_support.so(+0x75ac0) [0x7fe21094bac0] +/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(Tcl_Finalize+0x49) [0x7fe224249199] +/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(Tcl_Exit+0x4a) [0x7fe22424934a] +/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(Tcl_Main+0x24e) [0x7fe22427528e] +/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/librdi_common.so(+0x8b30cb) [0x7fe229dfd0cb] 
+/lib/x86_64-linux-gnu/libpthread.so.0(+0x76db) [0x7fe2221936db]
+/lib/x86_64-linux-gnu/libc.so.6(clone+0x3f) [0x7fe228ac861f]

From 5324a115cf017fde170f3e8f3db701fb27b655a6 Mon Sep 17 00:00:00 2001
From: Ethan 
Date: Fri, 30 Dec 2022 13:31:54 -0800
Subject: [PATCH 21/55] dense_multi_dim_support

---
 .../backends/vivado/passes/core_templates.py  |  1 +
 hls4ml/converters/keras/core.py               |  3 ++
 hls4ml/model/layers.py                        |  2 ++
 .../templates/vivado/nnet_utils/nnet_dense.h  | 26 +++++++++-----
 .../vivado/nnet_utils/nnet_dense_seq.h        | 34 +++++++++++++++++++
 5 files changed, 57 insertions(+), 9 deletions(-)
 create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h

diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py
index 268293dd1e..0ccf2c22a1 100644
--- a/hls4ml/backends/vivado/passes/core_templates.py
+++ b/hls4ml/backends/vivado/passes/core_templates.py
@@ -8,6 +8,7 @@
 dense_config_template = """struct config{index} : nnet::dense_config {{
     static const unsigned n_in = {n_in};
     static const unsigned n_out = {n_out};
+    static const unsigned seq_len = {seq_len};
     static const unsigned io_type = nnet::{iotype};
     static const unsigned strategy = nnet::{strategy};
     static const unsigned reuse_factor = {reuse};
diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py
index ca7d0b3541..3c4392ac22 100644
--- a/hls4ml/converters/keras/core.py
+++ b/hls4ml/converters/keras/core.py
@@ -47,6 +47,9 @@ def parse_dense_layer(keras_layer, input_names, input_shapes, data_reader):
         layer['bias_quantizer'] = None
     output_shape = input_shapes[0][:]
     output_shape[-1] = layer['n_out']
+    if len(input_shapes[0]) == 3:
+        layer['seq_len'] = output_shape[-2]
+    else: layer['seq_len'] = 1
 
     return layer, output_shape
 
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index 817a3256be..f16b0e8845 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -399,6 +399,8 @@ class Dense(Layer):
     _expected_attributes = [
         Attribute('n_in'),
         Attribute('n_out'),
+        Attribute('seq_len'),
+
         WeightAttribute('weight'),
         WeightAttribute('bias'),
         TypeAttribute('weight'),
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h
index c5155d8485..5796e123e7 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h
@@ -5,6 +5,7 @@
 #include "nnet_common.h"
 #include "nnet_dense_latency.h"
 #include "nnet_dense_resource.h"
+#include "nnet_dense_seq.h"
 #include "nnet_helpers.h"
 #include "nnet_mult.h"
 #include 
@@ -32,18 +33,25 @@ struct dense_config {
     template  using product = nnet::product::mult;
 };
 
-template 
-void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
-           typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
-           typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
+template 
+void dense(
+    data_T data[CONFIG_T::n_in*CONFIG_T::seq_len],
+    res_T res[CONFIG_T::n_out*CONFIG_T::seq_len],
+    typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_out])
+{
     #pragma HLS inline
-    if (CONFIG_T::strategy == nnet::latency) {
-        dense_latency(data, res, weights, biases);
-    } else {
-        dense_resource(data, res, weights, biases);
+    if (CONFIG_T::seq_len > 1) {
+        dense_seq(data, res, weights, biases);
+    } else {
+        if (CONFIG_T::strategy == nnet::latency) {
+            dense_latency(data, res, weights, biases);
+        } else {
+            dense_resource(data, res, weights, biases);
+        }
     }
 }
 
-} // 
namespace nnet
+}

 #endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h
new file mode 100644
index 0000000000..7dc21f5d4c
--- /dev/null
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h
@@ -0,0 +1,34 @@
+#ifndef NNET_DENSE_SEQ_H_
+#define NNET_DENSE_SEQ_H_
+
+#include "nnet_common.h"
+#include "nnet_mult.h"
+#include "nnet_helpers.h"
+#include "hls_stream.h"
+#include 
+
+namespace nnet {
+
+template 
+void dense_seq(
+    data_T data[CONFIG_T::n_in*CONFIG_T::seq_len],
+    res_T res[CONFIG_T::n_out*CONFIG_T::seq_len],
+    typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out],
+    typename CONFIG_T::bias_t biases[CONFIG_T::n_out])
+{
+    #pragma HLS inline
+    if (CONFIG_T::strategy == nnet::latency) {
+        for (int j=0; j (data+(CONFIG_T::n_in*j), res+(CONFIG_T::n_out*j), weights, biases);
+        }
+    } else {
+        for (int j=0; j (data+(CONFIG_T::n_in*j), res+(CONFIG_T::n_out*j), weights, biases);
+        }
+    }
+
+}
+
+}
+
+#endif
From bf8c78868c46d6e1f4cb0b55ef108eff81b6a650 Mon Sep 17 00:00:00 2001
From: Ethan 
Date: Sun, 1 Jan 2023 14:47:15 -0800
Subject: [PATCH 22/55] parallel execution of dense

---
 .../vivado/passes/transformer_templates.py    |   5 +-
 .../nnet_utils/nnet_multiheadattention.h      | 118 ++++++++++++++----
 2 files changed, 96 insertions(+), 27 deletions(-)

diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py
index 6ec0baefe0..8881a496c6 100644
--- a/hls4ml/backends/vivado/passes/transformer_templates.py
+++ b/hls4ml/backends/vivado/passes/transformer_templates.py
@@ -7,6 +7,7 @@
 mult_config_template = """struct config{index}_{mNum} : nnet::dense_config {{
     static const unsigned n_in = {n_in};
     static const unsigned n_out = {n_out};
+    static const unsigned seq_len = {seq_len};
     static const unsigned strategy = nnet::{strategy};
     static const unsigned reuse_factor = {reuse};
     static const unsigned n_zeros = {nzeros};
@@ -82,6 +83,7 @@ def format(self, node):
         mult_params1['mNum'] = '1'
         mult_params1['n_in'] = node.get_attr('feature_dim')
         mult_params1['n_out'] = node.get_attr('head_dim_key')
+        mult_params1['seq_len'] = 1
         mult_params1['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('query_weight').type.precision)
         mult_params1['reuse'] = params['reuse']
         mult_params1['index'] = str(node.index)
@@ -93,6 +95,7 @@ def format(self, node):
         mult_params2['mNum'] = '2'
         mult_params2['n_in'] = node.get_attr('head_dim_value') * node.get_attr('num_heads')
         mult_params2['n_out'] = node.get_attr('feature_dim')
+        mult_params2['seq_len'] = 1
         mult_params2['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('attention_output_weight').type.precision)
         mult_params2['reuse'] = params['reuse']
         mult_params2['index'] = str(node.index)
@@ -103,7 +106,7 @@ def format(self, node):
         act_params = self._default_config_params(node)
         act_params['n_in'] = node.get_attr('seq_len')
         act_params['type'] = 'softmax'
-        act_params['implementation'] = 'legacy' #latency,stable not work, legacy works
+        act_params['implementation'] = 'legacy'  # in MHA: latency and stable implementations do not work; legacy does
         act_config = self.activ1_template.format(**act_params)
 
         return mult_config1 + '\n' + mult_config2 + '\n' + act_config + '\n' + mha_config
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
index a715441111..1010a5bed5 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -93,6 +93,52 @@ void matrixmul( } } +template +void dense_value( + data_T data_v[CONFIG_T::seq_len * CONFIG_T::feature_dim], + data_T v_proj [CONFIG_T::seq_len][CONFIG_T::head_dim_value], + typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value], + typename CONFIG_T::bias_t value_bias[CONFIG_T::head_dim_value]) +{ + data_T v_row[CONFIG_T::head_dim_value]; + #pragma HLS ARRAY_PARTITION variable=v_row complete dim=0 + #pragma HLS ARRAY_RESHAPE variable=v_proj complete dim=1 + #pragma HLS function_instantiate variable=value_weight,value_bias + v_h: for (int j=0; j (data_v+(CONFIG_T::feature_dim*j), v_row, value_weight, value_bias); + for (int k=0; k +void dense_query( + data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], + data_T q_proj[CONFIG_T::seq_len][CONFIG_T::head_dim_key], + typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value], + typename CONFIG_T::bias_t query_bias[CONFIG_T::head_dim_value]) +{ + #pragma HLS function_instantiate variable=query_weight,query_bias + q_h: for (int j=0; j (data_q +(CONFIG_T::feature_dim*j), q_proj[j], query_weight, query_bias); + } +} + +template +void dense_key( + data_T data_k[CONFIG_T::seq_len * CONFIG_T::feature_dim], + data_T k_proj[CONFIG_T::seq_len][CONFIG_T::head_dim_key], + typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value], + typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_value]) +{ + #pragma HLS function_instantiate variable=key_weight,key_bias + k_h: for (int j=0; j (data_k+(CONFIG_T::feature_dim*j), k_proj[j], key_weight, key_bias); + } +} + + template void multiheadattention( data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], @@ -108,42 +154,62 @@ void multiheadattention( typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { - data_T q_proj[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; - data_T v_proj[CONFIG_T::seq_len][CONFIG_T::head_dim_value]; - data_T v_row[CONFIG_T::head_dim_value]; - data_T k_proj[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; - data_T qk_mul[CONFIG_T::seq_len][CONFIG_T::seq_len]; + data_T q_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + data_T v_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_value]; + data_T k_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; + + // data_T q_proj0[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + // data_T v_proj0[CONFIG_T::seq_len][CONFIG_T::head_dim_value]; + // data_T k_proj0[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + // data_T qk_mul0[CONFIG_T::seq_len][CONFIG_T::seq_len]; + + // data_T q_proj1[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + // data_T v_proj1[CONFIG_T::seq_len][CONFIG_T::head_dim_value]; + // data_T k_proj1[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + // data_T qk_mul1[CONFIG_T::seq_len][CONFIG_T::seq_len]; + + // data_T q_proj2[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + // data_T v_proj2[CONFIG_T::seq_len][CONFIG_T::head_dim_value]; + // data_T k_proj2[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + // data_T qk_mul2[CONFIG_T::seq_len][CONFIG_T::seq_len]; data_T dense_in[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value]; -#pragma HLS ARRAY_PARTITION variable=v_row complete dim=0 +#pragma HLS DATAFLOW #pragma HLS 
ARRAY_PARTITION variable=dense_in complete dim=2 -#pragma HLS ARRAY_RESHAPE variable=v_proj complete dim=1 - +#pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 +#pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 +#pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 +#pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 // std::cout << "input to MHA: " << std::endl; // nnet::print_result(data_q, std::cout); // std::cout << " " << std::endl; // linear projection - dense_for_each_head: for (int i=0; i < CONFIG_T::num_heads; ++i){ -#pragma HLS DATAFLOW - seq3: for (int j=0; j (data_vk+(CONFIG_T::feature_dim*j), v_row, value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); - for (int k=0; k (data_q +(CONFIG_T::feature_dim*j), q_proj[j], query_weight+(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); - } - - seq2: for (int j=0; j (data_vk+(CONFIG_T::feature_dim*j), k_proj[j], key_weight +(CONFIG_T::head_dim_key *CONFIG_T::feature_dim*i), key_bias +(CONFIG_T::head_dim_key*i)); - } + dense_value(data_vk, v_proj[0], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*0), value_bias+(CONFIG_T::head_dim_value*0)); + dense_value(data_vk, v_proj[1], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*1), value_bias+(CONFIG_T::head_dim_value*1)); + dense_value(data_vk, v_proj[2], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*2), value_bias+(CONFIG_T::head_dim_value*2)); + + dense_query(data_q, q_proj[0], query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*0), query_bias+(CONFIG_T::head_dim_key*0)); + dense_query(data_q, q_proj[1], query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*1), query_bias+(CONFIG_T::head_dim_key*1)); + dense_query(data_q, q_proj[2], query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*2), query_bias+(CONFIG_T::head_dim_key*2)); - nnet::matrixmul_transpose(q_proj, k_proj, qk_mul); - nnet::matrixmul(qk_mul, v_proj, dense_in, i); + dense_key(data_vk, k_proj[0], key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*0), key_bias+(CONFIG_T::head_dim_key*0)); + dense_key(data_vk, k_proj[1], key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*1), key_bias+(CONFIG_T::head_dim_key*1)); + dense_key(data_vk, k_proj[2], key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*2), key_bias+(CONFIG_T::head_dim_key*2)); + + nnet::matrixmul_transpose(q_proj[0], k_proj[0], qk_mul[0]); + nnet::matrixmul_transpose(q_proj[1], k_proj[1], qk_mul[1]); + nnet::matrixmul_transpose(q_proj[2], k_proj[2], qk_mul[2]); + // nnet::matrixmul(qk_mul[0], v_proj[0], dense_in, 0); + // nnet::matrixmul(qk_mul[1], v_proj[1], dense_in, 1); + // nnet::matrixmul(qk_mul[2], v_proj[2], dense_in, 2); + + dense_for_each_head: for (int i=0; i < CONFIG_T::num_heads; ++i){ + // nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); + nnet::matrixmul(qk_mul[i], v_proj[i], dense_in, i); } + // std::cout << "output from MHA: " << std::endl; output_dense: for (int j=0; j Date: Fri, 27 Jan 2023 07:47:46 -0800 Subject: [PATCH 23/55] updates --- .../nnet_utils/nnet_multiheadattention.h | 157 +++++++++++------- 1 file changed, 94 insertions(+), 63 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index 1010a5bed5..c00124fd7e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ 
b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -37,30 +37,35 @@ struct multiheadattention_config template void matrixmul_transpose( - data_T Q[CONFIG_T::seq_len][CONFIG_T::head_dim_key], - data_T K[CONFIG_T::seq_len][CONFIG_T::head_dim_key], + hls::stream Q[CONFIG_T::head_dim_key], + data_T K[CONFIG_T::seq_len][CONFIG_T::head_dim_key], res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K { const data_T dk = 1.0/sqrt(CONFIG_T::head_dim_key); - data_T QKij; + data_T QKij, QK_1; + data_T Qi[CONFIG_T::head_dim_key]; data_T Product[CONFIG_T::seq_len]; -#pragma HLS ARRAY_RESHAPE variable=K cyclic factor=2 dim=1 -#pragma HLS ARRAY_RESHAPE variable=Q cyclic factor=2 dim=1 +#pragma HLS ARRAY_RESHAPE variable=K cyclic factor=4 dim=1 #pragma HLS ARRAY_RESHAPE variable=K complete dim=2 -#pragma HLS ARRAY_RESHAPE variable=Q complete dim=2 +#pragma HLS ARRAY_RESHAPE variable=Qi complete dim=0 #pragma HLS ARRAY_PARTITION variable=Product complete #pragma HLS ARRAY_PARTITION variable=QK complete dim=2 // for each row and column of AB row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { + q: for(int q_i = 0; q_i < CONFIG_T::head_dim_key; ++q_i){ + #pragma HLS UNROLL + Qi[q_i]=Q[q_i].read(); + } col: for(int j = 0; j < CONFIG_T::seq_len; ++j) { #pragma HLS PIPELINE -#pragma HLS UNROLL factor=4 +#pragma HLS UNROLL factor=8 // compute (QK)i,j QKij = 0; product: for(int k = 0; k < CONFIG_T::head_dim_key; ++k) { - QKij += Q[i][k] * K[j][k]; + QK_1 = Qi[k] * K[j][k]; + QKij += QK_1; } Product[j] = QKij * dk; } @@ -69,26 +74,30 @@ void matrixmul_transpose( } -template +template void matrixmul( data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], data_T V[CONFIG_T::seq_len][CONFIG_T::head_dim_value], - res_T S[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value], - T head) // S: attention score + hls::stream S[CONFIG_T::head_dim_value]) // S: attention score { +// data_T Product[CONFIG_T::head_dim_value]; #pragma HLS ARRAY_Partition variable=QK complete dim=2 + #pragma HLS ARRAY_Partition variable=S complete dim=1 #pragma HLS ARRAY_RESHAPE variable=V complete dim=1 +// #pragma HLS ARRAY_PARTITION variable=Product complete +// #pragma HLS ARRAY_Partition variable=V cyclic factor=2 dim=2 // // for each row and column of AB data_T Sij; row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { col: for(int j = 0; j < CONFIG_T::head_dim_value; ++j) { #pragma HLS PIPELINE +#pragma HLS UNROLL factor=2 // compute (S)i,j Sij = 0; product: for(int k = 0; k < CONFIG_T::seq_len; ++k) { Sij += QK[i][k] * V[k][j]; } - S[i][CONFIG_T::head_dim_value*head+j] = Sij; + S[j].write(Sij); } } } @@ -103,8 +112,9 @@ void dense_value( data_T v_row[CONFIG_T::head_dim_value]; #pragma HLS ARRAY_PARTITION variable=v_row complete dim=0 #pragma HLS ARRAY_RESHAPE variable=v_proj complete dim=1 - #pragma HLS function_instantiate variable=value_weight,value_bias +// #pragma HLS function_instantiate variable=value_weight,value_bias v_h: for (int j=0; j (data_v+(CONFIG_T::feature_dim*j), v_row, value_weight, value_bias); for (int k=0; k void dense_query( data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], - data_T q_proj[CONFIG_T::seq_len][CONFIG_T::head_dim_key], - typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value], - typename CONFIG_T::bias_t query_bias[CONFIG_T::head_dim_value]) + hls::stream q_proj[CONFIG_T::head_dim_key], + typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], + typename CONFIG_T::bias_t 
query_bias[CONFIG_T::head_dim_key]) { - #pragma HLS function_instantiate variable=query_weight,query_bias +// #pragma HLS function_instantiate variable=query_weight,query_bias + data_T proj[CONFIG_T::head_dim_key]; + #pragma HLS ARRAY_PARTITION variable=proj complete dim=0 + #pragma HLS ARRAY_PARTITION variable=q_proj complete dim=0 q_h: for (int j=0; j (data_q +(CONFIG_T::feature_dim*j), q_proj[j], query_weight, query_bias); +#pragma HLS DATAFLOW + dense(data_q +(CONFIG_T::feature_dim*j), proj, query_weight, query_bias); + update_proj: for (int i=0; i void dense_key( data_T data_k[CONFIG_T::seq_len * CONFIG_T::feature_dim], data_T k_proj[CONFIG_T::seq_len][CONFIG_T::head_dim_key], - typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value], - typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_value]) + typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], + typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_key]) { - #pragma HLS function_instantiate variable=key_weight,key_bias + data_T k_row[CONFIG_T::head_dim_key]; +#pragma HLS ARRAY_PARTITION variable=k_row complete dim=0 +// #pragma HLS function_instantiate variable=key_weight,key_bias k_h: for (int j=0; j (data_k+(CONFIG_T::feature_dim*j), k_proj[j], key_weight, key_bias); +#pragma HLS DATAFLOW + dense(data_k+(CONFIG_T::feature_dim*j), k_row, key_weight, key_bias); + //k_proj[j] = k_row; + for (int k=0; k +void dense_out( + hls::stream data_in[CONFIG_T::num_heads][CONFIG_T::head_dim_value], + res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim], + typename CONFIG_T::weight_t attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value * CONFIG_T::feature_dim], + typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim]) +{ +// #pragma HLS function_instantiate variable=query_weight,query_bias + data_T mat_res_con[CONFIG_T::num_heads*CONFIG_T::head_dim_value]; + #pragma HLS ARRAY_PARTITION variable=mat_res_con complete dim=0 + output_dense: for (int k=0; k (mat_res_con, res+(CONFIG_T::feature_dim*k), attention_output_weight, attention_output_bias); + // nnet::print_result( res+(CONFIG_T::feature_dim*j), std::cout); + } +} + + template void multiheadattention( data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], @@ -154,68 +203,50 @@ void multiheadattention( typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { - data_T q_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; + hls::stream q_proj[CONFIG_T::num_heads][CONFIG_T::head_dim_key]; data_T v_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_value]; data_T k_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; +// data_T mat_res_con[CONFIG_T::num_heads*CONFIG_T::head_dim_value]; +// data_T dense_in[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value]; + hls::stream matr_out[CONFIG_T::num_heads][CONFIG_T::head_dim_value]; - // data_T q_proj0[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; - // data_T v_proj0[CONFIG_T::seq_len][CONFIG_T::head_dim_value]; - // data_T k_proj0[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; - // data_T qk_mul0[CONFIG_T::seq_len][CONFIG_T::seq_len]; - - // data_T q_proj1[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; - // data_T v_proj1[CONFIG_T::seq_len][CONFIG_T::head_dim_value]; - // data_T k_proj1[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; - // data_T 
qk_mul1[CONFIG_T::seq_len][CONFIG_T::seq_len]; - - // data_T q_proj2[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; - // data_T v_proj2[CONFIG_T::seq_len][CONFIG_T::head_dim_value]; - // data_T k_proj2[CONFIG_T::seq_len][CONFIG_T::head_dim_key]; - // data_T qk_mul2[CONFIG_T::seq_len][CONFIG_T::seq_len]; - - data_T dense_in[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value]; #pragma HLS DATAFLOW -#pragma HLS ARRAY_PARTITION variable=dense_in complete dim=2 #pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 #pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 #pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 #pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 +#pragma HLS ARRAY_PARTITION variable=matr_out complete dim=1 // std::cout << "input to MHA: " << std::endl; // nnet::print_result(data_q, std::cout); // std::cout << " " << std::endl; // linear projection - dense_value(data_vk, v_proj[0], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*0), value_bias+(CONFIG_T::head_dim_value*0)); - dense_value(data_vk, v_proj[1], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*1), value_bias+(CONFIG_T::head_dim_value*1)); - dense_value(data_vk, v_proj[2], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*2), value_bias+(CONFIG_T::head_dim_value*2)); - - dense_query(data_q, q_proj[0], query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*0), query_bias+(CONFIG_T::head_dim_key*0)); - dense_query(data_q, q_proj[1], query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*1), query_bias+(CONFIG_T::head_dim_key*1)); - dense_query(data_q, q_proj[2], query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*2), query_bias+(CONFIG_T::head_dim_key*2)); - - dense_key(data_vk, k_proj[0], key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*0), key_bias+(CONFIG_T::head_dim_key*0)); - dense_key(data_vk, k_proj[1], key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*1), key_bias+(CONFIG_T::head_dim_key*1)); - dense_key(data_vk, k_proj[2], key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*2), key_bias+(CONFIG_T::head_dim_key*2)); - + for (int i=0;i(data_vk, v_proj[i], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); + } + for (int i=0;i(data_q, q_proj[i], query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); + } + for (int i=0;i(data_vk, k_proj[i], key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), key_bias+(CONFIG_T::head_dim_key*i)); + } + nnet::matrixmul_transpose(q_proj[0], k_proj[0], qk_mul[0]); nnet::matrixmul_transpose(q_proj[1], k_proj[1], qk_mul[1]); nnet::matrixmul_transpose(q_proj[2], k_proj[2], qk_mul[2]); - // nnet::matrixmul(qk_mul[0], v_proj[0], dense_in, 0); - // nnet::matrixmul(qk_mul[1], v_proj[1], dense_in, 1); - // nnet::matrixmul(qk_mul[2], v_proj[2], dense_in, 2); - dense_for_each_head: for (int i=0; i < CONFIG_T::num_heads; ++i){ - // nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); - nnet::matrixmul(qk_mul[i], v_proj[i], dense_in, i); + + maxtrixmul2: for (int i=0; i < CONFIG_T::num_heads; ++i){ +#pragma HLS UNROLL + nnet::matrixmul(qk_mul[i], v_proj[i], matr_out[i]);//stream } + dense_out(matr_out, res, attention_output_weight, attention_output_bias); // std::cout << "output from MHA: " << std::endl; - - output_dense: for (int j=0; j (dense_in[j], res+(CONFIG_T::feature_dim*j), attention_output_weight, attention_output_bias); - // nnet::print_result( 
res+(CONFIG_T::feature_dim*j), std::cout); - } // std::cout << " " << std::endl; } } From 2472b7dca5eb3f7bd9e228c13b469c406d0bf9e9 Mon Sep 17 00:00:00 2001 From: Ethan Date: Wed, 15 Feb 2023 12:07:53 -0800 Subject: [PATCH 24/55] add_layerNorm_support --- .../backends/vivado/passes/core_templates.py | 48 +++- hls4ml/backends/vivado/vivado_backend.py | 7 + hls4ml/converters/keras/core.py | 29 ++- hls4ml/model/layers.py | 138 +++++++----- .../vivado/nnet_utils/nnet_layernorm.h | 165 ++++++++++++++ .../nnet_utils/nnet_multiheadattention.h | 208 ++++++++++++++---- 6 files changed, 493 insertions(+), 102 deletions(-) create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index 0ccf2c22a1..772492f68b 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -1,6 +1,6 @@ from hls4ml.backends.backend import get_backend from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate -from hls4ml.model.layers import Activation, BatchNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax +from hls4ml.model.layers import Activation, BatchNormalization, LayerNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax from hls4ml.model.optimizer.passes.hgq_proxy_model import UnaryLUT # Dense templates @@ -98,7 +98,53 @@ class BatchNormalizationFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(BatchNormalization, include_header=batchnorm_include_list) self.template = batchnorm_function_template + + def format(self, node): + params = self._default_function_params(node) + params['scale'] = node.get_weights('scale').name + params['bias'] = node.get_weights('bias').name + + return self.template.format(**params) + + +# LayerNormalization templates +layernorm_config_template = """struct config{index} : nnet::layernorm_config {{ + static const unsigned n_in = {n_in}; + static const unsigned seq_len = {seq_len}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; + typedef {bias_t.name} bias_t; + typedef {scale_t.name} scale_t; + typedef {table_t.name} table_t; + template + using product = nnet::product::{product_type}; +}};\n""" + +layernorm_function_template = 'nnet::layernormalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});' + +layernorm_include_list = ['nnet_utils/nnet_layernorm.h'] + +class LayerNormalizationConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(LayerNormalization) + self.template = layernorm_config_template + + def format(self, node): + params = self._default_config_params(node) + params['n_in'] = node.get_input_variable().size_cpp() + params['seq_len'] = node.get_attr('seq_len') + params['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('scale').type.precision) + + return self.template.format(**params) + +class LayerNormalizationFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(LayerNormalization, include_header=layernorm_include_list) + self.template = layernorm_function_template + def format(self, node): params = self._default_function_params(node) params['scale'] = node.get_weights('scale').name diff --git 
a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 298218734f..d2cfd79bfc 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -511,6 +511,13 @@ def init_softmax(self, layer): len(layer.get_input_variable().shape) == 1 ), 'Softmax with io_parallel strategy cannot be used on multidimensional tensors.' + @layer_optimizer(LayerNormalization) + def init_layernormalization(self, layer): + if 'table_t' not in layer.attributes: + layer.set_attr('table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=32, integer=5))) + if 'table_size' not in layer.attributes: + layer.set_attr('table_size', 2048) + @layer_optimizer(Embedding) def init_embed(self, layer): if layer.attributes['n_in'] is None: diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index 3c4392ac22..e8da1fc102 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -48,8 +48,8 @@ def parse_dense_layer(keras_layer, input_names, input_shapes, data_reader): output_shape = input_shapes[0][:] output_shape[-1] = layer['n_out'] if len(input_shapes[0])==3: - layer['seq_len'] = output_shape[-2]; - else: layer['seq_len'] = 1; + layer['seq_len'] = output_shape[-2] + else: layer['seq_len'] = 1 return layer, output_shape @@ -123,6 +123,31 @@ def parse_batchnorm_layer(keras_layer, input_names, input_shapes, data_reader): return layer, [shape for shape in input_shapes[0]] +@keras_handler('LayerNormalization') +def parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader, config): + assert('LayerNormalization' in keras_layer['class_name']) + + layer = parse_default_keras_layer(keras_layer, input_names) + + in_size = 1 + for dim in input_shapes[0][1:]: + in_size *= dim + + layer['axis'] = keras_layer['config']['axis'] if (keras_layer['config']['axis'][0]==2) else False + if layer['axis'] is False: + raise Exception('assigning the axis is not currently supported by hls4ml, only axis 2 is supported') + + if not((len(input_shapes[0])) == 3 ): + raise Exception('input size is not currently supported by hls4ml, only dim3 is supported') + if len(input_shapes[0])==3: + layer['seq_len'] = input_shapes[0][-2] + else: layer['seq_len'] = 1 + layer['n_in'] = in_size + layer['n_out'] = layer['n_in'] + + return layer, [shape for shape in input_shapes[0]] + + @keras_handler('Embedding') def parse_embedding_layer(keras_layer, input_names, input_shapes, data_reader): assert 'Embedding' in keras_layer['class_name'] diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index f16b0e8845..742d03a5ac 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -928,6 +928,34 @@ def initialize(self): self.add_weights_variable(name='scale', var_name='s{index}', data=scale) self.add_weights_variable(name='bias', var_name='b{index}', data=bias) +class LayerNormalization(Layer): + _expected_attributes = [ + Attribute('n_in'), + # Attribute('axis', default=-1), + Attribute('seq_len'), + WeightAttribute('scale'), + WeightAttribute('bias'), + + TypeAttribute('scale'), + TypeAttribute('bias'), + ] + + def initialize(self): + inp = self.get_input_variable() + shape = inp.shape + dims = inp.dim_names + self.add_output_variable(shape, dims) + + gamma = self.model.get_weights_data(self.name, 'gamma') + beta = self.model.get_weights_data(self.name, 'beta') + + scale = gamma + bias = beta + + self.add_weights_variable(name='scale', var_name='s{index}', data=scale) + 
self.add_weights_variable(name='bias', var_name='b{index}', data=bias) + + class Merge(Layer): def initialize(self): @@ -1475,66 +1503,58 @@ def initialize(self): self.add_output_variable(shape, dims) layer_map = { - 'Input': Input, - 'InputLayer': Input, - 'Activation': Activation, - 'QActivation': Activation, - 'LeakyReLU': ParametrizedActivation, - 'ThresholdedReLU': ParametrizedActivation, - 'ELU': ParametrizedActivation, - 'PReLU': PReLU, - 'Softmax': Softmax, - 'TernaryTanh': TernaryTanh, - 'HardActivation': HardActivation, - 'Reshape': Reshape, - 'Dense': Dense, - 'BinaryDense': Dense, - 'TernaryDense': Dense, - 'QDense': Dense, - 'Conv1D': Conv1D, - 'QConv1D': Conv1D, - 'Conv2D': Conv2D, - 'BinaryConv2D': Conv2D, - 'QConv2D': Conv2D, - 'QConv2DBatchnorm': Conv2DBatchnorm, - 'SeparableConv1D': SeparableConv1D, - 'QSeparableConv1D': SeparableConv1D, - 'DepthwiseConv1D': DepthwiseConv1D, - 'SeparableConv2D': SeparableConv2D, - 'QSeparableConv2D': SeparableConv2D, - 'DepthwiseConv2D': DepthwiseConv2D, - 'QDepthwiseConv2D': DepthwiseConv2D, - 'BatchNormalization': BatchNormalization, - 'QBatchNormalization': BatchNormalization, - 'MaxPooling1D': Pooling1D, - 'AveragePooling1D': Pooling1D, - 'MaxPooling2D': Pooling2D, - 'AveragePooling2D': Pooling2D, - 'GlobalMaxPooling1D': GlobalPooling1D, - 'GlobalAveragePooling1D': GlobalPooling1D, - 'GlobalMaxPooling2D': GlobalPooling2D, - 'GlobalAveragePooling2D': GlobalPooling2D, - 'ZeroPadding1D': ZeroPadding1D, - 'ZeroPadding2D': ZeroPadding2D, - 'Merge': Merge, - 'Dot': Dot, - 'Concatenate': Concatenate, - 'Resize': Resize, - 'UpSampling1D': Resize, - 'UpSampling2D': Resize, - 'Transpose': Transpose, - 'Embedding': Embedding, - 'SimpleRNN': SimpleRNN, - 'LSTM': LSTM, - 'GRU': GRU, - 'QSimpleRNN': SimpleRNN, - 'QLSTM': LSTM, - 'QGRU': GRU, - 'GarNet': GarNet, - 'GarNetStack': GarNetStack, - 'LayerGroup': LayerGroup, - 'SymbolicExpression': SymbolicExpression, + 'Input' : Input, + 'InputLayer' : Input, + 'Activation' : Activation, + 'QActivation' : Activation, + 'LeakyReLU' : ParametrizedActivation, + 'ThresholdedReLU' : ParametrizedActivation, + 'ELU' : ParametrizedActivation, + 'PReLU' : PReLU, + 'Softmax' : Softmax, + 'TernaryTanh' : TernaryTanh, + 'Reshape' : Reshape, + 'Dense' : Dense, + 'BinaryDense' : Dense, + 'TernaryDense' : Dense, + 'QDense' : Dense, + 'Conv1D' : Conv1D, + 'QConv1D' : Conv1D, + 'Conv2D' : Conv2D, + 'BinaryConv2D' : Conv2D, + 'QConv2D' : Conv2D, + 'QConv2DBatchnorm' : Conv2DBatchnorm, + 'SeparableConv1D' : SeparableConv1D, + 'SeparableConv2D' : SeparableConv2D, + 'DepthwiseConv2D' : DepthwiseConv2D, + 'BatchNormalization' : BatchNormalization, + 'QBatchNormalization' : BatchNormalization, + 'MaxPooling1D' : Pooling1D, + 'AveragePooling1D' : Pooling1D, + 'MaxPooling2D' : Pooling2D, + 'AveragePooling2D' : Pooling2D, + 'GlobalMaxPooling1D' : GlobalPooling1D, + 'GlobalAveragePooling1D' : GlobalPooling1D, + 'GlobalMaxPooling2D' : GlobalPooling2D, + 'GlobalAveragePooling2D' : GlobalPooling2D, + 'ZeroPadding1D' : ZeroPadding1D, + 'ZeroPadding2D' : ZeroPadding2D, + 'Merge' : Merge, + 'Dot' : Dot, + 'Concatenate' : Concatenate, + 'Resize' : Resize, + 'UpSampling1D' : Resize, + 'UpSampling2D' : Resize, + 'Transpose' : Transpose, + 'Embedding' : Embedding, + 'SimpleRNN' : SimpleRNN, + 'LSTM' : LSTM, + 'GRU' : GRU, + 'GarNet' : GarNet, + 'GarNetStack' : GarNetStack, 'MultiHeadAttention' : MultiHeadAttention, + 'LayerNormalization' : LayerNormalization, + # TensorFlow-specific layers: 'BiasAdd': BiasAdd, diff --git 
a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h new file mode 100644 index 0000000000..d49053d083 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h @@ -0,0 +1,165 @@ +// +// rfnoc-hls-neuralnet: Vivado HLS code for neural-net building blocks +// +// Copyright (C) 2017 EJ Kreinar +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// + +#ifndef NNET_BATCHNORM_H_ +#define NNET_BATCHNORM_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" +#include "hls_stream.h" +#include +#include + +namespace nnet { + +struct layernorm_config +{ + // Internal data type definitions + typedef float bias_t; + typedef float scale_t; + + // Layer Sizes + static const unsigned n_in = 20; + static const unsigned seq_len = 4; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + + template + using product = nnet::product::mult; +}; + +template +void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) +{ + float inv_range = 0.01; + // Inversion function: + // result = 1/sqrt(x) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range 0 to +2) + float in_val = inv_range*ii/float(N_TABLE); + // Next, compute lookup table function + if (in_val > 0.0) table_out[ii] = 1.0/sqrt(in_val); + else table_out[ii] = 0.0; + } +} + +template +void layernorm_1d( + data_T data[CONFIG_T::n_in/CONFIG_T::seq_len], + res_T res[CONFIG_T::n_in/CONFIG_T::seq_len], + typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len], + typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len] +) +{ + +int inv_range_inv = (int) 1/0.01; +typename CONFIG_T::table_t deno_inver = 0; +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_invert_sqr_table(invert_sqr_table); + initialized = true; + } + + static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len; + data_T sum_cache = 0; + data_T sum_cache2 = 0; + data_T var, mean, diff; + data_T data_diff[dim]; + data_T data_norm[dim]; + + const data_T k_inv = 1.0/dim; + for (int i = 0; i < dim; ++i){ + sum_cache += data[i]; + } + mean = CONFIG_T::template product::product(sum_cache, k_inv); + // std::cout << "mean: " << std::endl; + // std::cout << mean << std::endl; + + for (int i = 0; i < dim; ++i){ + data_diff[i] = data[i] - mean; + diff = data_diff[i]*data_diff[i]; + sum_cache2 += diff; + // std::cout << "data_diff: " << std::endl; + // std::cout << data_diff[i] << std::endl; + // std::cout << " " << std::endl; + } + var = CONFIG_T::template product::product(sum_cache2, k_inv); + // std::cout << "var: 
" << std::endl; + // std::cout << var << std::endl; + // std::cout << " " << std::endl; + + int index = var*(CONFIG_T::table_size)*inv_range_inv; + if (index < 0) index = 0; + if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; + deno_inver = (typename CONFIG_T::table_t) invert_sqr_table[index]; + // std::cout << "deno_inver: " << std::endl; + // std::cout << deno_inver << std::endl; + // std::cout << " " << std::endl; + + + for (int i = 0; i < dim; ++i){ + res[i] = data_diff[i] * deno_inver * scale[i] + bias[i]; + } + +} + + +template +void layernormalize( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_in], + typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias] +) +{ + data_T cache; + static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=scale,bias + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + #pragma HLS ARRAY_PARTITION variable=scale complete + #pragma HLS ARRAY_PARTITION variable=bias complete + + for (int j=0; j (data+(dim*j), res+(dim*j), scale, bias); + } + + +} + +} + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index c00124fd7e..706aec0929 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -45,11 +45,12 @@ void matrixmul_transpose( data_T QKij, QK_1; data_T Qi[CONFIG_T::head_dim_key]; data_T Product[CONFIG_T::seq_len]; + data_T qk_smout[CONFIG_T::seq_len]; #pragma HLS ARRAY_RESHAPE variable=K cyclic factor=4 dim=1 #pragma HLS ARRAY_RESHAPE variable=K complete dim=2 #pragma HLS ARRAY_RESHAPE variable=Qi complete dim=0 #pragma HLS ARRAY_PARTITION variable=Product complete - +#pragma HLS ARRAY_PARTITION variable=qk_smout complete #pragma HLS ARRAY_PARTITION variable=QK complete dim=2 // for each row and column of AB @@ -59,8 +60,8 @@ void matrixmul_transpose( Qi[q_i]=Q[q_i].read(); } col: for(int j = 0; j < CONFIG_T::seq_len; ++j) { -#pragma HLS PIPELINE -#pragma HLS UNROLL factor=8 + #pragma HLS PIPELINE + #pragma HLS UNROLL factor=8 // compute (QK)i,j QKij = 0; product: for(int k = 0; k < CONFIG_T::head_dim_key; ++k) { @@ -69,15 +70,19 @@ void matrixmul_transpose( } Product[j] = QKij * dk; } - softmax(Product, QK[i]); + softmax(Product, qk_smout); + for(int n = 0; n < CONFIG_T::seq_len; ++n) { + #pragma HLS UNROLL + QK[i][n]=qk_smout[n]; + } } } template void matrixmul( - data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], - data_T V[CONFIG_T::seq_len][CONFIG_T::head_dim_value], + data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], + data_T V[CONFIG_T::seq_len][CONFIG_T::head_dim_value], hls::stream S[CONFIG_T::head_dim_value]) // S: attention score { // data_T Product[CONFIG_T::head_dim_value]; @@ -104,38 +109,111 @@ void matrixmul( template void dense_value( - data_T data_v[CONFIG_T::seq_len * CONFIG_T::feature_dim], - data_T v_proj [CONFIG_T::seq_len][CONFIG_T::head_dim_value], + hls::stream data_v[CONFIG_T::feature_dim], + hls::stream v_proj[CONFIG_T::head_dim_value], typename CONFIG_T::weight_t 
value_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value], typename CONFIG_T::bias_t value_bias[CONFIG_T::head_dim_value]) { - data_T v_row[CONFIG_T::head_dim_value]; - #pragma HLS ARRAY_PARTITION variable=v_row complete dim=0 - #pragma HLS ARRAY_RESHAPE variable=v_proj complete dim=1 -// #pragma HLS function_instantiate variable=value_weight,value_bias +#pragma HLS ARRAY_PARTITION variable=data_v complete dim=1 +#pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 + v_h: for (int j=0; j (data_v+(CONFIG_T::feature_dim*j), v_row, value_weight, value_bias); + #pragma HLS DATAFLOW + data_T v_row[CONFIG_T::head_dim_value]; + data_T dense_in[CONFIG_T::feature_dim]; + #pragma HLS ARRAY_PARTITION variable=v_row complete dim=1 + #pragma HLS ARRAY_PARTITION variable=dense_in complete dim=1 +for (int k=0; k(dense_in, v_row, value_weight, value_bias); for (int k=0; k +//void value_prep( +// hls::stream v_proj[CONFIG_T::head_dim_value], +// data_T V[CONFIG_T::seq_len][CONFIG_T::head_dim_value]) +//{ +// data_T col[CONFIG_T::head_dim_value][CONFIG_T::seq_len]; +// #pragma HLS ARRAY_PARTITION variable=col complete dim=1 +// #pragma HLS ARRAY_PARTITION variable=col complete dim=2 +// +// for (int i=0; i +void value_prep( + hls::stream v_proj[CONFIG_T::head_dim_value], + data_T V[CONFIG_T::seq_len][CONFIG_T::head_dim_value]) +{ +// data_T col[CONFIG_T::head_dim_value][CONFIG_T::seq_len]; +// #pragma HLS ARRAY_PARTITION variable=col complete dim=1 +// #pragma HLS ARRAY_PARTITION variable=col complete dim=2 + +// for (int i=0; i void dense_query( - data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], + hls::stream data_q[CONFIG_T::feature_dim], hls::stream q_proj[CONFIG_T::head_dim_key], typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], typename CONFIG_T::bias_t query_bias[CONFIG_T::head_dim_key]) { -// #pragma HLS function_instantiate variable=query_weight,query_bias - data_T proj[CONFIG_T::head_dim_key]; - #pragma HLS ARRAY_PARTITION variable=proj complete dim=0 - #pragma HLS ARRAY_PARTITION variable=q_proj complete dim=0 +#pragma HLS ARRAY_PARTITION variable=data_q complete dim=1 +#pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 + q_h: for (int j=0; j (data_q +(CONFIG_T::feature_dim*j), proj, query_weight, query_bias); + #pragma HLS DATAFLOW + data_T proj[CONFIG_T::head_dim_key]; + data_T dense_in [CONFIG_T::feature_dim]; + #pragma HLS ARRAY_PARTITION variable=proj complete dim=1 + #pragma HLS ARRAY_PARTITION variable=dense_in complete dim=1 + + for (int k=0; k(dense_in, proj, query_weight, query_bias); update_proj: for (int i=0; i void dense_key( - data_T data_k[CONFIG_T::seq_len * CONFIG_T::feature_dim], + hls::stream data_k[CONFIG_T::feature_dim], data_T k_proj[CONFIG_T::seq_len][CONFIG_T::head_dim_key], typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_key]) { - data_T k_row[CONFIG_T::head_dim_key]; -#pragma HLS ARRAY_PARTITION variable=k_row complete dim=0 +#pragma HLS ARRAY_PARTITION variable=data_k complete dim=1 +#pragma HLS ARRAY_RESHAPE variable=k_proj complete dim=2 // #pragma HLS function_instantiate variable=key_weight,key_bias k_h: for (int j=0; j (data_k+(CONFIG_T::feature_dim*j), k_row, key_weight, key_bias); + #pragma HLS DATAFLOW + data_T k_row[CONFIG_T::head_dim_key]; + data_T dense_in[CONFIG_T::feature_dim]; + #pragma HLS ARRAY_PARTITION variable=k_row complete dim=1 + #pragma HLS ARRAY_PARTITION variable=dense_in complete dim=1 
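+        // One time step per iteration: drain a feature vector from the input
+        // stream into dense_in, project it with the shared key weights
+        // (k_row = dense_in * key_weight + key_bias), then store the
+        // head_dim_key results into row j of k_proj for the QK^T stage.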
+ for (int k=0; k(dense_in, k_row, key_weight, key_bias); //k_proj[j] = k_row; for (int k=0; k +void data_prep( + data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], + data_T data_vk[CONFIG_T::seq_len * CONFIG_T::feature_dim], + hls::stream d_value[CONFIG_T::feature_dim], + hls::stream d_query[CONFIG_T::feature_dim], + hls::stream d_key[CONFIG_T::feature_dim]) +{ +#pragma HLS ARRAY_PARTITION variable=d_value complete dim=1 +#pragma HLS ARRAY_PARTITION variable=d_query complete dim=1 +#pragma HLS ARRAY_PARTITION variable=d_key complete dim=1 + + for (int j=0; j void multiheadattention( @@ -202,47 +312,60 @@ void multiheadattention( typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_value], typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { - + hls::stream d_value[CONFIG_T::num_heads][CONFIG_T::feature_dim]; + hls::stream d_query[CONFIG_T::num_heads][CONFIG_T::feature_dim]; + hls::stream d_key[CONFIG_T::num_heads][CONFIG_T::feature_dim]; hls::stream q_proj[CONFIG_T::num_heads][CONFIG_T::head_dim_key]; - data_T v_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_value]; + hls::stream v_proj[CONFIG_T::num_heads][CONFIG_T::head_dim_value]; + data_T v_reshape[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_value]; data_T k_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; -// data_T mat_res_con[CONFIG_T::num_heads*CONFIG_T::head_dim_value]; -// data_T dense_in[CONFIG_T::seq_len][CONFIG_T::num_heads * CONFIG_T::head_dim_value]; hls::stream matr_out[CONFIG_T::num_heads][CONFIG_T::head_dim_value]; #pragma HLS DATAFLOW #pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 #pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 #pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 +#pragma HLS ARRAY_PARTITION variable=v_reshape complete dim=1 #pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 #pragma HLS ARRAY_PARTITION variable=matr_out complete dim=1 // std::cout << "input to MHA: " << std::endl; // nnet::print_result(data_q, std::cout); // std::cout << " " << std::endl; + for (int i=0;i(data_q, data_vk, d_value[i], d_query[i], d_key[i]); + } + + // linear projection for (int i=0;i(data_vk, v_proj[i], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); + dense_value(d_value[i], v_proj[i], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); } for (int i=0;i(data_q, q_proj[i], query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); + dense_query(d_query[i], q_proj[i], query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); } for (int i=0;i(data_vk, k_proj[i], key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), key_bias+(CONFIG_T::head_dim_key*i)); + dense_key(d_key[i], k_proj[i], key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), key_bias+(CONFIG_T::head_dim_key*i)); } - nnet::matrixmul_transpose(q_proj[0], k_proj[0], qk_mul[0]); - nnet::matrixmul_transpose(q_proj[1], k_proj[1], qk_mul[1]); - nnet::matrixmul_transpose(q_proj[2], k_proj[2], qk_mul[2]); + value_reshape: for (int i=0; i < CONFIG_T::num_heads; ++i){ +#pragma HLS UNROLL + nnet::value_prep(v_proj[i], v_reshape[i]); + } + maxtrixmul1: for (int i=0; i < CONFIG_T::num_heads; ++i){ +#pragma 
HLS UNROLL + nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); + } maxtrixmul2: for (int i=0; i < CONFIG_T::num_heads; ++i){ #pragma HLS UNROLL - nnet::matrixmul(qk_mul[i], v_proj[i], matr_out[i]);//stream + nnet::matrixmul(qk_mul[i], v_reshape[i], matr_out[i]);//stream } dense_out(matr_out, res, attention_output_weight, attention_output_bias); @@ -252,3 +375,8 @@ void multiheadattention( } #endif + + + + + From 97e71e931ab41aa0031a81901113e22608bf592b Mon Sep 17 00:00:00 2001 From: Ethan Date: Sun, 26 Feb 2023 18:43:12 -0800 Subject: [PATCH 25/55] MHA updated --- .../vivado/passes/resource_strategy.py | 9 +- .../vivado/passes/transformer_templates.py | 2 + hls4ml/backends/vivado/vivado_backend.py | 2 +- .../vivado/nnet_utils/nnet_dense_seq.h | 19 +- .../vivado/nnet_utils/nnet_layernorm.h | 52 +++-- .../nnet_utils/nnet_multiheadattention.h | 216 +++++++++--------- 6 files changed, 158 insertions(+), 142 deletions(-) diff --git a/hls4ml/backends/vivado/passes/resource_strategy.py b/hls4ml/backends/vivado/passes/resource_strategy.py index b1c28bcfbc..daba61a6b0 100644 --- a/hls4ml/backends/vivado/passes/resource_strategy.py +++ b/hls4ml/backends/vivado/passes/resource_strategy.py @@ -41,10 +41,11 @@ def transform(self, model, node): node.weights['weight'].data = np.transpose(node.weights['weight'].data) node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) elif isinstance(node, (MultiHeadAttention)): - node.weights['key_weight'].data = np.transpose(node.weights['key_weight'].data, axes=[0, 2, 1]) - node.weights['query_weight'].data = np.transpose(node.weights['query_weight'].data, axes=[0, 2, 1]) - node.weights['value_weight'].data = np.transpose(node.weights['value_weight'].data, axes=[0, 2, 1]) - node.weights['attention_output_weight'].data = np.transpose(node.weights['attention_output_weight'].data, axes=[2, 0, 1]) + # node.weights['key_weight'].data = np.transpose(node.weights['key_weight'].data, axes=[0, 2, 1]) + # node.weights['query_weight'].data = np.transpose(node.weights['query_weight'].data, axes=[0, 2, 1]) + # node.weights['value_weight'].data = np.transpose(node.weights['value_weight'].data, axes=[0, 2, 1]) + # node.weights['attention_output_weight'].data = np.transpose(node.weights['attention_output_weight'].data, axes=[2, 0, 1]) + print("not transpose") else: raise Exception(f'Unexpected layer {node.class_name} with resource strategy') diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py index 8881a496c6..96aad94625 100644 --- a/hls4ml/backends/vivado/passes/transformer_templates.py +++ b/hls4ml/backends/vivado/passes/transformer_templates.py @@ -80,6 +80,7 @@ def format(self, node): mha_config = self.template.format(**params) mult_params1 = self._default_config_params(node) + mult_params1['strategy'] = 'latency' mult_params1['mNum'] = '1' mult_params1['n_in'] = node.get_attr('feature_dim') mult_params1['n_out'] = node.get_attr('head_dim_key') @@ -92,6 +93,7 @@ def format(self, node): mult_config1 = self.mult1_template.format(**mult_params1) mult_params2 = self._default_config_params(node) + mult_params2['strategy'] = 'latency' mult_params2['mNum'] = '2' mult_params2['n_in'] = node.get_attr('head_dim_value') * node.get_attr('num_heads') mult_params2['n_out'] = node.get_attr('feature_dim') diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index d2cfd79bfc..5337d7268b 100644 --- 
a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -516,7 +516,7 @@ def init_layernormalization(self, layer):
         if 'table_t' not in layer.attributes:
             layer.set_attr('table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=32, integer=5)))
         if 'table_size' not in layer.attributes:
-            layer.set_attr('table_size', 2048)
+            layer.set_attr('table_size', 2048)  # default inverse-sqrt lookup table size
 
     @layer_optimizer(Embedding)
     def init_embed(self, layer):
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h
index 7dc21f5d4c..e791276326 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h
@@ -17,16 +17,29 @@ void dense_seq(
     typename CONFIG_T::bias_t biases[CONFIG_T::n_out])
 {
     #pragma HLS inline
+
+    data_T in_val[CONFIG_T::n_in];
+    #pragma HLS ARRAY_PARTITION variable=in_val complete
+
     if (CONFIG_T::strategy == nnet::latency) {
         for (int j = 0; j < CONFIG_T::seq_len; ++j){
-            dense_latency<data_T, res_T, CONFIG_T>(data+(CONFIG_T::n_in*j), res+(CONFIG_T::n_out*j), weights, biases);
+            #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+            for (int i = 0; i < CONFIG_T::n_in; ++i){
+                #pragma HLS UNROLL
+                in_val[i] = data[j*CONFIG_T::n_in+i];
+            }
+            dense_latency<data_T, res_T, CONFIG_T>(in_val, res+(CONFIG_T::n_out*j), weights, biases);
         }
     } else {
         for (int j = 0; j < CONFIG_T::seq_len; ++j){
-            dense_resource<data_T, res_T, CONFIG_T>(data+(CONFIG_T::n_in*j), res+(CONFIG_T::n_out*j), weights, biases);
+            #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+            for (int i = 0; i < CONFIG_T::n_in; ++i){
+                #pragma HLS UNROLL
+                in_val[i] = data[j*CONFIG_T::n_in+i];
+            }
+            dense_resource<data_T, res_T, CONFIG_T>(in_val, res+(CONFIG_T::n_out*j), weights, biases);
         }
     }
-
 }
 }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h
index d49053d083..a5fb265215 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h
@@ -17,8 +17,8 @@
 // along with this program. If not, see <http://www.gnu.org/licenses/>.
 //
 
-#ifndef NNET_BATCHNORM_H_
-#define NNET_BATCHNORM_H_
+#ifndef NNET_LAYERNORM_H_
+#define NNET_LAYERNORM_H_
 
 #include "nnet_common.h"
 #include "nnet_dense.h"
@@ -51,11 +51,11 @@ struct layernorm_config
 template<typename CONFIG_T, int N_TABLE>
 void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE])
 {
-    float inv_range = 0.01;
+    float inv_range = 0.5; /// if not accurate enough, increase this
     // Inversion function:
     //   result = 1/sqrt(x)
     for (int ii = 0; ii < N_TABLE; ii++) {
-        // First, convert from table index to X-value (signed 8-bit, range 0 to +2)
+        // First, convert from table index to an X-value in [0, inv_range)
         float in_val = inv_range*ii/float(N_TABLE);
         // Next, compute lookup table function
         if (in_val > 0.0) table_out[ii] = 1.0/sqrt(in_val);
@@ -70,9 +70,12 @@ void layernorm_1d(
     typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len],
     typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len]
 )
-{
+{
+#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+#pragma HLS ARRAY_PARTITION variable=data complete
+#pragma HLS ARRAY_PARTITION variable=res complete
 
-int inv_range_inv = (int) 1/0.01;
+int inv_range_inv = (int) 1/0.5; /// if not accurate enough, increase this
 typename CONFIG_T::table_t deno_inver = 0;
 #ifdef __HLS_SYN__
     bool initialized = false;
@@ -92,6 +95,9 @@ typename CONFIG_T::table_t deno_inver = 0;
     data_T var, mean, diff;
     data_T data_diff[dim];
     data_T data_norm[dim];
+
+    #pragma HLS ARRAY_PARTITION variable=data_diff complete
+    #pragma HLS ARRAY_PARTITION variable=data_norm complete
 
     const data_T k_inv = 1.0/dim;
     for (int i = 0; i < dim; ++i){
@@ -129,32 +135,38 @@ typename CONFIG_T::table_t deno_inver = 0;
 
 }
 
-
 template<class data_T, class res_T, typename CONFIG_T>
 void layernormalize(
     data_T data[CONFIG_T::n_in],
     res_T res[CONFIG_T::n_in],
-    typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
-    typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]
+    typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len],
+    typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len]
 )
 {
-    data_T cache;
     static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len;
-
+    data_T in_val[dim];
     // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
     #pragma HLS function_instantiate variable=scale,bias
-
-    // For parallel inputs:
-    //   - completely partition arrays -- target fabric
-    //   - if we have an unroll factor, limit number of multipliers
-    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-
     // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes
     #pragma HLS ARRAY_PARTITION variable=scale complete
     #pragma HLS ARRAY_PARTITION variable=bias complete
+    #pragma HLS ARRAY_PARTITION variable=in_val complete
 
-    for (int j=0; j < CONFIG_T::seq_len; ++j){
-        layernorm_1d<data_T, res_T, CONFIG_T>(data+(dim*j), res+(dim*j), scale, bias);
-    }
+    if (dim == 1) {
+        for (int j=0; j (in_val, res+(dim*j), scale, bias);
+        }
+    }
 }
 
 }
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
index 706aec0929..ac27550b10 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
@@ -38,7 +38,7 @@ struct multiheadattention_config
 template<class data_T, class res_T, typename CONFIG_T>
 void matrixmul_transpose(
     hls::stream<data_T> Q[CONFIG_T::head_dim_key],
-    data_T K[CONFIG_T::seq_len][CONFIG_T::head_dim_key],
+    hls::stream<data_T> K[CONFIG_T::head_dim_key],
     res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K
 {
     const data_T dk = 1.0/sqrt(CONFIG_T::head_dim_key);
@@ -46,26 +46,34 @@ void matrixmul_transpose(
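+    // Per-head attention scores, computed in streaming fashion:
+    //   QK[i][j] = softmax_j( (Q_i . K_j) / sqrt(head_dim_key) )
+    // K is drained once into the krow buffer below; each Q row is then read
+    // from its stream and matched against every buffered key row.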
data_T Qi[CONFIG_T::head_dim_key]; data_T Product[CONFIG_T::seq_len]; data_T qk_smout[CONFIG_T::seq_len]; -#pragma HLS ARRAY_RESHAPE variable=K cyclic factor=4 dim=1 -#pragma HLS ARRAY_RESHAPE variable=K complete dim=2 -#pragma HLS ARRAY_RESHAPE variable=Qi complete dim=0 -#pragma HLS ARRAY_PARTITION variable=Product complete -#pragma HLS ARRAY_PARTITION variable=qk_smout complete -#pragma HLS ARRAY_PARTITION variable=QK complete dim=2 + data_T krow[CONFIG_T::seq_len * CONFIG_T::head_dim_key]; + #pragma HLS ARRAY_PARTITION variable=K complete dim=1 ///// + #pragma HLS ARRAY_PARTITION variable=Qi complete + #pragma HLS ARRAY_PARTITION variable=Product complete + #pragma HLS ARRAY_PARTITION variable=qk_smout complete + #pragma HLS ARRAY_PARTITION variable=QK complete dim=2 + #pragma HLS ARRAY_PARTITION variable=krow complete + + prep_k: for(int i = 0; i < CONFIG_T::seq_len; ++i) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + for(int j = 0; j < CONFIG_T::head_dim_key; ++j) { + #pragma HLS UNROLL + krow[i*CONFIG_T::head_dim_key + j] = K[j].read(); + } + } // for each row and column of AB row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor q: for(int q_i = 0; q_i < CONFIG_T::head_dim_key; ++q_i){ #pragma HLS UNROLL Qi[q_i]=Q[q_i].read(); } col: for(int j = 0; j < CONFIG_T::seq_len; ++j) { - #pragma HLS PIPELINE - #pragma HLS UNROLL factor=8 // compute (QK)i,j QKij = 0; product: for(int k = 0; k < CONFIG_T::head_dim_key; ++k) { - QK_1 = Qi[k] * K[j][k]; + QK_1 = Qi[k] * krow[j*CONFIG_T::head_dim_key + k]; QKij += QK_1; } Product[j] = QKij * dk; @@ -78,29 +86,43 @@ void matrixmul_transpose( } } - +///////// template void matrixmul( data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], - data_T V[CONFIG_T::seq_len][CONFIG_T::head_dim_value], - hls::stream S[CONFIG_T::head_dim_value]) // S: attention score + hls::stream V[CONFIG_T::head_dim_value], + hls::stream S[CONFIG_T::head_dim_value]) // S: attention score { -// data_T Product[CONFIG_T::head_dim_value]; #pragma HLS ARRAY_Partition variable=QK complete dim=2 #pragma HLS ARRAY_Partition variable=S complete dim=1 - #pragma HLS ARRAY_RESHAPE variable=V complete dim=1 -// #pragma HLS ARRAY_PARTITION variable=Product complete -// #pragma HLS ARRAY_Partition variable=V cyclic factor=2 dim=2 // + #pragma HLS ARRAY_Partition variable=V complete dim=1 + + data_T dataV[CONFIG_T::seq_len*CONFIG_T::head_dim_value]; + # pragma HLS ARRAY_PARTITION variable=dataV complete dim=1 + for (int j=0; j //void value_prep( // hls::stream v_proj[CONFIG_T::head_dim_value], -// data_T V[CONFIG_T::seq_len][CONFIG_T::head_dim_value]) +// data_T V[CONFIG_T::head_dim_value * CONFIG_T::seq_len]) //{ -// data_T col[CONFIG_T::head_dim_value][CONFIG_T::seq_len]; -// #pragma HLS ARRAY_PARTITION variable=col complete dim=1 -// #pragma HLS ARRAY_PARTITION variable=col complete dim=2 -// -// for (int i=0; i -void value_prep( - hls::stream v_proj[CONFIG_T::head_dim_value], - data_T V[CONFIG_T::seq_len][CONFIG_T::head_dim_value]) -{ -// data_T col[CONFIG_T::head_dim_value][CONFIG_T::seq_len]; -// #pragma HLS ARRAY_PARTITION variable=col complete dim=1 -// #pragma HLS ARRAY_PARTITION variable=col complete dim=2 - -// for (int i=0; i void dense_query( @@ -199,8 +190,8 @@ void dense_query( typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], typename CONFIG_T::bias_t query_bias[CONFIG_T::head_dim_key]) { -#pragma HLS ARRAY_PARTITION variable=data_q complete dim=1 -#pragma HLS ARRAY_PARTITION 
variable=q_proj complete dim=1 + #pragma HLS ARRAY_PARTITION variable=data_q complete dim=1 + #pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 q_h: for (int j=0; j (dense_in, proj, query_weight, query_bias); update_proj: for (int i=0; i void dense_key( hls::stream data_k[CONFIG_T::feature_dim], - data_T k_proj[CONFIG_T::seq_len][CONFIG_T::head_dim_key], + hls::stream k_proj[CONFIG_T::head_dim_key], typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_key]) { -#pragma HLS ARRAY_PARTITION variable=data_k complete dim=1 -#pragma HLS ARRAY_RESHAPE variable=k_proj complete dim=2 -// #pragma HLS function_instantiate variable=key_weight,key_bias + #pragma HLS ARRAY_PARTITION variable=data_k complete dim=1 + #pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 + k_h: for (int j=0; j (dense_in, k_row, key_weight, key_bias); - //k_proj[j] = k_row; - for (int k=0; k (dense_in, proj, key_weight, key_bias); + update_proj: for (int i=0; i d_query[CONFIG_T::feature_dim], hls::stream d_key[CONFIG_T::feature_dim]) { -#pragma HLS ARRAY_PARTITION variable=d_value complete dim=1 -#pragma HLS ARRAY_PARTITION variable=d_query complete dim=1 -#pragma HLS ARRAY_PARTITION variable=d_key complete dim=1 + #pragma HLS ARRAY_PARTITION variable=d_value complete dim=1 + #pragma HLS ARRAY_PARTITION variable=d_query complete dim=1 + #pragma HLS ARRAY_PARTITION variable=d_key complete dim=1 for (int j=0; j d_query[CONFIG_T::num_heads][CONFIG_T::feature_dim]; hls::stream d_key[CONFIG_T::num_heads][CONFIG_T::feature_dim]; hls::stream q_proj[CONFIG_T::num_heads][CONFIG_T::head_dim_key]; + hls::stream k_proj[CONFIG_T::num_heads][CONFIG_T::head_dim_key]; hls::stream v_proj[CONFIG_T::num_heads][CONFIG_T::head_dim_value]; - data_T v_reshape[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_value]; - data_T k_proj[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::head_dim_key]; +// data_T v_reshape[CONFIG_T::num_heads][CONFIG_T::head_dim_value][CONFIG_T::seq_len]; data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; hls::stream matr_out[CONFIG_T::num_heads][CONFIG_T::head_dim_value]; -#pragma HLS DATAFLOW -#pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 -#pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 -#pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 -#pragma HLS ARRAY_PARTITION variable=v_reshape complete dim=1 -#pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 -#pragma HLS ARRAY_PARTITION variable=matr_out complete dim=1 + #pragma HLS DATAFLOW + #pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 + #pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 + #pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 +// #pragma HLS ARRAY_PARTITION variable=v_reshape complete dim=1 + #pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 + #pragma HLS ARRAY_PARTITION variable=matr_out complete dim=1 // std::cout << "input to MHA: " << std::endl; // nnet::print_result(data_q, std::cout); // std::cout << " " << std::endl; - for (int i=0;i(data_q, data_vk, d_value[i], d_query[i], d_key[i]); } // linear projection - for (int i=0;i(d_value[i], v_proj[i], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); } - for (int i=0;i(d_query[i], q_proj[i], query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); } - for (int i=0;i(d_key[i], k_proj[i], 
key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), key_bias+(CONFIG_T::head_dim_key*i)); } - value_reshape: for (int i=0; i < CONFIG_T::num_heads; ++i){ -#pragma HLS UNROLL - nnet::value_prep(v_proj[i], v_reshape[i]); - } +// value_reshape: for (int i=0; i < CONFIG_T::num_heads; ++i){ +// #pragma HLS UNROLL +// nnet::value_prep(v_proj[i], v_reshape[i]); +// } maxtrixmul1: for (int i=0; i < CONFIG_T::num_heads; ++i){ -#pragma HLS UNROLL + #pragma HLS UNROLL nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); } maxtrixmul2: for (int i=0; i < CONFIG_T::num_heads; ++i){ -#pragma HLS UNROLL - nnet::matrixmul(qk_mul[i], v_reshape[i], matr_out[i]);//stream + #pragma HLS UNROLL + nnet::matrixmul(qk_mul[i], v_proj[i], matr_out[i]);//stream } dense_out(matr_out, res, attention_output_weight, attention_output_bias); @@ -376,7 +368,3 @@ void multiheadattention( #endif - - - - From 5ed4a766f50fbeded3b0cb071e793f6a59b7f7fc Mon Sep 17 00:00:00 2001 From: Ethan Date: Mon, 3 Apr 2023 21:00:03 -0700 Subject: [PATCH 26/55] LayerNorm_bug_fix --- .../backends/vivado/passes/core_templates.py | 1 + hls4ml/backends/vivado/vivado_backend.py | 3 + hls4ml/templates/vivado/#vivado_synth.tcl# | 6 + .../vivado/nnet_utils/nnet_layernorm.h | 178 ++++++++++- .../nnet_utils/nnet_multiheadattention.h | 297 ++++++++---------- hls4ml/templates/vivado/vivado_synth.tcl | 2 +- 6 files changed, 309 insertions(+), 178 deletions(-) create mode 100644 hls4ml/templates/vivado/#vivado_synth.tcl# diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index 772492f68b..f0090dcb2a 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -113,6 +113,7 @@ def format(self, node): static const unsigned n_in = {n_in}; static const unsigned seq_len = {seq_len}; static const unsigned table_size = {table_size}; + static constexpr double table_range = {table_range}; static const unsigned io_type = nnet::{iotype}; static const unsigned reuse_factor = {reuse}; static const bool store_weights_in_bram = false; diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 5337d7268b..dffc3f77f5 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -517,6 +517,9 @@ def init_layernormalization(self, layer): layer.set_attr('table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=32, integer=5))) if 'table_size' not in layer.attributes: layer.set_attr('table_size', 2048) #table size + if 'table_range' not in layer.attributes: + layer.set_attr('table_range', 1.0) #table range + @layer_optimizer(Embedding) def init_embed(self, layer): diff --git a/hls4ml/templates/vivado/#vivado_synth.tcl# b/hls4ml/templates/vivado/#vivado_synth.tcl# new file mode 100644 index 0000000000..96bd21c672 --- /dev/null +++ b/hls4ml/templates/vivado/#vivado_synth.tcl# @@ -0,0 +1,6 @@ +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +add_files ${project_name}_prj/solution1/syn/verilog +synth_design -top ${project_name} -part $part +report_utilization -file vivado_synth.rpt \ No newline at end of file diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h index a5fb265215..f975486b68 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h @@ -26,6 +26,9 @@ #include 
#include +#include "hls_math.h" +// #include "ap_fixed.h" + namespace nnet { struct layernorm_config @@ -51,7 +54,7 @@ struct layernorm_config template void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) { - float inv_range = 0.5; /// if not acurrate increase this + float inv_range = CONFIG_T::table_range; // Inversion function: // result = 1/sqrt(x) for (int ii = 0; ii < N_TABLE; ii++) { @@ -63,6 +66,88 @@ void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) } } +template +void init_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) +{ + float inv_range = 0.5; /// if not acurrate increase this + // Inversion function: + // result = 1/sqrt(x) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range 0 to +0.01) + float in_val = inv_range*ii/float(N_TABLE); + // Next, compute lookup table function + if (in_val > 0.0) table_out[ii] = sqrt(in_val); + else table_out[ii] = 0.0; + } +} + + + +// template +// void layernorm_1d( +// data_T data[CONFIG_T::n_in/CONFIG_T::seq_len], +// res_T res[CONFIG_T::n_in/CONFIG_T::seq_len], +// typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len], +// typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len] +// ) +// { +// #pragma HLS PIPELINE II=CONFIG_T::reuse_factor +// #pragma HLS ARRAY_PARTITION variable=data complete +// #pragma HLS ARRAY_PARTITION variable=res complete + +// int inv_range_inv = (int) 1/ 0.5; +// typename CONFIG_T::table_t sqr = 0; +// #ifdef __HLS_SYN__ +// bool initialized = false; +// typename CONFIG_T::table_t sqr_table[CONFIG_T::table_size]; +// #else +// static bool initialized = false; +// static typename CONFIG_T::table_t sqr_table[CONFIG_T::table_size]; +// #endif +// if (!initialized) { +// init_sqr_table(sqr_table); +// initialized = true; +// } + +// static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len; +// data_T sum_cache = 0; +// data_T sum_cache2 = 0; +// data_T var, mean, diff, inv_sqr; +// data_T data_diff[dim]; +// data_T data_norm[dim]; + +// #pragma HLS ARRAY_PARTITION variable=data_diff complete +// #pragma HLS ARRAY_PARTITION variable=data_diff complete + +// const data_T k_inv = 1.0/dim; +// for (int i = 0; i < dim; ++i){ +// sum_cache += data[i]; +// } +// mean = CONFIG_T::template product::product(sum_cache, k_inv); + +// for (int i = 0; i < dim; ++i){ +// data_diff[i] = data[i] - mean; +// diff = data_diff[i]*data_diff[i]; +// sum_cache2 += diff; +// } +// var = CONFIG_T::template product::product(sum_cache2, k_inv); + +// int index = var*(CONFIG_T::table_size)*inv_range_inv; +// if (index < 0) index = 0; +// if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; +// sqr = (typename CONFIG_T::table_t) sqr_table[index]; +// inv_sqr = 1 / sqr; + + + +// for (int i = 0; i < dim; ++i){ +// res[i] = data_diff[i] * inv_sqr * scale[i] + bias[i]; +// } + +// } + + + template void layernorm_1d( data_T data[CONFIG_T::n_in/CONFIG_T::seq_len], @@ -75,7 +160,7 @@ void layernorm_1d( #pragma HLS ARRAY_PARTITION variable=data complete #pragma HLS ARRAY_PARTITION variable=res complete -int inv_range_inv = (int) 1/0.5; /// if not acurrate increase this +int inv_range_inv = (int) 1/ CONFIG_T::table_range; typename CONFIG_T::table_t deno_inver = 0; #ifdef __HLS_SYN__ bool initialized = false; @@ -121,6 +206,8 @@ typename CONFIG_T::table_t deno_inver = 0; // std::cout << " " << std::endl; int index = var*(CONFIG_T::table_size)*inv_range_inv; + if (CONFIG_T::table_range > 1) index = 
var*(CONFIG_T::table_size)/ (int)CONFIG_T::table_range; + if (index < 0) index = 0; if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; deno_inver = (typename CONFIG_T::table_t) invert_sqr_table[index]; @@ -135,6 +222,67 @@ typename CONFIG_T::table_t deno_inver = 0; } + +// template +// void layernorm_1d( +// data_T data[CONFIG_T::n_in/CONFIG_T::seq_len], +// res_T res[CONFIG_T::n_in/CONFIG_T::seq_len], +// typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len], +// typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len] +// ) +// { +// #pragma HLS PIPELINE +// #pragma HLS ARRAY_PARTITION variable=data complete +// #pragma HLS ARRAY_PARTITION variable=res complete + + +// static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len; +// data_T sum_cache = 0; +// data_T sum_cache2 = 0; +// data_T var, mean, diff_squares, diff, var_eps_inv; +// data_T data_diff[dim]; +// float sqrt_var_eps; + +// #pragma HLS ARRAY_PARTITION variable=data_diff complete + +// const data_T k_inv = 1.0/dim; +// for (int i = 0; i < dim; ++i){ +// sum_cache += data[i]; +// } +// mean = CONFIG_T::template product::product(sum_cache, k_inv); +// // std::cout << "mean: " << std::endl; +// // std::cout << mean << std::endl; + +// for (int i = 0; i < dim; ++i){ +// diff = data[i] - mean; +// data_diff[i] = diff; +// diff_squares = diff*diff; +// sum_cache2 += diff_squares; +// // std::cout << "data_diff: " << std::endl; +// // std::cout << data_diff[i] << std::endl; +// // std::cout << " " << std::endl; +// } +// var = CONFIG_T::template product::product(sum_cache2, k_inv); +// float var_f = (float)var; +// // std::cout << "var: "; +// // std::cout << var << std::endl; + +// sqrt_var_eps = sqrt(var_f); +// var_eps_inv = (data_T) (1 / (sqrt_var_eps)); +// // std::cout << "var_eps_inv: " << std::endl; +// // std::cout << var_eps_inv << std::endl; +// // std::cout << " " << std::endl; + + +// for (int i = 0; i < dim; ++i){ +// res[i] = data_diff[i] * var_eps_inv * scale[i] + bias[i]; +// } + +// } + + + + template void layernormalize( data_T data[CONFIG_T::n_in], @@ -145,6 +293,7 @@ void layernormalize( { static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len; data_T in_val[dim]; + data_T outval[dim]; // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases #pragma HLS function_instantiate variable=scale,bias @@ -152,24 +301,23 @@ void layernormalize( #pragma HLS ARRAY_PARTITION variable=scale complete #pragma HLS ARRAY_PARTITION variable=bias complete #pragma HLS ARRAY_PARTITION variable=in_val complete + #pragma HLS ARRAY_PARTITION variable=outval complete - if (dim == 1) { - for (int j=0; j (in_val, res+(dim*j), scale, bias); + layernorm_1d(in_val, outval, scale, bias); + store: for (int i=0; i < dim; ++i){ + #pragma HLS UNROLL + res[j*dim+i] = outval[i]; } } - - } } diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index ac27550b10..0ed0e0e0a4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -35,45 +35,75 @@ struct multiheadattention_config using product = nnet::product::mult; }; +template +struct datapack { + data_T data[PackSize]; +}; + + +template +void read_stream_array( + hls::stream data_in[size], + data_T out[size] +) +{ + for (int k=0; k void matrixmul_transpose( - hls::stream Q[CONFIG_T::head_dim_key], - hls::stream K[CONFIG_T::head_dim_key], + 
+    hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> &Q,
+    hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> &K,
     res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K
 {
     const data_T dk = 1.0/sqrt(CONFIG_T::head_dim_key);
     data_T QKij, QK_1;
     data_T Qi[CONFIG_T::head_dim_key];
-    data_T Product[CONFIG_T::seq_len];
+    data_T Product[CONFIG_T::seq_len]; // seq_Q, seq_K
     data_T qk_smout[CONFIG_T::seq_len];
     data_T krow[CONFIG_T::seq_len * CONFIG_T::head_dim_key];
-    #pragma HLS ARRAY_PARTITION variable=K complete dim=1 /////
     #pragma HLS ARRAY_PARTITION variable=Qi complete
     #pragma HLS ARRAY_PARTITION variable=Product complete
     #pragma HLS ARRAY_PARTITION variable=qk_smout complete
     #pragma HLS ARRAY_PARTITION variable=QK complete dim=2
     #pragma HLS ARRAY_PARTITION variable=krow complete
 
+    datapack<data_T, CONFIG_T::head_dim_key> datak_pack, dataq_pack;
+    #pragma HLS DATA_PACK variable=Q
+    #pragma HLS DATA_PACK variable=K
+    #pragma HLS DATA_PACK variable=datak_pack
+    #pragma HLS DATA_PACK variable=dataq_pack
+
+    int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_key) / float(CONFIG_T::reuse_factor));
+    CONFIG_T::template product<data_T, data_T>::limit(multiplier_limit);
+
 prep_k: for(int i = 0; i < CONFIG_T::seq_len; ++i) {
     #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+    datak_pack = K.read();
     for(int j = 0; j < CONFIG_T::head_dim_key; ++j) {
         #pragma HLS UNROLL
-        krow[i*CONFIG_T::head_dim_key + j] = K[j].read();
+        krow[i*CONFIG_T::head_dim_key + j] = datak_pack.data[j];
     }
 }
 
 // for each row and column of AB
 row: for(int i = 0; i < CONFIG_T::seq_len; ++i) {
     #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+    dataq_pack = Q.read();
+
 q: for(int q_i = 0; q_i < CONFIG_T::head_dim_key; ++q_i){
     #pragma HLS UNROLL
-    Qi[q_i]=Q[q_i].read();
+    Qi[q_i]=dataq_pack.data[q_i];
 }
 col: for(int j = 0; j < CONFIG_T::seq_len; ++j) {
     // compute (QK)i,j
     QKij = 0;
     product: for(int k = 0; k < CONFIG_T::head_dim_key; ++k) {
-        QK_1 = Qi[k] * krow[j*CONFIG_T::head_dim_key + k];
+        QK_1 = CONFIG_T::template product<data_T, data_T>::product(Qi[k], krow[j*CONFIG_T::head_dim_key + k]);
         QKij += QK_1;
     }
     Product[j] = QKij * dk;
@@ -90,20 +120,28 @@ void matrixmul_transpose(
 template<class data_T, class res_T, typename CONFIG_T>
 void matrixmul(
     data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len],
-    hls::stream<data_T> V[CONFIG_T::head_dim_value],
+    hls::stream<datapack<data_T, CONFIG_T::head_dim_value>> &V,
     hls::stream<res_T> S[CONFIG_T::head_dim_value]) // S: attention score
 {
-    #pragma HLS ARRAY_Partition variable=QK complete dim=2
-    #pragma HLS ARRAY_Partition variable=S complete dim=1
-    #pragma HLS ARRAY_Partition variable=V complete dim=1
+    #pragma HLS DATA_PACK variable=V
+    #pragma HLS ARRAY_PARTITION variable=QK complete dim=2
+    #pragma HLS ARRAY_PARTITION variable=S complete dim=1
+
+    datapack<data_T, CONFIG_T::head_dim_value> datav_pack;
+    #pragma HLS DATA_PACK variable=datav_pack
+
+    int multiplier_limit = ceil(float(CONFIG_T::seq_len*CONFIG_T::head_dim_value) / float(CONFIG_T::reuse_factor));
+    CONFIG_T::template product<data_T, data_T>::limit(multiplier_limit);
 
     data_T dataV[CONFIG_T::seq_len*CONFIG_T::head_dim_value];
     # pragma HLS ARRAY_PARTITION variable=dataV complete dim=1
+
+    for (int j=0; j < CONFIG_T::seq_len; ++j){
+        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+        datav_pack = V.read();
+        for (int i=0; i < CONFIG_T::head_dim_value; ++i){
+            #pragma HLS UNROLL
+            dataV[CONFIG_T::seq_len*i + j] = datav_pack.data[i];
+        }
+    }
 
     // for each row and column of AB
     data_T Sij, S_1;
     data_T QKi[CONFIG_T::seq_len];
     #pragma HLS ARRAY_PARTITION variable=QKi complete
     row: for(int i = 0; i < CONFIG_T::seq_len; ++i) {
         #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
         qk: for(int q_i = 0; q_i < CONFIG_T::seq_len; ++q_i){
             #pragma HLS UNROLL
             QKi[q_i] = QK[i][q_i];
         }
         col: for(int j = 0; j < CONFIG_T::head_dim_value; ++j) {
             Sij = 0;
             product: for(int k = 0; k < CONFIG_T::seq_len; ++k) {
-                S_1 = QKi[k] * dataV[j*CONFIG_T::seq_len + k];
+                S_1 = CONFIG_T::template product<data_T, data_T>::product(QKi[k], dataV[j*CONFIG_T::seq_len + k]);
                 Sij += S_1;
             }
             S[j].write(Sij);
@@ -129,115 +167,62 @@ void matrixmul(
         }
     }
 }
 
-template<class data_T, class res_T, typename CONFIG_T>
-void dense_value(
-    hls::stream<data_T> data_v[CONFIG_T::feature_dim],
-    hls::stream<data_T> v_proj[CONFIG_T::head_dim_value],
-    typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value],
-    typename CONFIG_T::bias_t value_bias[CONFIG_T::head_dim_value])
-{
-    #pragma HLS ARRAY_PARTITION variable=data_v complete dim=1
-    #pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1
-
-    v_h: for (int j=0; j < CONFIG_T::seq_len; ++j) {
-        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-        data_T dense_in[CONFIG_T::feature_dim];
-        data_T v_row[CONFIG_T::head_dim_value];
-        for (int k=0; k < CONFIG_T::feature_dim; ++k) {
-            #pragma HLS UNROLL
-            dense_in[k] = data_v[k].read();
-        }
-        dense<data_T, res_T, typename CONFIG_T::config_mult1>(dense_in, v_row, value_weight, value_bias);
-        for (int k=0; k < CONFIG_T::head_dim_value; ++k) {
-            #pragma HLS UNROLL
-            v_proj[k].write(v_row[k]);
-        }
-    }
-}
-
-// template<class data_T, typename CONFIG_T>
-// void value_prep(
-//     hls::stream<data_T> v_proj[CONFIG_T::head_dim_value],
-//     data_T V[CONFIG_T::head_dim_value * CONFIG_T::seq_len])
-// {
-//     data_T data[CONFIG_T::seq_len*CONFIG_T::head_dim_value];
-//     # pragma HLS ARRAY_PARTITION variable=V complete dim=1
-//     # pragma HLS ARRAY_PARTITION variable=data complete dim=1
-//     for (int j=0; j < CONFIG_T::seq_len; ++j) {
-//         for (int k=0; k < CONFIG_T::head_dim_value; ++k) {
-//             V[k*CONFIG_T::seq_len + j] = v_proj[k].read();
-//         }
-//     }
-// }
 
 template<class data_T, class res_T, typename CONFIG_T>
-void dense_query(
+void lin_projection(
     hls::stream<data_T> data_q[CONFIG_T::feature_dim],
-    hls::stream<data_T> q_proj[CONFIG_T::head_dim_key],
-    typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key],
-    typename CONFIG_T::bias_t query_bias[CONFIG_T::head_dim_key])
-{
-    #pragma HLS ARRAY_PARTITION variable=data_q complete dim=1
-    #pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1
-
-    q_h: for (int j=0; j < CONFIG_T::seq_len; ++j) {
-        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-        data_T dense_in[CONFIG_T::feature_dim];
-        data_T proj[CONFIG_T::head_dim_key];
-        for (int k=0; k < CONFIG_T::feature_dim; ++k) {
-            #pragma HLS UNROLL
-            dense_in[k] = data_q[k].read();
-        }
-        dense<data_T, res_T, typename CONFIG_T::config_mult1>(dense_in, proj, query_weight, query_bias);
-        update_proj: for (int i=0; i < CONFIG_T::head_dim_key; ++i) {
-            #pragma HLS UNROLL
-            q_proj[i].write(proj[i]);
-        }
-    }
-}
-
-template<class data_T, class res_T, typename CONFIG_T>
-void dense_key(
-    hls::stream<data_T> data_k[CONFIG_T::feature_dim],
-    hls::stream<data_T> k_proj[CONFIG_T::head_dim_key],
+    hls::stream<data_T> data_vk[CONFIG_T::feature_dim],
+    hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> &k_proj,
+    hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> &q_proj,
+    hls::stream<datapack<data_T, CONFIG_T::head_dim_value>> &v_proj,
     typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key],
-    typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_key])
+    typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_key],
+    typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key],
+    typename CONFIG_T::bias_t query_bias[CONFIG_T::head_dim_key],
+    typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value],
+    typename CONFIG_T::bias_t value_bias[CONFIG_T::head_dim_value]
+    )
 {
-    #pragma HLS ARRAY_PARTITION variable=data_k complete dim=1
-    #pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1
+    #pragma HLS DATA_PACK variable=k_proj
+    #pragma HLS DATA_PACK variable=q_proj
+    #pragma HLS DATA_PACK variable=v_proj
 
     k_h: for (int j=0; j < CONFIG_T::seq_len; ++j) {
         #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-        data_T dense_in[CONFIG_T::feature_dim];
-        data_T proj[CONFIG_T::head_dim_key];
-        for (int k=0; k < CONFIG_T::feature_dim; ++k) {
-            #pragma HLS UNROLL
-            dense_in[k] = data_k[k].read();
-        }
-        dense<data_T, res_T, typename CONFIG_T::config_mult1>(dense_in, proj, key_weight, key_bias);
-        update_proj: for (int i=0; i < CONFIG_T::head_dim_key; ++i) {
-            #pragma HLS UNROLL
-            k_proj[i].write(proj[i]);
-        }
+        data_T in_q[CONFIG_T::feature_dim];
+        data_T in_v[CONFIG_T::feature_dim];
+        #pragma HLS ARRAY_PARTITION variable=in_q complete dim=1
+        #pragma HLS ARRAY_PARTITION variable=in_v complete dim=1
+
+        datapack<data_T, CONFIG_T::head_dim_key> proj_k_pack;
+        datapack<data_T, CONFIG_T::head_dim_key> proj_q_pack;
+        datapack<data_T, CONFIG_T::head_dim_value> proj_v_pack;
+        #pragma HLS DATA_PACK variable=proj_k_pack
+        #pragma HLS DATA_PACK variable=proj_q_pack
+        #pragma HLS DATA_PACK variable=proj_v_pack
+
+        read_stream_array<data_T, CONFIG_T::feature_dim>(data_q, in_q);
+        read_stream_array<data_T, CONFIG_T::feature_dim>(data_vk, in_v);
+
+        dense<data_T, res_T, typename CONFIG_T::config_mult1>(in_v, proj_k_pack.data, key_weight, key_bias);
+        dense<data_T, res_T, typename CONFIG_T::config_mult1>(in_q, proj_q_pack.data, query_weight, query_bias);
+        dense<data_T, res_T, typename CONFIG_T::config_mult1>(in_v, proj_v_pack.data, value_weight, value_bias);
+
+        k_proj.write(proj_k_pack);
+        q_proj.write(proj_q_pack);
+        v_proj.write(proj_v_pack);
     }
 }
 
@@ -250,10 +235,12 @@ void dense_out(
     typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim])
 {
     data_T mat_res_con[CONFIG_T::num_heads*CONFIG_T::head_dim_value];
+    res_T dense_out[CONFIG_T::feature_dim];
     #pragma HLS ARRAY_PARTITION variable=mat_res_con complete dim=1
+    #pragma HLS ARRAY_PARTITION variable=dense_out complete dim=1
 
     output_dense: for (int k=0; k < CONFIG_T::seq_len; ++k) {
         #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
         for (int i=0; i < CONFIG_T::num_heads; ++i) {
             #pragma HLS UNROLL
             for (int j=0; j < CONFIG_T::head_dim_value; ++j) {
                 #pragma HLS UNROLL
                 mat_res_con[CONFIG_T::head_dim_value*i + j] = data_in[i][j].read();
             }
         }
-        dense<data_T, res_T, typename CONFIG_T::config_mult2>(mat_res_con, res+(CONFIG_T::feature_dim*k), attention_output_weight, attention_output_bias);
-        // nnet::print_result( res+(CONFIG_T::feature_dim*j), std::cout);
+        dense<data_T, res_T, typename CONFIG_T::config_mult2>(mat_res_con, dense_out, attention_output_weight, attention_output_bias);
+        for (int i=0; i < CONFIG_T::feature_dim; ++i) {
+            #pragma HLS UNROLL
+            res[CONFIG_T::feature_dim*k + i] = dense_out[i];
+        }
     }
 }
 
@@ -267,41 +254,12 @@ template<class data_T, typename CONFIG_T>
 void data_prep(
-    data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim],
-    data_T data_vk[CONFIG_T::seq_len * CONFIG_T::feature_dim],
-    hls::stream<data_T> d_value[CONFIG_T::feature_dim],
-    hls::stream<data_T> d_query[CONFIG_T::feature_dim],
-    hls::stream<data_T> d_key[CONFIG_T::feature_dim])
+    data_T data[CONFIG_T::seq_len * CONFIG_T::feature_dim],
+    hls::stream<data_T> d[CONFIG_T::feature_dim])
 {
-    #pragma HLS ARRAY_PARTITION variable=d_value
complete dim=1 - #pragma HLS ARRAY_PARTITION variable=d_query complete dim=1 - #pragma HLS ARRAY_PARTITION variable=d_key complete dim=1 - + #pragma HLS ARRAY_PARTITION variable=d complete dim=1 for (int j=0; j d_value[CONFIG_T::num_heads][CONFIG_T::feature_dim]; hls::stream d_query[CONFIG_T::num_heads][CONFIG_T::feature_dim]; - hls::stream d_key[CONFIG_T::num_heads][CONFIG_T::feature_dim]; - hls::stream q_proj[CONFIG_T::num_heads][CONFIG_T::head_dim_key]; - hls::stream k_proj[CONFIG_T::num_heads][CONFIG_T::head_dim_key]; - hls::stream v_proj[CONFIG_T::num_heads][CONFIG_T::head_dim_value]; -// data_T v_reshape[CONFIG_T::num_heads][CONFIG_T::head_dim_value][CONFIG_T::seq_len]; + hls::stream> q_proj[CONFIG_T::num_heads]; + hls::stream> k_proj[CONFIG_T::num_heads]; + hls::stream> v_proj[CONFIG_T::num_heads]; data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; hls::stream matr_out[CONFIG_T::num_heads][CONFIG_T::head_dim_value]; #pragma HLS DATAFLOW + #pragma HLS ARRAY_PARTITION variable=d_query complete dim=1 #pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 #pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 #pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 -// #pragma HLS ARRAY_PARTITION variable=v_reshape complete dim=1 #pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 #pragma HLS ARRAY_PARTITION variable=matr_out complete dim=1 // std::cout << "input to MHA: " << std::endl; // nnet::print_result(data_q, std::cout); // std::cout << " " << std::endl; - dataprep: for (int i=0;i(data_q, data_vk, d_value[i], d_query[i], d_key[i]); - } - - - // linear projection - d_value: for (int i=0;i(d_value[i], v_proj[i], value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); + prepq: for (int i=0;i(data_q, d_query[i]); } - d_query: for (int i=0;i(d_query[i], q_proj[i], query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i)); + prepvk: for (int i=0;i(data_vk, d_value[i]); } - d_key: for (int i=0;i(d_key[i], k_proj[i], key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), key_bias+(CONFIG_T::head_dim_key*i)); + + // linear projection + lin_proj: for (int i=0;i( + d_query[i], d_value[i], + k_proj[i], q_proj[i], v_proj[i], + key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), key_bias+(CONFIG_T::head_dim_key*i), + query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i), + value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); } - -// value_reshape: for (int i=0; i < CONFIG_T::num_heads; ++i){ -// #pragma HLS UNROLL -// nnet::value_prep(v_proj[i], v_reshape[i]); -// } maxtrixmul1: for (int i=0; i < CONFIG_T::num_heads; ++i){ #pragma HLS UNROLL @@ -360,11 +334,10 @@ void multiheadattention( nnet::matrixmul(qk_mul[i], v_proj[i], matr_out[i]);//stream } - dense_out(matr_out, res, attention_output_weight, attention_output_bias); + nnet::dense_out(matr_out, res, attention_output_weight, attention_output_bias); // std::cout << "output from MHA: " << std::endl; // std::cout << " " << std::endl; } } #endif - diff --git a/hls4ml/templates/vivado/vivado_synth.tcl b/hls4ml/templates/vivado/vivado_synth.tcl index 4634b166f6..5a2bcfc453 100644 --- a/hls4ml/templates/vivado/vivado_synth.tcl +++ b/hls4ml/templates/vivado/vivado_synth.tcl @@ -1,6 +1,6 @@ set tcldir [file dirname [info script]] source [file join $tcldir project.tcl] -add_files 
${project_name}_prj/solution1/syn/vhdl +add_files ${project_name}_prj/solution1/syn/verilog synth_design -top ${project_name} -part $part report_utilization -file vivado_synth.rpt From 5d28f581ae8817d9dbaddb7605663c0d995a915c Mon Sep 17 00:00:00 2001 From: Ethan Date: Fri, 14 Apr 2023 17:48:24 -0700 Subject: [PATCH 27/55] update bit precision --- .../backends/vivado/passes/core_templates.py | 3 + .../vivado/passes/transformer_templates.py | 3 + hls4ml/backends/vivado/vivado_backend.py | 19 ++- .../vivado/nnet_utils/nnet_activation.h | 121 ++++++++---------- .../nnet_utils/nnet_multiheadattention.h | 7 +- 5 files changed, 81 insertions(+), 72 deletions(-) diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index f0090dcb2a..41e5796917 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -183,6 +183,9 @@ def format(self, node): static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; typedef {exp_table_t.name} exp_table_t; typedef {inv_table_t.name} inv_table_t; + typedef {accum_t.name} accum_t; + static const unsigned inv_range = {inv_range}; + static const unsigned exp_range = {exp_range}; }};\n""" activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});' diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py index 96aad94625..183ffdcd7d 100644 --- a/hls4ml/backends/vivado/passes/transformer_templates.py +++ b/hls4ml/backends/vivado/passes/transformer_templates.py @@ -30,6 +30,9 @@ static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; typedef ap_{table_t} exp_table_t; typedef ap_{table_t} inv_table_t; + typedef {accum_t.name} accum_t; + static const unsigned inv_range = {inv_range}; + static const unsigned exp_range = {exp_range}; }};\n""" diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index dffc3f77f5..08ac92a50d 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -27,6 +27,8 @@ SeparableConv2D, SimpleRNN, Softmax, + LayerNormalization, + MultiHeadAttention ) from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType @@ -497,6 +499,12 @@ def init_softmax(self, layer): layer.set_attr('exp_table_t', layer.get_attr('table_t')) if 'inv_table_t' not in layer.attributes: layer.set_attr('inv_table_t', layer.get_attr('table_t')) + if 'accum_t' not in layer.attributes: + layer.set_attr('accum_t', FixedPrecisionType(width=18, integer=8)) + if 'inv_range' not in layer.attributes: + layer.set_attr('inv_range', 128) + if 'exp_range' not in layer.attributes: + layer.set_attr('exp_range', 8) if layer.model.config.is_resource_strategy(layer): # 'resource' strategy = 'latency' for Softmax # layer.set_attr('implementation', 'latency') @@ -514,7 +522,7 @@ def init_softmax(self, layer): @layer_optimizer(LayerNormalization) def init_layernormalization(self, layer): if 'table_t' not in layer.attributes: - layer.set_attr('table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=32, integer=5))) + layer.set_attr('table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=32, integer=8))) if 'table_size' not in 
layer.attributes:
             layer.set_attr('table_size', 2048)  # table size
         if 'table_range' not in layer.attributes:
@@ -594,9 +602,16 @@ def init_mha(self, layer):
         index_t = IntegerPrecisionType(width=1, signed=False)
         layer.set_attr('index_t', index_t)
         if 'table_t' not in layer.attributes:
-            layer.set_attr('table_t', FixedPrecisionType(width=32, integer=5))
+            layer.set_attr('table_t', FixedPrecisionType(width=24, integer=8))
         if 'table_size' not in layer.attributes:
             layer.set_attr('table_size', 2048)
+        if 'accum_t' not in layer.attributes:
+            layer.set_attr('accum_t', FixedPrecisionType(width=24, integer=8))
+        if 'inv_range' not in layer.attributes:
+            layer.set_attr('inv_range', 128)
+        if 'exp_range' not in layer.attributes:
+            layer.set_attr('exp_range', 8)
         layer.set_attr('strategy', 'resource')  # latency
+
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h
index 968ad07c97..d5e30dc71b 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h
@@ -269,43 +269,49 @@ void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
     }
 }
 
-template <typename CONFIG_T, int N_TABLE> void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) {
+template <typename CONFIG_T, int N_TABLE>
+void init_exp_table_legacy(typename CONFIG_T::exp_table_t table_out[N_TABLE])
+{
+    float exp_range = (float) CONFIG_T::exp_range;
     for (int ii = 0; ii < N_TABLE; ii++) {
-        // First, convert from table index to X-value (signed 8-bit, range -8 to +8)
-        float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE);
+        // First, convert from table index to X-value (range -exp_range to +exp_range)
+        float in_val = 2 * exp_range * (ii - float(N_TABLE) / 2.0) / float(N_TABLE);
         // Next, compute lookup table function
-        typename CONFIG_T::table_t real_val = exp_fcn_float(in_val);
-        // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl;
+        typename CONFIG_T::exp_table_t real_val = exp_fcn_float(in_val);
+        //std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl;
         table_out[ii] = real_val;
     }
 }
 
-template <typename CONFIG_T, int N_TABLE> void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) {
+template <typename CONFIG_T, int N_TABLE>
+void init_invert_table_legacy(typename CONFIG_T::inv_table_t table_out[N_TABLE])
+{
+    float inv_range = (float) CONFIG_T::inv_range;
     // Inversion function:
     //   result = 1/x
     for (int ii = 0; ii < N_TABLE; ii++) {
-        // First, convert from table index to X-value (signed 8-bit, range 0 to +64)
-        float in_val = 64.0 * ii / float(N_TABLE);
-        // Next, compute lookup table function
-        if (in_val > 0.0)
-            table_out[ii] = 1.0 / in_val;
-        else
-            table_out[ii] = 0.0;
+        // First, convert from table index to X-value (range 0 to +inv_range)
+        float in_val = inv_range*ii/float(N_TABLE);
+        if (in_val > 0.0) table_out[ii] = 1.0/in_val;
+        else table_out[ii] = 0.0;
     }
 }
 
-template <class data_T, class res_T, typename CONFIG_T>
-void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+template<class data_T, class res_T, typename CONFIG_T>
+void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in])
+{
+    #pragma HLS pipeline
+
+    int exp_range = CONFIG_T::exp_range;
+    int inv_range = CONFIG_T::inv_range;
     // Initialize the lookup table
 #ifdef __HLS_SYN__
     bool initialized = false;
-    typename CONFIG_T::table_t exp_table[CONFIG_T::table_size];
-    typename CONFIG_T::table_t invert_table[CONFIG_T::table_size];
+    typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size];
+    typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size];
 #else
     static bool initialized = false;
-    static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size];
-    static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size];
+    static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size];
+    static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size];
 #endif
     if (!initialized) {
         init_exp_table_legacy<CONFIG_T, CONFIG_T::table_size>(exp_table);
@@ -314,68 +320,47 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
     }
 
     // Index into the lookup table based on data for exponentials
-    typename CONFIG_T::table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision
-    typename CONFIG_T::table_t exp_diff_res;            // different, independent, fixed point precision
-    data_T data_cache[CONFIG_T::n_in];
+    typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision
+    typename CONFIG_T::exp_table_t exp_diff_res; // different, independent, fixed point precision
+    typename CONFIG_T::exp_table_t data_cache[CONFIG_T::n_in];
     int data_round;
     int index;
-    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
-        data_cache[ii] = data[ii];
-        exp_res[ii] = 0;
-    }
-    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
-        for (int jj = 0; jj < CONFIG_T::n_in; jj++) {
-            if (ii == jj)
-                exp_diff_res = 1;
-            else {
-                data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::table_size / 16;
-                index = data_round + 8 * CONFIG_T::table_size / 16;
-                if (index < 0)
-                    index = 0;
-                if (index > CONFIG_T::table_size - 1)
-                    index = CONFIG_T::table_size - 1;
-                exp_diff_res = exp_table[index];
-            }
-            exp_res[ii] += exp_diff_res;
-        }
-    }
 
-    // Second loop to invert
-    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
-        int exp_res_index = exp_res[ii] * CONFIG_T::table_size / 64;
-        if (exp_res_index < 0)
-            exp_res_index = 0;
-        if (exp_res_index > CONFIG_T::table_size - 1)
-            exp_res_index = CONFIG_T::table_size - 1;
-        // typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index];
-        res[ii] = (res_T)invert_table[exp_res_index];
-    }
-}
-template <class data_T, class res_T, typename CONFIG_T>
-void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
-    for (int i = 0; i < CONFIG_T::n_in; i++) {
-        #pragma HLS UNROLL
-        res[i] = (res_T)0;
+#pragma HLS array_partition variable=data_cache complete
+
+    typename CONFIG_T::accum_t denominator;
+    typename CONFIG_T::inv_table_t deno_inver;
+
+    denominator = 0;
+    for (int ii=0; ii < CONFIG_T::n_in; ii++){
+        data_round = data[ii]*CONFIG_T::table_size/(exp_range*2);
+        index = data_round + exp_range*CONFIG_T::table_size/(exp_range*2);
+        if (index < 0)   index = 0;
+        if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1;
+        data_cache[ii] = exp_table[index];
+        denominator += exp_table[index];
     }
-    int maximum = data[0];
-    int idx = 0;
-    for (int i = 1; i < CONFIG_T::n_in; i++) {
-        #pragma HLS PIPELINE
-        if (data[i] > maximum) {
-            maximum = data[i];
-            idx = i;
-        }
-    }
-    res[idx] = (res_T)1;
-}
+
+    // using lookup table for inverse
+    int exp_res_index = denominator*CONFIG_T::table_size/inv_range;
+    if (exp_res_index < 0) exp_res_index = 0;
+    if (exp_res_index > CONFIG_T::table_size-1) exp_res_index = CONFIG_T::table_size-1;
+    deno_inver = invert_table[exp_res_index];
+
+    for (int ii=0; ii < CONFIG_T::n_in; ii++){
+        res[ii] = (res_T) (data_cache[ii]*deno_inver);
+    }
+}
 
-template <class data_T, class res_T, typename CONFIG_T>
-void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+template<class data_T, class res_T, typename CONFIG_T>
+void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]){
     #pragma HLS inline
     switch (CONFIG_T::implementation) {
     case softmax_implementation::latency:
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
index 0ed0e0e0a4..8548e1125b 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
@@ -61,7 +61,8 @@ void matrixmul_transpose(
     res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K
 {
     const data_T dk = 1.0/sqrt(CONFIG_T::head_dim_key);
-    data_T QKij, QK_1;
+    data_T QK_1;
+    typename CONFIG_T::accum_t QKij;
     data_T Qi[CONFIG_T::head_dim_key];
     data_T Product[CONFIG_T::seq_len]; // seq_Q, seq_K
data_T qk_smout[CONFIG_T::seq_len]; @@ -335,8 +336,10 @@ void multiheadattention( } nnet::dense_out(matr_out, res, attention_output_weight, attention_output_bias); - // std::cout << "output from MHA: " << std::endl; + // std::cout << "out MHA: " << std::endl; + // nnet::print_result(res, std::cout); // std::cout << " " << std::endl; + } } From 2fc68d0a01c55396e824b502b4f955131813bd11 Mon Sep 17 00:00:00 2001 From: Ethan Date: Sun, 16 Apr 2023 18:06:04 -0700 Subject: [PATCH 28/55] config update --- hls4ml/backends/vivado/passes/transformer_templates.py | 4 ++-- hls4ml/backends/vivado/vivado_backend.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py index 183ffdcd7d..16992991ee 100644 --- a/hls4ml/backends/vivado/passes/transformer_templates.py +++ b/hls4ml/backends/vivado/passes/transformer_templates.py @@ -28,8 +28,8 @@ static const unsigned io_type = nnet::{iotype}; static const unsigned reuse_factor = {reuse}; static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; - typedef ap_{table_t} exp_table_t; - typedef ap_{table_t} inv_table_t; + typedef {table_t.name} exp_table_t; + typedef {table_t.name} inv_table_t; typedef {accum_t.name} accum_t; static const unsigned inv_range = {inv_range}; static const unsigned exp_range = {exp_range}; diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 08ac92a50d..20f77334f1 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -602,7 +602,7 @@ def init_mha(self, layer): index_t = IntegerPrecisionType(width=1, signed=False) layer.set_attr('index_t', index_t) if 'table_t' not in layer.attributes: - layer.set_attr('table_t', FixedPrecisionType(width=24, integer=8)) + layer.set_attr('table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=24, integer=8))) if 'table_size' not in layer.attributes: layer.set_attr('table_size', 2048) if 'accum_t' not in layer.attributes: From b5c95cf519bab3b994ee55397f6781f88e012bc8 Mon Sep 17 00:00:00 2001 From: Ethan Date: Thu, 20 Apr 2023 17:14:01 -0700 Subject: [PATCH 29/55] add some comment --- .../vivado/nnet_utils/nnet_activation.h | 24 ++++++--- .../vivado/nnet_utils/nnet_layernorm.h | 5 ++ vivado.jou | 21 ++++++++ vivado.log | 52 +++++++++++++++++++ 4 files changed, 96 insertions(+), 6 deletions(-) create mode 100644 vivado.jou create mode 100644 vivado.log diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index d5e30dc71b..cfe169f123 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -326,31 +326,43 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) int data_round; int index; + // std::cout << "input to SM: " << std::endl; ///// + // nnet::print_result(data, std::cout); ///// + // std::cout << " " << std::endl; ///// + #pragma HLS array_partition variable=data_cache complete typename CONFIG_T::accum_t denominator; typename CONFIG_T::inv_table_t deno_inver; - // std::cout << "denominator: " << std::endl; ///// + denominator = 0; for (int ii=0; ii CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; denominator += exp_table[index]; - // std::cout << " index: " << index << " " < CONFIG_T::table_size-1) exp_res_index = 
CONFIG_T::table_size-1; deno_inver = invert_table[exp_res_index]; - + // std::cout << " deno_inver: " << deno_inver << std::endl; ///// for (int ii=0; ii(res, std::cout); + // std::cout << " " << std::endl; + } } diff --git a/vivado.jou b/vivado.jou new file mode 100644 index 0000000000..3828028f65 --- /dev/null +++ b/vivado.jou @@ -0,0 +1,21 @@ +#----------------------------------------------------------- +# Vivado v2019.2 (64-bit) +# SW Build 2708876 on Wed Nov 6 21:39:14 MST 2019 +# IP Build 2700528 on Thu Nov 7 00:09:20 MST 2019 +# Start of session at: Mon Apr 17 20:02:54 2023 +# Process ID: 963 +# Current directory: /home/ej/workspace/hls4ml/hls4ml +# Command line: vivado +# Log file: /home/ej/workspace/hls4ml/hls4ml/vivado.log +# Journal file: /home/ej/workspace/hls4ml/hls4ml/vivado.jou +#----------------------------------------------------------- +start_gui +create_project BDT_vivado /home/ej/workspace/fwX/BDT_vivado -part xcvu9p-flga2104-2L-e +set_property board_part xilinx.com:vcu118:part0:2.3 [current_project] +add_files -norecurse -scan_for_includes {/home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mlbW_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mrcU_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mjbC_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores4_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mtde_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_meOg_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mncg_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores7_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_msc4_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mfYi_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mocq_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores9_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mpcA_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mqcK_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores0_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores3_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mcud_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores6_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mg8j_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mhbi_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mdEe_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores2_V_rom.dat 
/home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores5_V_rom.dat [... remaining generated .dat ROM paths elided ...]}
+add_files -norecurse -scan_for_includes {/home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt.v [... remaining generated Verilog source paths elided ...]}
+update_compile_order -fileset sources_1
+update_compile_order -fileset sources_1
+launch_runs impl_1 -jobs 8
+wait_on_run impl_1
+open_run impl_1
diff --git a/vivado.log b/vivado.log
new file mode 100644
index 0000000000..3a5702e20b
--- /dev/null
+++ b/vivado.log
@@ -0,0 +1,52 @@
+#-----------------------------------------------------------
+# Vivado v2019.2 (64-bit)
+# SW Build 2708876 on Wed Nov 6 21:39:14 MST 2019
+# IP Build 2700528 on Thu Nov 7 00:09:20 MST 2019
+# Start of session at: Mon Apr 17 20:02:54 2023
+# Process ID: 963
+# Current directory: /home/ej/workspace/hls4ml/hls4ml
+# Command line: vivado
+# Log file: /home/ej/workspace/hls4ml/hls4ml/vivado.log
+# Journal file: /home/ej/workspace/hls4ml/hls4ml/vivado.jou
+#-----------------------------------------------------------
+start_gui
+create_project BDT_vivado /home/ej/workspace/fwX/BDT_vivado -part xcvu9p-flga2104-2L-e
+INFO: [IP_Flow 19-234] Refreshing IP repositories
+INFO: [IP_Flow 19-1704] No user IP repositories specified
+INFO: [IP_Flow 19-2313] Loaded Vivado IP repository '/opt/Xilinx/Vivado/2019.2/data/ip'.
+create_project: Time (s): cpu = 00:00:07 ; elapsed = 00:00:12 . Memory (MB): peak = 6734.895 ; gain = 33.020 ; free physical = 15830 ; free virtual = 33929
+set_property board_part xilinx.com:vcu118:part0:2.3 [current_project]
+add_files -norecurse -scan_for_includes {[... same .dat ROM path list as in vivado.jou, elided ...]}
+add_files -norecurse -scan_for_includes {[... same Verilog source path list as in vivado.jou, elided ...]}
+update_compile_order -fileset sources_1
+update_compile_order -fileset sources_1
+launch_runs impl_1 -jobs 8
+[Mon Apr 17 20:07:26 2023] Launched
synth_1... +Run output will be captured here: /home/ej/workspace/fwX/BDT_vivado/BDT_vivado.runs/synth_1/runme.log +[Mon Apr 17 20:07:26 2023] Launched impl_1... +Run output will be captured here: /home/ej/workspace/fwX/BDT_vivado/BDT_vivado.runs/impl_1/runme.log +CRITICAL WARNING: [Common 17-1649] The Vivado message database '/home/ej/workspace/fwX/BDT_vivado/BDT_vivado.runs/synth_1/vivado.pb' contains 24975 messages. Restoring all messages from this message database will impact Vivado performance, so only WARNING, CRITICAL WARNING, and ERROR messages will be restored. To restore all messages from this file use the tcl command 'set_param messaging.loadPbLimit 24976' and re-open the project. +open_run impl_1 +INFO: [Device 21-403] Loading part xcvu9p-flga2104-2L-e +Netlist sorting complete. Time (s): cpu = 00:00:00.01 ; elapsed = 00:00:00 . Memory (MB): peak = 7651.996 ; gain = 0.000 ; free physical = 13960 ; free virtual = 32677 +INFO: [Netlist 29-17] Analyzing 272 Unisim elements for replacement +INFO: [Netlist 29-28] Unisim Transformation completed in 0 CPU seconds +WARNING: [Netlist 29-101] Netlist 'fwXbdt' is not ideal for floorplanning, since the cellview 'fwXbdt' contains a large number of primitives. Please consider enabling hierarchy in synthesis if you want to do floorplanning. +INFO: [Project 1-479] Netlist was created with Vivado 2019.2 +INFO: [Project 1-570] Preparing netlist for logic optimization +Reading XDEF placement. +Reading placer database... +Reading XDEF routing. +Read XDEF File: Time (s): cpu = 00:00:00.08 ; elapsed = 00:00:00.09 . Memory (MB): peak = 7708.898 ; gain = 2.000 ; free physical = 13855 ; free virtual = 32573 +Restored from archive | CPU: 0.080000 secs | Memory: 2.751320 MB | +Finished XDEF File Restore: Time (s): cpu = 00:00:00.08 ; elapsed = 00:00:00.09 . Memory (MB): peak = 7708.898 ; gain = 2.000 ; free physical = 13855 ; free virtual = 32573 +Netlist sorting complete. Time (s): cpu = 00:00:00 ; elapsed = 00:00:00 . Memory (MB): peak = 7732.711 ; gain = 0.000 ; free physical = 13855 ; free virtual = 32573 +INFO: [Project 1-111] Unisim Transformation Summary: + A total of 40 instances were transformed. + DSP48E2 => DSP48E2 (DSP_ALU, DSP_A_B_DATA, DSP_C_DATA, DSP_MULTIPLIER, DSP_M_DATA, DSP_OUTPUT, DSP_PREADD, DSP_PREADD_DATA): 2 instances + IBUF => IBUF (IBUFCTRL, INBUF): 38 instances + +open_run: Time (s): cpu = 00:00:19 ; elapsed = 00:00:25 . Memory (MB): peak = 8235.492 ; gain = 1086.348 ; free physical = 13483 ; free virtual = 32204 +WARNING: [Timing 38-313] There are no user specified timing constraints. Timing constraints are needed for proper timing analysis. +exit +INFO: [Common 17-206] Exiting Vivado at Mon Apr 17 20:24:36 2023... 
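A note on the softmax rework in the three patches above: the per-row division is replaced by two lookup tables, exp_table for inputs swept over [-exp_range, +exp_range) and invert_table for accumulated sums swept over [0, inv_range). The sketch below mirrors the same indexing arithmetic in plain Python. It is a minimal sketch, not the HLS code itself: it assumes the default table_size=2048, exp_range=8, inv_range=128 set in vivado_backend.py, and the function and variable names are illustrative.

    import math

    TABLE_SIZE, EXP_RANGE, INV_RANGE = 2048, 8, 128

    # exp_table[i] ~= exp(x) for x swept over [-EXP_RANGE, +EXP_RANGE)
    exp_table = [math.exp(2 * EXP_RANGE * (i - TABLE_SIZE / 2) / TABLE_SIZE) for i in range(TABLE_SIZE)]
    # invert_table[i] ~= 1/x for x swept over [0, INV_RANGE); the x=0 entry is pinned to 0
    invert_table = [TABLE_SIZE / (INV_RANGE * i) if i > 0 else 0.0 for i in range(TABLE_SIZE)]

    def clamp(i):
        # saturate the index into the table, as the HLS clamps do
        return min(max(i, 0), TABLE_SIZE - 1)

    def softmax_lut(x):
        # forward index, same math as softmax_legacy: idx = x*T/(2*R) + T/2
        num = [exp_table[clamp(int(v * TABLE_SIZE / (2 * EXP_RANGE)) + TABLE_SIZE // 2)] for v in x]
        den = sum(num)  # corresponds to the accum_t accumulation in the HLS code
        inv = invert_table[clamp(int(den * TABLE_SIZE / INV_RANGE))]
        return [n * inv for n in num]  # one multiply per element, no divider

One reciprocal lookup thus replaces n divisions per softmax row; the error budget is set by the table granularity (2*exp_range/table_size per exp step) and by how well inv_range covers the largest reachable denominator.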
From 3b8aa8d056ad5cda20838359be80e9fda82c087e Mon Sep 17 00:00:00 2001 From: Jan-Frederik Schulte Date: Fri, 13 Sep 2024 15:35:15 -0400 Subject: [PATCH 30/55] run pre-commit --- .github/ISSUE_TEMPLATE/bug_report.md | 88 +- .github/ISSUE_TEMPLATE/config.yml | 10 +- .github/ISSUE_TEMPLATE/feature_request.md | 56 +- contrib/garnet.py | 2 +- docs/status.rst | 50 +- .../vivado/passes/broadcast_stream.py | 234 +- .../vivado/passes/transformer_templates.py | 293 +- .../vivado_accelerator/supported_boards.json | 84 +- hls4ml/converters/keras/core.py | 332 +- hls4ml/converters/keras/multiheadattention.py | 78 +- hls4ml/converters/keras/pooling.py | 182 +- hls4ml/converters/keras/qkeras.py | 37 +- hls4ml/converters/keras_to_hls.py | 684 +- hls4ml/model/profiling.py | 1413 +- .../templates/quartus/ac_types/ac_channel.h | 1110 +- .../templates/quartus/ac_types/ac_complex.h | 890 +- hls4ml/templates/quartus/ac_types/ac_fixed.h | 3092 ++-- hls4ml/templates/quartus/ac_types/ac_float.h | 2392 +-- hls4ml/templates/quartus/ac_types/ac_int.h | 6198 +++---- hls4ml/templates/quartus/ac_types/ac_sc.h | 1104 +- .../templates/quartus/ac_types/ac_std_float.h | 4636 ++--- hls4ml/templates/quartus/ac_types/stream.h | 70 +- hls4ml/templates/quartus/firmware/defines.h | 94 +- .../templates/quartus/firmware/myproject.cpp | 96 +- hls4ml/templates/quartus/firmware/myproject.h | 96 +- .../firmware/nnet_utils/nnet_batchnorm.h | 208 +- .../quartus/firmware/nnet_utils/nnet_common.h | 142 +- .../quartus/firmware/nnet_utils/nnet_conv1d.h | 128 +- .../quartus/firmware/nnet_utils/nnet_dense.h | 338 +- .../nnet_utils/nnet_dense_compressed.h | 160 +- .../firmware/nnet_utils/nnet_helpers.h | 280 +- .../quartus/firmware/nnet_utils/nnet_merge.h | 498 +- .../quartus/firmware/nnet_utils/nnet_mult.h | 226 +- .../firmware/nnet_utils/nnet_padding.h | 198 +- .../quartus/myproject_test_parallel.cpp | 224 +- hls4ml/templates/vivado/#vivado_synth.tcl# | 12 +- hls4ml/templates/vivado/ap_types/ap_common.h | 752 +- hls4ml/templates/vivado/ap_types/ap_decl.h | 424 +- hls4ml/templates/vivado/ap_types/ap_fixed.h | 720 +- .../templates/vivado/ap_types/ap_fixed_base.h | 4708 ++--- .../templates/vivado/ap_types/ap_fixed_ref.h | 1436 +- .../vivado/ap_types/ap_fixed_special.h | 460 +- hls4ml/templates/vivado/ap_types/ap_int.h | 660 +- .../templates/vivado/ap_types/ap_int_base.h | 3770 ++-- hls4ml/templates/vivado/ap_types/ap_int_ref.h | 2692 +-- .../vivado/ap_types/ap_int_special.h | 446 +- .../templates/vivado/ap_types/ap_shift_reg.h | 276 +- .../vivado/ap_types/etc/ap_private.h | 14398 ++++++++-------- hls4ml/templates/vivado/ap_types/hls_stream.h | 526 +- .../vivado/ap_types/utils/x_hls_utils.h | 160 +- hls4ml/templates/vivado/build_lib.sh | 34 +- .../templates/vivado/firmware/myproject.cpp | 46 +- hls4ml/templates/vivado/firmware/myproject.h | 38 +- hls4ml/templates/vivado/myproject_test.cpp | 188 +- .../vivado/nnet_utils/nnet_activation.h | 1590 +- .../templates/vivado/nnet_utils/nnet_array.h | 104 +- .../vivado/nnet_utils/nnet_batchnorm.h | 248 +- .../vivado/nnet_utils/nnet_batchnorm_stream.h | 246 +- .../templates/vivado/nnet_utils/nnet_common.h | 150 +- .../templates/vivado/nnet_utils/nnet_conv1d.h | 132 +- .../vivado/nnet_utils/nnet_conv1d_stream.h | 178 +- .../templates/vivado/nnet_utils/nnet_conv2d.h | 150 +- .../vivado/nnet_utils/nnet_conv2d_latency.h | 178 +- .../templates/vivado/nnet_utils/nnet_dense.h | 117 +- .../vivado/nnet_utils/nnet_dense_compressed.h | 180 +- .../vivado/nnet_utils/nnet_dense_latency.h | 144 +- 
.../vivado/nnet_utils/nnet_dense_resource.h | 526 +- .../vivado/nnet_utils/nnet_dense_seq.h | 91 +- .../templates/vivado/nnet_utils/nnet_garnet.h | 1632 +- .../vivado/nnet_utils/nnet_helpers.h | 764 +- .../vivado/nnet_utils/nnet_layernorm.h | 734 +- .../templates/vivado/nnet_utils/nnet_merge.h | 512 +- .../vivado/nnet_utils/nnet_merge_stream.h | 740 +- .../templates/vivado/nnet_utils/nnet_mult.h | 232 +- .../nnet_utils/nnet_multiheadattention.h | 683 +- .../vivado/nnet_utils/nnet_padding.h | 290 +- .../vivado/nnet_utils/nnet_pooling.h | 626 +- .../vivado/nnet_utils/nnet_recr_activations.h | 112 +- .../vivado/nnet_utils/nnet_recurrent.h | 1172 +- hls4ml/templates/vivado/vivado_synth.tcl | 12 +- .../krnl_rtl_src/krnl_rtl_control_s_axi.v | 844 +- .../alveo/python_drivers/axi_stream_driver.py | 202 +- .../pynq-z2/tcl_scripts/axi_lite_design.tcl | 52 +- .../pynq-z2/tcl_scripts/axi_stream_design.tcl | 118 +- .../zcu102/tcl_scripts/axi_stream_design.tcl | 116 +- hls4ml/utils/config.py | 6 +- hls4ml/utils/plot.py | 448 +- hs_err_pid6927.log | 17 - test/pytest/test_precision_parsing.py | 29 + vivado.jou | 21 - vivado.log | 52 - 91 files changed, 35515 insertions(+), 35402 deletions(-) delete mode 100644 hs_err_pid6927.log create mode 100644 test/pytest/test_precision_parsing.py delete mode 100644 vivado.jou delete mode 100644 vivado.log diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index d0aa96a65b..1f0191f232 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -1,44 +1,44 @@ ---- -name: Bug report -about: Something isn't working as expected -title: '' -labels: bug -assignees: '' - ---- - - -## Prerequisites -Please make sure to check off these prerequisites before submitting a bug report. -- [ ] Test that the bug appears on the current version of the master branch. Make sure to include the commit hash of the commit you checked out. -- [ ] Check that the issue hasn't already been reported, by checking the currently open issues. -- [ ] If there are steps to reproduce the problem, make sure to write them down below. -- [ ] If relevant, please include the hls4ml project files, which were created directly before and/or after the bug. - -## Quick summary -Please give a brief and concise description of the bug. - -## Details -Please add to the following sections to describe the bug as accurately as possible. - -### Steps to Reproduce -Add what needs to be done to reproduce the bug. Add *commented* code examples and make sure to include the original model files / code, and the commit hash you are working on. - -1. Clone the hls4ml repository -2. Checkout the master branch, with commit hash: [...] -3. Run conversion [...] on model file with code [...] -4. [Further steps ...] - -### Expected behavior -Please add a brief description of what you expected to happen. - -### Actual behavior -Describe what actually happens instead. - -## Optional - -### Possible fix -If you already know where the issue stems from, or you have a hint please let us know. - -### Additional context -Add any other context about the problem here. +--- +name: Bug report +about: Something isn't working as expected +title: '' +labels: bug +assignees: '' + +--- + + +## Prerequisites +Please make sure to check off these prerequisites before submitting a bug report. +- [ ] Test that the bug appears on the current version of the master branch. Make sure to include the commit hash of the commit you checked out. 
+- [ ] Check that the issue hasn't already been reported, by checking the currently open issues. +- [ ] If there are steps to reproduce the problem, make sure to write them down below. +- [ ] If relevant, please include the hls4ml project files, which were created directly before and/or after the bug. + +## Quick summary +Please give a brief and concise description of the bug. + +## Details +Please add to the following sections to describe the bug as accurately as possible. + +### Steps to Reproduce +Add what needs to be done to reproduce the bug. Add *commented* code examples and make sure to include the original model files / code, and the commit hash you are working on. + +1. Clone the hls4ml repository +2. Checkout the master branch, with commit hash: [...] +3. Run conversion [...] on model file with code [...] +4. [Further steps ...] + +### Expected behavior +Please add a brief description of what you expected to happen. + +### Actual behavior +Describe what actually happens instead. + +## Optional + +### Possible fix +If you already know where the issue stems from, or you have a hint please let us know. + +### Additional context +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 907ac6db49..776bc33c31 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,5 +1,5 @@ -blank_issues_enabled: false -contact_links: - - name: Talk and engage with the comunity - url: https://github.com/fastmachinelearning/hls4ml/discussions/categories/general - about: Check out the GitHub discusisons page for hls4ml. This is the best way to get in touch with us. In particular, if you have a question about hls4ml or a general problem that is likely not a bug. +blank_issues_enabled: false +contact_links: + - name: Talk and engage with the comunity + url: https://github.com/fastmachinelearning/hls4ml/discussions/categories/general + about: Check out the GitHub discusisons page for hls4ml. This is the best way to get in touch with us. In particular, if you have a question about hls4ml or a general problem that is likely not a bug. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 1739f9d99f..84a6247d50 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -1,28 +1,28 @@ ---- -name: Feature request -about: Suggest an idea for hls4ml -title: '' -labels: enhancement -assignees: '' - ---- - -## Prerequisites -Please talk to us before creating a new feature request. So that you can check that the idea is not already in active development. - -You can present your idea over here at the GitHub discussions page for hls4ml: https://github.com/fastmachinelearning/hls4ml/discussions/categories/ideas - -Even if an idea is already being worked on you can still create a feature request, -if you would like to open a discussion about the feature or want to contribute to it. - -## Details -Please add to the following sections to describe the feature as accurately as possible. - -### New behavior -Please add a brief and concise description of what you would like to happen in hls4ml in the future. - -### Motivation -Please tell us why this feature is important to the community. - -### Parts of hls4ml being affected -Please describe which parts of hls4ml would be affected by this feature. 
+--- +name: Feature request +about: Suggest an idea for hls4ml +title: '' +labels: enhancement +assignees: '' + +--- + +## Prerequisites +Please talk to us before creating a new feature request. So that you can check that the idea is not already in active development. + +You can present your idea over here at the GitHub discussions page for hls4ml: https://github.com/fastmachinelearning/hls4ml/discussions/categories/ideas + +Even if an idea is already being worked on you can still create a feature request, +if you would like to open a discussion about the feature or want to contribute to it. + +## Details +Please add to the following sections to describe the feature as accurately as possible. + +### New behavior +Please add a brief and concise description of what you would like to happen in hls4ml in the future. + +### Motivation +Please tell us why this feature is important to the community. + +### Parts of hls4ml being affected +Please describe which parts of hls4ml would be affected by this feature. diff --git a/contrib/garnet.py b/contrib/garnet.py index 075819e9df..4d8b9096c9 100644 --- a/contrib/garnet.py +++ b/contrib/garnet.py @@ -322,7 +322,7 @@ def _setup_transforms(self, n_aggregators, n_filters, n_propagate): else: input_feature_transform = NamedDense(p, name=('FLR%d' % it)) output_feature_transform = NamedDense(f, name=('Fout%d' % it)) - output_activation_transform = keras.layers.Activation(self._output_activation) + # output_activation_transform = keras.layers.Activation(self._output_activation) aggregator_distance = NamedDense(a, name=('S%d' % it)) diff --git a/docs/status.rst b/docs/status.rst index 4ff4d33282..44881c2fb3 100644 --- a/docs/status.rst +++ b/docs/status.rst @@ -4,19 +4,28 @@ Status and Features Status ====== +====== The latest version (built from ``main``) is |version|. The stable version (released on PyPI) is |release|. See the :ref:`Release Notes` section for a changelog. +The latest version (built from ``main``) is |version|. +The stable version (released on PyPI) is |release|. +See the :ref:`Release Notes` section for a changelog. Features ======== +A list of supported ML frameworks, HLS backends, and neural network architectures, including a summary table is below. Dependencies are given in the :doc:`Setup ` page. A list of supported ML frameworks, HLS backends, and neural network architectures, including a summary table is below. Dependencies are given in the :doc:`Setup ` page. +ML framework support: ML framework support: +* (Q)Keras +* PyTorch (limited) +* (Q)ONNX (in development) * (Q)Keras * PyTorch (limited) * (Q)ONNX (in development) @@ -30,6 +39,16 @@ Neural network architectures: HLS backends: +* Vivado HLS +* Intel HLS +* Vitis HLS (experimental) +* Fully connected NN (multilayer perceptron, MLP) +* Convolutional NN +* Recurrent NN (LSTM) +* Graph NN (GarNet) + +HLS backends: + * Vivado HLS * Intel HLS * Vitis HLS (experimental) @@ -39,6 +58,8 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below. .. list-table:: :header-rows: 1 + * - ML framework/HLS backend + - (Q)Keras * - ML framework/HLS backend - (Q)Keras - PyTorch @@ -46,20 +67,35 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below. 
- Vivado HLS - Intel HLS - Vitis HLS + - (Q)ONNX + - Vivado HLS + - Intel HLS + - Vitis HLS * - MLP - ``supported`` - ``limited`` - ``in development`` + - ``limited`` + - ``in development`` - ``supported`` - ``supported`` - ``experimental`` + * - CNN + - ``experimental`` * - CNN - ``supported`` - ``limited`` + - ``limited`` - ``in development`` - ``supported`` - ``supported`` - ``experimental`` + * - RNN (LSTM) + - ``supported`` + - ``N/A`` + - ``supported`` + - ``supported`` + - ``experimental`` * - RNN (LSTM) - ``supported`` - ``N/A`` @@ -74,6 +110,16 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below. - ``N/A`` - ``N/A`` - ``N/A`` + - ``supported`` + - ``supported`` + - ``N/A`` + * - GNN (GarNet) + - ``supported`` + - ``N/A`` + - ``N/A`` + - ``N/A`` + - ``N/A`` + - ``N/A`` Other feature notes: @@ -81,7 +127,7 @@ Other feature notes: * ``hls4ml`` is tested on Linux, and supports * Vivado HLS versions 2018.2 to 2020.1 * Intel HLS versions 20.1 to 21.4 - * Vitis HLS versions 2022.2 to 2024.1 + * Vitis HLS versions 2020.2 to 2022.2 (experimentally) * Windows and macOS are not supported * BDT support has moved to the `Conifer `__ package @@ -90,3 +136,5 @@ Example Models We also provide and document several example ``hls4ml`` models in `this GitHub repository `_, which is included as a submodule. You can check it out by doing ``git submodule update --init --recursive`` from the top level directory of ``hls4ml``. +We also provide and document several example ``hls4ml`` models in `this GitHub repository `_, which is included as a submodule. +You can check it out by doing ``git submodule update --init --recursive`` from the top level directory of ``hls4ml``. diff --git a/hls4ml/backends/vivado/passes/broadcast_stream.py b/hls4ml/backends/vivado/passes/broadcast_stream.py index ec6322cf78..ed6ca55f18 100644 --- a/hls4ml/backends/vivado/passes/broadcast_stream.py +++ b/hls4ml/backends/vivado/passes/broadcast_stream.py @@ -1,117 +1,117 @@ -import numpy as np - -from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate -from hls4ml.model.layers import Concatenate, Layer, Merge, register_layer -from hls4ml.model.optimizer import OptimizerPass - - -class Broadcast(Layer): - '''Inserted between layers for broadcasting.''' - - def initialize(self): - shape = self.attributes['target_shape'] - if shape[0] is None: - shape = shape[1:] - dims = [f'N_SIZE_{i}_{self.index}' for i in range(1, len(shape) + 1)] - self.add_output_variable(shape, dims) - - -broadcast_function_template = 'nnet::broadcast_stream<{input_t}, {output_t}, {config}>({input}, {output});' -broadcast_config_template = """struct config{index} : nnet::broadcast_config {{ - static const unsigned in_width = {in_width}; - static const unsigned in_height = {in_height}; - static const unsigned in_chan = {in_chan}; - static const unsigned out_width = {out_width}; - static const unsigned out_height = {out_height}; - static const unsigned out_chan = {out_chan}; -}};\n""" -broadcast_include_list = ['nnet_utils/nnet_stream.h'] - - -class BroadcastConfigTemplate(LayerConfigTemplate): - def __init__(self): - super().__init__(Broadcast) - self.template = broadcast_config_template - - def format(self, node): - params = self._default_config_params(node) - params['in_height'] = node.get_input_variable().shape[0] - params['in_width'] = node.get_input_variable().shape[1] - params['in_chan'] = node.get_input_variable().shape[2] - params['out_height'] = node.get_output_variable().shape[0] - params['out_width'] 
= node.get_output_variable().shape[1] - params['out_chan'] = node.get_output_variable().shape[2] - - return self.template.format(**params) - - -class BroadcastFunctionTemplate(FunctionCallTemplate): - def __init__(self): - super().__init__(Broadcast, include_header=broadcast_include_list) - self.template = broadcast_function_template - - def format(self, node): - params = self._default_function_params(node) - return self.template.format(**params) - - -def register_broadcast_stream(backend): - # Register the layer types to the layer map - register_layer('Broadcast', Broadcast) - - # Register the optimization passes - backend.register_pass('broadcast_stream', BroadcastStream) - - # Register template passes - backend.register_template(BroadcastConfigTemplate) - backend.register_template(BroadcastFunctionTemplate) - - -class BroadcastStream(OptimizerPass): - def match(self, node): - if isinstance(node, Merge) and not isinstance(node, Concatenate): - inp1 = node.get_input_variable(node.inputs[0]) - inp2 = node.get_input_variable(node.inputs[1]) - return inp1.shape != inp2.shape - else: - return False - - def transform(self, model, node): - if model.config.backend.name not in ['Vivado'] or model.config.get_config_value('IOType') != 'io_stream': - return False - - inp = [node.get_input_variable(inp_name) for inp_name in node.inputs] - - if np.prod(inp[0].shape) > np.prod(inp[1].shape): - idx = 1 - attrs = {'target_shape': inp[0].shape} - else: - idx = 0 - attrs = {'target_shape': inp[1].shape} - - def supported_broadcast(inp_shape, target_shape): - # Must be (H, W, C) - if not len(inp_shape) == 3: - return False - # Supported: (1, 1, C) -> (H, W, C) - if inp_shape[0] == inp_shape[1] == 1 and inp_shape[2] == target_shape[2]: - return True - # Supported: (H, W, 1) -> (H, W, C) - if inp_shape[2] == 1 and inp_shape[0] == target_shape[0] and inp_shape[1] == target_shape[1]: - return True - return False - - brdcst_inp = node.inputs[idx] - inp_shape = node.get_input_variable(brdcst_inp).shape - target_shape = attrs['target_shape'] - if not supported_broadcast(inp_shape, target_shape): - raise RuntimeError( - f'Unsupported broadcast type for stream: {inp_shape} -> {target_shape};' - + 'Only (1, 1, C) -> (H, W, C) and (H, W, 1) -> (H, W, C) currently supported' - ) - brdcst_out = 'broadcast_' + brdcst_inp - brdcst_layer = model.make_node('Broadcast', brdcst_out, attrs, [brdcst_inp].copy()) - model.insert_node(brdcst_layer, before=node, input_idx=idx) - node.inputs[idx] = brdcst_out - - return True +import numpy as np + +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import Concatenate, Layer, Merge, register_layer +from hls4ml.model.optimizer import OptimizerPass + + +class Broadcast(Layer): + '''Inserted between layers for broadcasting.''' + + def initialize(self): + shape = self.attributes['target_shape'] + if shape[0] is None: + shape = shape[1:] + dims = [f'N_SIZE_{i}_{self.index}' for i in range(1, len(shape) + 1)] + self.add_output_variable(shape, dims) + + +broadcast_function_template = 'nnet::broadcast_stream<{input_t}, {output_t}, {config}>({input}, {output});' +broadcast_config_template = """struct config{index} : nnet::broadcast_config {{ + static const unsigned in_width = {in_width}; + static const unsigned in_height = {in_height}; + static const unsigned in_chan = {in_chan}; + static const unsigned out_width = {out_width}; + static const unsigned out_height = {out_height}; + static const unsigned out_chan = {out_chan}; +}};\n""" 
+broadcast_include_list = ['nnet_utils/nnet_stream.h'] + + +class BroadcastConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(Broadcast) + self.template = broadcast_config_template + + def format(self, node): + params = self._default_config_params(node) + params['in_height'] = node.get_input_variable().shape[0] + params['in_width'] = node.get_input_variable().shape[1] + params['in_chan'] = node.get_input_variable().shape[2] + params['out_height'] = node.get_output_variable().shape[0] + params['out_width'] = node.get_output_variable().shape[1] + params['out_chan'] = node.get_output_variable().shape[2] + + return self.template.format(**params) + + +class BroadcastFunctionTemplate(FunctionCallTemplate): + def __init__(self): + super().__init__(Broadcast, include_header=broadcast_include_list) + self.template = broadcast_function_template + + def format(self, node): + params = self._default_function_params(node) + return self.template.format(**params) + + +def register_broadcast_stream(backend): + # Register the layer types to the layer map + register_layer('Broadcast', Broadcast) + + # Register the optimization passes + backend.register_pass('broadcast_stream', BroadcastStream) + + # Register template passes + backend.register_template(BroadcastConfigTemplate) + backend.register_template(BroadcastFunctionTemplate) + + +class BroadcastStream(OptimizerPass): + def match(self, node): + if isinstance(node, Merge) and not isinstance(node, Concatenate): + inp1 = node.get_input_variable(node.inputs[0]) + inp2 = node.get_input_variable(node.inputs[1]) + return inp1.shape != inp2.shape + else: + return False + + def transform(self, model, node): + if model.config.backend.name not in ['Vivado'] or model.config.get_config_value('IOType') != 'io_stream': + return False + + inp = [node.get_input_variable(inp_name) for inp_name in node.inputs] + + if np.prod(inp[0].shape) > np.prod(inp[1].shape): + idx = 1 + attrs = {'target_shape': inp[0].shape} + else: + idx = 0 + attrs = {'target_shape': inp[1].shape} + + def supported_broadcast(inp_shape, target_shape): + # Must be (H, W, C) + if not len(inp_shape) == 3: + return False + # Supported: (1, 1, C) -> (H, W, C) + if inp_shape[0] == inp_shape[1] == 1 and inp_shape[2] == target_shape[2]: + return True + # Supported: (H, W, 1) -> (H, W, C) + if inp_shape[2] == 1 and inp_shape[0] == target_shape[0] and inp_shape[1] == target_shape[1]: + return True + return False + + brdcst_inp = node.inputs[idx] + inp_shape = node.get_input_variable(brdcst_inp).shape + target_shape = attrs['target_shape'] + if not supported_broadcast(inp_shape, target_shape): + raise RuntimeError( + f'Unsupported broadcast type for stream: {inp_shape} -> {target_shape};' + + 'Only (1, 1, C) -> (H, W, C) and (H, W, 1) -> (H, W, C) currently supported' + ) + brdcst_out = 'broadcast_' + brdcst_inp + brdcst_layer = model.make_node('Broadcast', brdcst_out, attrs, [brdcst_inp].copy()) + model.insert_node(brdcst_layer, before=node, input_idx=idx) + node.inputs[idx] = brdcst_out + + return True diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py index 16992991ee..f1f4918cca 100644 --- a/hls4ml/backends/vivado/passes/transformer_templates.py +++ b/hls4ml/backends/vivado/passes/transformer_templates.py @@ -1,144 +1,149 @@ - -from hls4ml.backends.backend import get_backend -from hls4ml.model.layers import MultiHeadAttention -from hls4ml.backends.template import LayerConfigTemplate, FunctionCallTemplate - 
-#dense layer template -mult_config_template = """struct config{index}_{mNum} : nnet::dense_config {{ - static const unsigned n_in = {n_in}; - static const unsigned n_out = {n_out}; - static const unsigned seq_len = {seq_len}; - static const unsigned strategy = nnet::{strategy}; - static const unsigned reuse_factor = {reuse}; - static const unsigned n_zeros = {nzeros}; - static const unsigned n_nonzeros = {nonzeros}; - static const bool store_weights_in_bram = false; - typedef {accum_t.name} accum_t; - typedef {attention_output_bias_t.name} bias_t; - typedef {attention_output_weight_t.name} weight_t; - typedef ap_{index_t} index_t; - template - using product = nnet::product::{product_type}; -}};\n""" - -#activation template -softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ - static const unsigned n_in = {n_in}; - static const unsigned table_size = {table_size}; - static const unsigned io_type = nnet::{iotype}; - static const unsigned reuse_factor = {reuse}; - static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; - typedef {table_t.name} exp_table_t; - typedef {table_t.name} inv_table_t; - typedef {accum_t.name} accum_t; - static const unsigned inv_range = {inv_range}; - static const unsigned exp_range = {exp_range}; -}};\n""" - - -mha_config_template = """struct config{index} : nnet::multiheadattention_config {{ - typedef {accum_t.name} accum_t; - typedef {attention_output_bias_t.name} bias_t; - typedef {attention_output_weight_t.name} weight_t; - typedef {config_mult_t1} config_mult1; - typedef {config_mult_t2} config_mult2; - typedef {config_activ_t1} softmax_config1; - - static const unsigned num_heads = {num_heads}; - static const unsigned head_dim_key = {head_dim_key}; - static const unsigned head_dim_value = {head_dim_value}; - static const unsigned feature_dim = {feature_dim}; - static const unsigned seq_len = {seq_len}; - - static const unsigned io_type = nnet::{iotype}; - static const unsigned reuse_factor = {reuse}; - static const bool store_weights_in_bram = false; -}};\n""" - - -mha_function_template = 'nnet::multiheadattention<{input_t}, {output_t}, {config}>({input_q}, {input_kv}, {output}, {w_o}, {b_o}, {w_k}, {b_k}, {w_q}, {b_q}, {w_v}, {b_v});' - -mha_include_list = ['nnet_utils/nnet_multiheadattention.h'] - -class MhaConfigTemplate(LayerConfigTemplate): - def __init__(self): - super().__init__(MultiHeadAttention) - self.template = mha_config_template - self.mult1_template = mult_config_template - self.mult2_template = mult_config_template - self.activ1_template = softmax_config_template - - def format(self, node): - - params = self._default_config_params(node) - params['num_heads'] = node.get_attr('num_heads') - params['head_dim_key'] = node.get_attr('head_dim_key') - params['head_dim_value'] = node.get_attr('head_dim_value') - params['feature_dim'] = node.get_attr('feature_dim') - params['seq_len'] = node.get_attr('seq_len') - params['config_mult_t1'] = 'config{}_1'.format(node.index) - params['config_mult_t2'] = 'config{}_2'.format(node.index) - params['config_activ_t1'] = '{}_config{}'.format("softmax", node.index) - params['strategy'] = node.get_attr('strategy') - mha_config = self.template.format(**params) - - mult_params1 = self._default_config_params(node) - mult_params1['strategy'] = 'latency' - mult_params1['mNum'] = '1' - mult_params1['n_in'] = node.get_attr('feature_dim') - mult_params1['n_out'] = node.get_attr('head_dim_key') - mult_params1['seq_len'] = 1 - 
mult_params1['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('query_weight').type.precision) - mult_params1['reuse'] = params['reuse'] - mult_params1['index'] = str(node.index) - mult_params1['nzeros'] = 0 - mult_params1['nonzeros'] = params['feature_dim']*params['num_heads']*params['head_dim_key'] - mult_config1 = self.mult1_template.format(**mult_params1) - - mult_params2 = self._default_config_params(node) - mult_params2['strategy'] = 'latency' - mult_params2['mNum'] = '2' - mult_params2['n_in'] = node.get_attr('head_dim_value') * node.get_attr('num_heads') - mult_params2['n_out'] = node.get_attr('feature_dim') - mult_params2['seq_len'] = 1 - mult_params2['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('attention_output_weight').type.precision) - mult_params2['reuse'] = params['reuse'] - mult_params2['index'] = str(node.index) - mult_params2['nzeros'] = 0 - mult_params2['nonzeros'] = params['feature_dim']*params['num_heads']*params['head_dim_key'] - mult_config2 = self.mult2_template.format(**mult_params2) - - act_params = self._default_config_params(node) - act_params['n_in'] = node.get_attr('seq_len') - act_params['type'] = 'softmax' - act_params['implementation'] = 'legacy' #in MHA: latency,stable not work, legacy works - act_config = self.activ1_template.format(**act_params) - - return mult_config1 + '\n' + mult_config2 + '\n' + act_config + '\n' + mha_config - -class MhaFunctionTemplate(FunctionCallTemplate): - def __init__(self): - super().__init__(MultiHeadAttention, include_header=mha_include_list) - self.template = mha_function_template - - def format(self, node): - params = {} - params.update(node.attributes) - params['config'] = 'config{}'.format(node.index) - params['input_t'] = node.get_input_variable().type.name - params['output_t'] = node.get_output_variable().type.name - - params['input_q'] = node.model.get_layer_output_variable(node.inputs[0]).name - params['input_kv'] = node.model.get_layer_output_variable(node.inputs[1]).name - params['output'] = node.get_output_variable().name - params['w_o'] = node.get_weights('attention_output_weight').name - params['b_o'] = node.get_weights('attention_output_bias').name - params['w_k'] = node.get_weights('key_weight').name - params['b_k'] = node.get_weights('key_bias').name - params['w_q'] = node.get_weights('query_weight').name - params['b_q'] = node.get_weights('query_bias').name - params['w_v'] = node.get_weights('value_weight').name - params['b_v'] = node.get_weights('value_bias').name - - return self.template.format(**params) - +from hls4ml.backends.backend import get_backend +from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate +from hls4ml.model.layers import MultiHeadAttention + +# dense layer template +mult_config_template = """struct config{index}_{mNum} : nnet::dense_config {{ + static const unsigned n_in = {n_in}; + static const unsigned n_out = {n_out}; + static const unsigned seq_len = {seq_len}; + static const unsigned strategy = nnet::{strategy}; + static const unsigned reuse_factor = {reuse}; + static const unsigned n_zeros = {nzeros}; + static const unsigned n_nonzeros = {nonzeros}; + static const bool store_weights_in_bram = false; + typedef {accum_t.name} accum_t; + typedef {attention_output_bias_t.name} bias_t; + typedef {attention_output_weight_t.name} weight_t; + typedef ap_{index_t} index_t; + template + using product = nnet::product::{product_type}; 
+}};\n""" + +# activation template +softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{ + static const unsigned n_in = {n_in}; + static const unsigned table_size = {table_size}; + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation}; + typedef {table_t.name} exp_table_t; + typedef {table_t.name} inv_table_t; + typedef {accum_t.name} accum_t; + static const unsigned inv_range = {inv_range}; + static const unsigned exp_range = {exp_range}; +}};\n""" + + +mha_config_template = """struct config{index} : nnet::multiheadattention_config {{ + typedef {accum_t.name} accum_t; + typedef {attention_output_bias_t.name} bias_t; + typedef {attention_output_weight_t.name} weight_t; + typedef {config_mult_t1} config_mult1; + typedef {config_mult_t2} config_mult2; + typedef {config_activ_t1} softmax_config1; + + static const unsigned num_heads = {num_heads}; + static const unsigned head_dim_key = {head_dim_key}; + static const unsigned head_dim_value = {head_dim_value}; + static const unsigned feature_dim = {feature_dim}; + static const unsigned seq_len = {seq_len}; + + static const unsigned io_type = nnet::{iotype}; + static const unsigned reuse_factor = {reuse}; + static const bool store_weights_in_bram = false; +}};\n""" + + +mha_function_template = """nnet::multiheadattention<{input_t}, {output_t}, {config}>({input_q}, {input_kv}, + {output}, {w_o}, {b_o}, {w_k}, {b_k}, {w_q}, {b_q}, {w_v}, {b_v});""" + +mha_include_list = ['nnet_utils/nnet_multiheadattention.h'] + + +class MhaConfigTemplate(LayerConfigTemplate): + def __init__(self): + super().__init__(MultiHeadAttention) + self.template = mha_config_template + self.mult1_template = mult_config_template + self.mult2_template = mult_config_template + self.activ1_template = softmax_config_template + + def format(self, node): + + params = self._default_config_params(node) + params['num_heads'] = node.get_attr('num_heads') + params['head_dim_key'] = node.get_attr('head_dim_key') + params['head_dim_value'] = node.get_attr('head_dim_value') + params['feature_dim'] = node.get_attr('feature_dim') + params['seq_len'] = node.get_attr('seq_len') + params['config_mult_t1'] = f'config{node.index}_1' + params['config_mult_t2'] = f'config{node.index}_2' + params['config_activ_t1'] = '{}_config{}'.format("softmax", node.index) + params['strategy'] = node.get_attr('strategy') + mha_config = self.template.format(**params) + + mult_params1 = self._default_config_params(node) + mult_params1['strategy'] = 'latency' + mult_params1['mNum'] = '1' + mult_params1['n_in'] = node.get_attr('feature_dim') + mult_params1['n_out'] = node.get_attr('head_dim_key') + mult_params1['seq_len'] = 1 + mult_params1['product_type'] = get_backend('vivado').product_type( + node.get_input_variable().type.precision, node.get_weights('query_weight').type.precision + ) + mult_params1['reuse'] = params['reuse'] + mult_params1['index'] = str(node.index) + mult_params1['nzeros'] = 0 + mult_params1['nonzeros'] = params['feature_dim'] * params['num_heads'] * params['head_dim_key'] + mult_config1 = self.mult1_template.format(**mult_params1) + + mult_params2 = self._default_config_params(node) + mult_params2['strategy'] = 'latency' + mult_params2['mNum'] = '2' + mult_params2['n_in'] = node.get_attr('head_dim_value') * node.get_attr('num_heads') + mult_params2['n_out'] = node.get_attr('feature_dim') + mult_params2['seq_len'] = 1 
+        mult_params2['product_type'] = get_backend('vivado').product_type(
+            node.get_input_variable().type.precision, node.get_weights('attention_output_weight').type.precision
+        )
+        mult_params2['reuse'] = params['reuse']
+        mult_params2['index'] = str(node.index)
+        mult_params2['nzeros'] = 0
+        mult_params2['nonzeros'] = params['feature_dim'] * params['num_heads'] * params['head_dim_value']  # output projection weights
+        mult_config2 = self.mult2_template.format(**mult_params2)
+
+        act_params = self._default_config_params(node)
+        act_params['n_in'] = node.get_attr('seq_len')
+        act_params['type'] = 'softmax'
+        act_params['implementation'] = 'legacy'  # in MHA the 'latency' and 'stable' softmax implementations do not work; only 'legacy' does
+        act_config = self.activ1_template.format(**act_params)
+
+        return mult_config1 + '\n' + mult_config2 + '\n' + act_config + '\n' + mha_config
+
+
+class MhaFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(MultiHeadAttention, include_header=mha_include_list)
+        self.template = mha_function_template
+
+    def format(self, node):
+        params = {}
+        params.update(node.attributes)
+        params['config'] = f'config{node.index}'
+        params['input_t'] = node.get_input_variable().type.name
+        params['output_t'] = node.get_output_variable().type.name
+
+        params['input_q'] = node.model.get_layer_output_variable(node.inputs[0]).name
+        params['input_kv'] = node.model.get_layer_output_variable(node.inputs[1]).name
+        params['output'] = node.get_output_variable().name
+        params['w_o'] = node.get_weights('attention_output_weight').name
+        params['b_o'] = node.get_weights('attention_output_bias').name
+        params['w_k'] = node.get_weights('key_weight').name
+        params['b_k'] = node.get_weights('key_bias').name
+        params['w_q'] = node.get_weights('query_weight').name
+        params['b_q'] = node.get_weights('query_bias').name
+        params['w_v'] = node.get_weights('value_weight').name
+        params['b_v'] = node.get_weights('value_bias').name
+
+        return self.template.format(**params)
diff --git a/hls4ml/backends/vivado_accelerator/supported_boards.json b/hls4ml/backends/vivado_accelerator/supported_boards.json
index 1279ec22d0..e59f20cc18 100644
--- a/hls4ml/backends/vivado_accelerator/supported_boards.json
+++ b/hls4ml/backends/vivado_accelerator/supported_boards.json
@@ -1,42 +1,42 @@
-{
-    "pynq-z2": {
-        "part": "xc7z020clg400-1",
-        "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
-        "c_drivers": {}
-    },
-    "zcu102": {
-        "part": "xczu9eg-ffvb1156-2-e",
-        "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
-        "c_drivers": {}
-    },
-    "alveo-u50": {
-        "part": "xcu50-fsvh2104-2-e",
-        "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
-        "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
-        "c_drivers": {}
-    },
-    "alveo-u250": {
-        "part": "xcu250-figd2104-2L-e",
-        "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
-        "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
-        "c_drivers": {}
-    },
-    "alveo-u200": {
-        "part": "xcu200-fsgd2104-2-e",
-        "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
-        "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
-        "c_drivers": {}
-    },
-    "alveo-u280": {
-        "part": "xcu280-fsvh2892-2L-e",
-        "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream":
"axi_stream_driver.py"}, - "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, - "c_drivers": {} - } -} +{ + "pynq-z2": { + "part": "xc7z020clg400-1", + "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "c_drivers": {} + }, + "zcu102": { + "part": "xczu9eg-ffvb1156-2-e", + "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "c_drivers": {} + }, + "alveo-u50": { + "part": "xcu50-fsvh2104-2-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u250": { + "part": "xcu250-figd2104-2L-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u200": { + "part": "xcu200-fsgd2104-2-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u280": { + "part": "xcu280-fsvh2892-2L-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + } +} diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index e8da1fc102..003eb111d9 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -1,165 +1,167 @@ -from hls4ml.converters.keras_to_hls import get_weights_data, keras_handler, parse_default_keras_layer -from hls4ml.model.quantizers import BinaryQuantizer, TernaryQuantizer -from hls4ml.model.types import IntegerPrecisionType - - -@keras_handler('InputLayer') -def parse_input_layer(keras_layer, input_names, input_shapes, data_reader): - assert keras_layer['class_name'] == 'InputLayer' - - layer = parse_default_keras_layer(keras_layer, input_names) - - layer['input_shape'] = keras_layer['config']['batch_input_shape'][1:] - - dtype = keras_layer['config']['dtype'] - if dtype.startswith('int') or dtype.startswith('uint'): - layer['type_name'] = 'integer_input_t' - width = int(dtype[dtype.index('int') + 3 :]) - signed = not dtype.startswith('u') - layer['precision'] = IntegerPrecisionType(width=width, signed=signed) - # elif bool, q[u]int, ... 
- - output_shape = keras_layer['config']['batch_input_shape'] - - return layer, output_shape - - -dense_layers = ['Dense', 'BinaryDense', 'TernaryDense'] - - -@keras_handler(*dense_layers) -def parse_dense_layer(keras_layer, input_names, input_shapes, data_reader): - assert 'Dense' in keras_layer['class_name'] - - layer = parse_default_keras_layer(keras_layer, input_names) - - layer['weight_data'], layer['bias_data'] = get_weights_data(data_reader, layer['name'], ['kernel', 'bias']) - layer['n_in'] = layer['weight_data'].shape[0] - layer['n_out'] = layer['weight_data'].shape[1] - if 'Binary' in layer['class_name']: - layer['weight_quantizer'] = BinaryQuantizer(bits=2) - layer['bias_quantizer'] = BinaryQuantizer(bits=2) - elif 'Ternary' in layer['class_name']: - layer['weight_quantizer'] = TernaryQuantizer() - layer['bias_quantizer'] = TernaryQuantizer() - else: - layer['weight_quantizer'] = None - layer['bias_quantizer'] = None - output_shape = input_shapes[0][:] - output_shape[-1] = layer['n_out'] - if len(input_shapes[0])==3: - layer['seq_len'] = output_shape[-2] - else: layer['seq_len'] = 1 - - return layer, output_shape - - -activation_layers = ['Activation', 'LeakyReLU', 'ThresholdedReLU', 'ELU', 'PReLU', 'Softmax', 'ReLU'] - - -@keras_handler(*activation_layers) -def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader): - assert keras_layer['class_name'] in activation_layers - - layer = parse_default_keras_layer(keras_layer, input_names) - - if layer['class_name'] != 'Activation': - layer['activation'] = layer['class_name'] - if layer['class_name'] == 'LeakyReLU': - layer['activ_param'] = keras_layer['config'].get('alpha', 0.3) - elif layer['class_name'] == 'ThresholdedReLU': - layer['activ_param'] = keras_layer['config'].get('theta', 1.0) - elif layer['class_name'] == 'ELU': - layer['activ_param'] = keras_layer['config'].get('alpha', 1.0) - elif layer['class_name'] == 'ReLU': - layer['class_name'] = 'Activation' - elif layer['class_name'] == 'PReLU': - layer['alpha_data'] = get_weights_data(data_reader, layer['name'], 'alpha') - - if layer['class_name'] == 'Activation' and layer['activation'] == 'softmax': - layer['class_name'] = 'Softmax' - if layer['class_name'] == 'Activation' and layer['activation'] == 'hard_sigmoid': - layer['class_name'] = 'HardActivation' - if layer['class_name'] == 'Softmax': - layer['axis'] = keras_layer['config'].get('axis', -1) - - return layer, [shape for shape in input_shapes[0]] - - -@keras_handler('BatchNormalization') -def parse_batchnorm_layer(keras_layer, input_names, input_shapes, data_reader): - assert 'BatchNormalization' in keras_layer['class_name'] or 'QConv2DBatchnorm' in keras_layer['class_name'] - - layer = parse_default_keras_layer(keras_layer, input_names) - - in_size = 1 - for dim in input_shapes[0][1:]: - in_size *= dim - layer['n_in'] = in_size - layer['n_out'] = layer['n_in'] - if len(input_shapes[0]) == 2: - layer['n_filt'] = -1 - elif len(input_shapes[0]) == 3: - layer['n_filt'] = input_shapes[0][2] - elif len(input_shapes[0]) == 4: - layer['n_filt'] = input_shapes[0][3] - - layer['use_gamma'] = keras_layer['config']['scale'] - if layer['use_gamma']: - layer['gamma_data'] = get_weights_data(data_reader, layer['name'], 'gamma') - else: - layer['gamma_data'] = 1 - - layer['use_beta'] = keras_layer['config']['center'] - if layer['use_beta']: - layer['beta_data'] = get_weights_data(data_reader, layer['name'], 'beta') - else: - layer['beta_data'] = 0 - - layer['mean_data'], layer['variance_data'] = 
get_weights_data( - data_reader, layer['name'], ['moving_mean', 'moving_variance'] - ) - - return layer, [shape for shape in input_shapes[0]] - - -@keras_handler('LayerNormalization') -def parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader, config): - assert('LayerNormalization' in keras_layer['class_name']) - - layer = parse_default_keras_layer(keras_layer, input_names) - - in_size = 1 - for dim in input_shapes[0][1:]: - in_size *= dim - - layer['axis'] = keras_layer['config']['axis'] if (keras_layer['config']['axis'][0]==2) else False - if layer['axis'] is False: - raise Exception('assigning the axis is not currently supported by hls4ml, only axis 2 is supported') - - if not((len(input_shapes[0])) == 3 ): - raise Exception('input size is not currently supported by hls4ml, only dim3 is supported') - if len(input_shapes[0])==3: - layer['seq_len'] = input_shapes[0][-2] - else: layer['seq_len'] = 1 - layer['n_in'] = in_size - layer['n_out'] = layer['n_in'] - - return layer, [shape for shape in input_shapes[0]] - - -@keras_handler('Embedding') -def parse_embedding_layer(keras_layer, input_names, input_shapes, data_reader): - assert 'Embedding' in keras_layer['class_name'] - - layer = parse_default_keras_layer(keras_layer, input_names) - - layer['n_in'] = input_shapes[0][1] - layer['vocab_size'] = keras_layer['config']['input_dim'] - layer['n_out'] = keras_layer['config']['output_dim'] - - layer['embeddings_data'] = get_weights_data(data_reader, layer['name'], 'embeddings') - - output_shape = input_shapes[0] + [layer['n_out']] - - return layer, output_shape +from hls4ml.converters.keras_to_hls import get_weights_data, keras_handler, parse_default_keras_layer +from hls4ml.model.quantizers import BinaryQuantizer, TernaryQuantizer +from hls4ml.model.types import IntegerPrecisionType + + +@keras_handler('InputLayer') +def parse_input_layer(keras_layer, input_names, input_shapes, data_reader): + assert keras_layer['class_name'] == 'InputLayer' + + layer = parse_default_keras_layer(keras_layer, input_names) + + layer['input_shape'] = keras_layer['config']['batch_input_shape'][1:] + + dtype = keras_layer['config']['dtype'] + if dtype.startswith('int') or dtype.startswith('uint'): + layer['type_name'] = 'integer_input_t' + width = int(dtype[dtype.index('int') + 3 :]) + signed = not dtype.startswith('u') + layer['precision'] = IntegerPrecisionType(width=width, signed=signed) + # elif bool, q[u]int, ... 
+ + output_shape = keras_layer['config']['batch_input_shape'] + + return layer, output_shape + + +dense_layers = ['Dense', 'BinaryDense', 'TernaryDense'] + + +@keras_handler(*dense_layers) +def parse_dense_layer(keras_layer, input_names, input_shapes, data_reader): + assert 'Dense' in keras_layer['class_name'] + + layer = parse_default_keras_layer(keras_layer, input_names) + + layer['weight_data'], layer['bias_data'] = get_weights_data(data_reader, layer['name'], ['kernel', 'bias']) + layer['n_in'] = layer['weight_data'].shape[0] + layer['n_out'] = layer['weight_data'].shape[1] + if 'Binary' in layer['class_name']: + layer['weight_quantizer'] = BinaryQuantizer(bits=2) + layer['bias_quantizer'] = BinaryQuantizer(bits=2) + elif 'Ternary' in layer['class_name']: + layer['weight_quantizer'] = TernaryQuantizer() + layer['bias_quantizer'] = TernaryQuantizer() + else: + layer['weight_quantizer'] = None + layer['bias_quantizer'] = None + output_shape = input_shapes[0][:] + output_shape[-1] = layer['n_out'] + if len(input_shapes[0]) == 3: + layer['seq_len'] = output_shape[-2] + else: + layer['seq_len'] = 1 + + return layer, output_shape + + +activation_layers = ['Activation', 'LeakyReLU', 'ThresholdedReLU', 'ELU', 'PReLU', 'Softmax', 'ReLU'] + + +@keras_handler(*activation_layers) +def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader): + assert keras_layer['class_name'] in activation_layers + + layer = parse_default_keras_layer(keras_layer, input_names) + + if layer['class_name'] != 'Activation': + layer['activation'] = layer['class_name'] + if layer['class_name'] == 'LeakyReLU': + layer['activ_param'] = keras_layer['config'].get('alpha', 0.3) + elif layer['class_name'] == 'ThresholdedReLU': + layer['activ_param'] = keras_layer['config'].get('theta', 1.0) + elif layer['class_name'] == 'ELU': + layer['activ_param'] = keras_layer['config'].get('alpha', 1.0) + elif layer['class_name'] == 'ReLU': + layer['class_name'] = 'Activation' + elif layer['class_name'] == 'PReLU': + layer['alpha_data'] = get_weights_data(data_reader, layer['name'], 'alpha') + + if layer['class_name'] == 'Activation' and layer['activation'] == 'softmax': + layer['class_name'] = 'Softmax' + if layer['class_name'] == 'Activation' and layer['activation'] == 'hard_sigmoid': + layer['class_name'] = 'HardActivation' + if layer['class_name'] == 'Softmax': + layer['axis'] = keras_layer['config'].get('axis', -1) + + return layer, [shape for shape in input_shapes[0]] + + +@keras_handler('BatchNormalization') +def parse_batchnorm_layer(keras_layer, input_names, input_shapes, data_reader): + assert 'BatchNormalization' in keras_layer['class_name'] or 'QConv2DBatchnorm' in keras_layer['class_name'] + + layer = parse_default_keras_layer(keras_layer, input_names) + + in_size = 1 + for dim in input_shapes[0][1:]: + in_size *= dim + layer['n_in'] = in_size + layer['n_out'] = layer['n_in'] + if len(input_shapes[0]) == 2: + layer['n_filt'] = -1 + elif len(input_shapes[0]) == 3: + layer['n_filt'] = input_shapes[0][2] + elif len(input_shapes[0]) == 4: + layer['n_filt'] = input_shapes[0][3] + + layer['use_gamma'] = keras_layer['config']['scale'] + if layer['use_gamma']: + layer['gamma_data'] = get_weights_data(data_reader, layer['name'], 'gamma') + else: + layer['gamma_data'] = 1 + + layer['use_beta'] = keras_layer['config']['center'] + if layer['use_beta']: + layer['beta_data'] = get_weights_data(data_reader, layer['name'], 'beta') + else: + layer['beta_data'] = 0 + + layer['mean_data'], layer['variance_data'] = 
get_weights_data(
+        data_reader, layer['name'], ['moving_mean', 'moving_variance']
+    )
+
+    return layer, [shape for shape in input_shapes[0]]
+
+
+@keras_handler('LayerNormalization')
+def parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader, config):
+    assert 'LayerNormalization' in keras_layer['class_name']
+
+    layer = parse_default_keras_layer(keras_layer, input_names)
+
+    in_size = 1
+    for dim in input_shapes[0][1:]:
+        in_size *= dim
+
+    layer['axis'] = keras_layer['config']['axis'] if (keras_layer['config']['axis'][0] == 2) else False
+    if layer['axis'] is False:
+        raise Exception('assigning the axis is not currently supported by hls4ml; only axis 2 (the feature axis) is supported')
+
+    if len(input_shapes[0]) != 3:
+        raise Exception('hls4ml currently supports LayerNormalization only for rank-3 (batch, sequence, feature) inputs')
+    layer['seq_len'] = input_shapes[0][-2]
+    layer['n_in'] = in_size
+    layer['n_out'] = layer['n_in']
+
+    return layer, [shape for shape in input_shapes[0]]
+
+
+@keras_handler('Embedding')
+def parse_embedding_layer(keras_layer, input_names, input_shapes, data_reader):
+    assert 'Embedding' in keras_layer['class_name']
+
+    layer = parse_default_keras_layer(keras_layer, input_names)
+
+    layer['n_in'] = input_shapes[0][1]
+    layer['vocab_size'] = keras_layer['config']['input_dim']
+    layer['n_out'] = keras_layer['config']['output_dim']
+
+    layer['embeddings_data'] = get_weights_data(data_reader, layer['name'], 'embeddings')
+
+    output_shape = input_shapes[0] + [layer['n_out']]
+
+    return layer, output_shape
diff --git a/hls4ml/converters/keras/multiheadattention.py b/hls4ml/converters/keras/multiheadattention.py
index 2b1e6322ce..6af8114006 100644
--- a/hls4ml/converters/keras/multiheadattention.py
+++ b/hls4ml/converters/keras/multiheadattention.py
@@ -1,38 +1,40 @@
-from hls4ml.converters.keras_to_hls import parse_default_keras_layer
-from hls4ml.converters.keras_to_hls import keras_handler
-
-@keras_handler('MultiHeadAttention')
-def parse_mutiheadattention_layer(keras_layer, input_names, input_shapes, data_reader, config):
-    # assume input_shapes is: [[None, seq, dim]]
-    assert('MultiHeadAttention' in keras_layer['class_name'])
-    assert(input_shapes[0]==keras_layer['config']['query_shape'])
-
-    layer = parse_default_keras_layer(keras_layer, input_names)
-
-    layer['num_heads'] = keras_layer['config']['num_heads']
-    layer['head_dim_key'] = keras_layer['config']['key_dim']
-    layer['head_dim_value'] = keras_layer['config']['value_dim']
-    layer['query_shape'] = keras_layer['config']['query_shape']
-    layer['key_shape'] = keras_layer['config']['key_shape']
-    layer['value_shape'] = keras_layer['config']['value_shape']
-    layer['feature_dim'] = layer['query_shape'][-1]
-    layer['seq_len'] = layer['query_shape'][-2]
-
-    if keras_layer['config']['output_shape']:
-        # output_shape = keras_layer['config']['output_shape']
-        # output_shape = (layer['query_shape'][:2]).extend(out_shape)
-        raise Exception('hls4ml does not support a defined output shape, the output shape must equal to the query shape')
-    else:  # by default output shape in config is False, and thus select the output shape equal query shape
-        output_shape = layer['query_shape']
-
-    layer['attention_axes'] = keras_layer['config']['attention_axes'] if (keras_layer['config']['attention_axes'][0]==1) else False
-    if layer['attention_axes'] is False:
-        raise Exception('assigning the attention_axe is not currently supported by hls4ml')
-
-    if not((len(layer['query_shape'])) == 3 and (len(layer['query_shape'])) == 3 and (len(layer['query_shape'])) == 3):
-        raise Exception('muti-dimension of feature dim is not currently supported by hls4ml')
-
-    attn_scores_rank = 4
-    layer['softmax_axis'] = list(range(attn_scores_rank - len(layer['attention_axes']), attn_scores_rank ))
-
-    return layer, output_shape
\ No newline at end of file
+from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer
+
+
+@keras_handler('MultiHeadAttention')
+def parse_mutiheadattention_layer(keras_layer, input_names, input_shapes, data_reader, config):
+    # assume input_shapes is: [[None, seq, dim]]
+    assert 'MultiHeadAttention' in keras_layer['class_name']
+    assert input_shapes[0] == keras_layer['config']['query_shape']
+
+    layer = parse_default_keras_layer(keras_layer, input_names)
+
+    layer['num_heads'] = keras_layer['config']['num_heads']
+    layer['head_dim_key'] = keras_layer['config']['key_dim']
+    layer['head_dim_value'] = keras_layer['config']['value_dim']
+    layer['query_shape'] = keras_layer['config']['query_shape']
+    layer['key_shape'] = keras_layer['config']['key_shape']
+    layer['value_shape'] = keras_layer['config']['value_shape']
+    layer['feature_dim'] = layer['query_shape'][-1]
+    layer['seq_len'] = layer['query_shape'][-2]
+
+    if keras_layer['config']['output_shape']:
+        # output_shape = keras_layer['config']['output_shape']
+        # output_shape = (layer['query_shape'][:2]).extend(out_shape)
+        raise Exception('hls4ml does not support a user-defined output shape; the output shape must be equal to the query shape')
+    else:  # by default output_shape in the config is False, so the output shape is taken to be the query shape
+        output_shape = layer['query_shape']
+
+    layer['attention_axes'] = (
+        keras_layer['config']['attention_axes'] if (keras_layer['config']['attention_axes'][0] == 1) else False
+    )
+    if layer['attention_axes'] is False:
+        raise Exception('assigning attention_axes is not currently supported by hls4ml; only attention over axis 1 is supported')
+
+    if not (len(layer['query_shape']) == 3 and len(layer['key_shape']) == 3 and len(layer['value_shape']) == 3):
+        raise Exception('multi-dimensional feature axes are not currently supported by hls4ml; query, key and value must be rank-3')
+
+    attn_scores_rank = 4
+    layer['softmax_axis'] = list(range(attn_scores_rank - len(layer['attention_axes']), attn_scores_rank))
+
+    return layer, output_shape
diff --git a/hls4ml/converters/keras/pooling.py b/hls4ml/converters/keras/pooling.py
index 14d6a9236a..b1030168b4 100644
--- a/hls4ml/converters/keras/pooling.py
+++ b/hls4ml/converters/keras/pooling.py
@@ -1,91 +1,91 @@
-from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer
-from hls4ml.converters.utils import compute_padding_1d, compute_padding_2d, parse_data_format
-
-pooling_layers = ['MaxPooling1D', 'MaxPooling2D', 'AveragePooling1D', 'AveragePooling2D']
-
-
-@keras_handler(*pooling_layers)
-def parse_pooling_layer(keras_layer, input_names, input_shapes, data_reader):
-    assert 'Pooling' in keras_layer['class_name']
-
-    layer = parse_default_keras_layer(keras_layer, input_names)
-
-    if int(layer['class_name'][-2]) == 1:
-        (layer['n_in'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format'])
-
-        layer['pool_width'] = keras_layer['config']['pool_size'][0]
-        layer['stride_width'] = keras_layer['config']['strides'][0]
-
-        (layer['n_out'], layer['pad_left'], layer['pad_right']) = compute_padding_1d(
-            keras_layer['config']['padding'], layer['n_in'], layer['stride_width'], layer['pool_width']
-        )
-
-        if layer['data_format'] == 'channels_last':
-
output_shape = [input_shapes[0][0], layer['n_out'], layer['n_filt']] - elif layer['data_format'] == 'channels_first': - output_shape = [input_shapes[0][0], layer['n_filt'], layer['n_out']] - elif int(layer['class_name'][-2]) == 2: - (layer['in_height'], layer['in_width'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) - - layer['stride_height'] = keras_layer['config']['strides'][0] - layer['stride_width'] = keras_layer['config']['strides'][1] - layer['pool_height'] = keras_layer['config']['pool_size'][0] - layer['pool_width'] = keras_layer['config']['pool_size'][1] - - ( - layer['out_height'], - layer['out_width'], - layer['pad_top'], - layer['pad_bottom'], - layer['pad_left'], - layer['pad_right'], - ) = compute_padding_2d( - keras_layer['config']['padding'], - layer['in_height'], - layer['in_width'], - layer['stride_height'], - layer['stride_width'], - layer['pool_height'], - layer['pool_width'], - ) - - if layer['data_format'] == 'channels_last': - output_shape = [input_shapes[0][0], layer['out_height'], layer['out_width'], layer['n_filt']] - elif layer['data_format'] == 'channels_first': - output_shape = [input_shapes[0][0], layer['n_filt'], layer['out_height'], layer['out_width']] - - return layer, output_shape - - -global_pooling_layers = ['GlobalMaxPooling1D', 'GlobalMaxPooling2D', 'GlobalAveragePooling1D', 'GlobalAveragePooling2D'] - - -@keras_handler(*global_pooling_layers) -def parse_global_pooling_layer(keras_layer, input_names, input_shapes, data_reader): - assert 'Pooling' in keras_layer['class_name'] - - layer = parse_default_keras_layer(keras_layer, input_names) - layer['keepdims'] = keras_layer['config']['keepdims'] - - if int(layer['class_name'][-2]) == 1: - (layer['n_in'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) - - if layer['keepdims']: - if layer['data_format'] == 'channels_last': - output_shape = [input_shapes[0][0], 1, layer['n_filt']] - elif layer['data_format'] == 'channels_first': - output_shape = [input_shapes[0][0], layer['n_filt'], 1] - else: - output_shape = [input_shapes[0][0], layer['n_filt']] - elif int(layer['class_name'][-2]) == 2: - (layer['in_height'], layer['in_width'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) - - if layer['keepdims']: - if layer['data_format'] == 'channels_last': - output_shape = [input_shapes[0][0], 1, 1, layer['n_filt']] - elif layer['data_format'] == 'channels_first': - output_shape = [input_shapes[0][0], layer['n_filt'], 1, 1] - else: - output_shape = [input_shapes[0][0], layer['n_filt']] - - return layer, output_shape +from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer +from hls4ml.converters.utils import compute_padding_1d, compute_padding_2d, parse_data_format + +pooling_layers = ['MaxPooling1D', 'MaxPooling2D', 'AveragePooling1D', 'AveragePooling2D'] + + +@keras_handler(*pooling_layers) +def parse_pooling_layer(keras_layer, input_names, input_shapes, data_reader): + assert 'Pooling' in keras_layer['class_name'] + + layer = parse_default_keras_layer(keras_layer, input_names) + + if int(layer['class_name'][-2]) == 1: + (layer['n_in'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) + + layer['pool_width'] = keras_layer['config']['pool_size'][0] + layer['stride_width'] = keras_layer['config']['strides'][0] + + (layer['n_out'], layer['pad_left'], layer['pad_right']) = compute_padding_1d( + keras_layer['config']['padding'], layer['n_in'], layer['stride_width'], 
layer['pool_width'] + ) + + if layer['data_format'] == 'channels_last': + output_shape = [input_shapes[0][0], layer['n_out'], layer['n_filt']] + elif layer['data_format'] == 'channels_first': + output_shape = [input_shapes[0][0], layer['n_filt'], layer['n_out']] + elif int(layer['class_name'][-2]) == 2: + (layer['in_height'], layer['in_width'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) + + layer['stride_height'] = keras_layer['config']['strides'][0] + layer['stride_width'] = keras_layer['config']['strides'][1] + layer['pool_height'] = keras_layer['config']['pool_size'][0] + layer['pool_width'] = keras_layer['config']['pool_size'][1] + + ( + layer['out_height'], + layer['out_width'], + layer['pad_top'], + layer['pad_bottom'], + layer['pad_left'], + layer['pad_right'], + ) = compute_padding_2d( + keras_layer['config']['padding'], + layer['in_height'], + layer['in_width'], + layer['stride_height'], + layer['stride_width'], + layer['pool_height'], + layer['pool_width'], + ) + + if layer['data_format'] == 'channels_last': + output_shape = [input_shapes[0][0], layer['out_height'], layer['out_width'], layer['n_filt']] + elif layer['data_format'] == 'channels_first': + output_shape = [input_shapes[0][0], layer['n_filt'], layer['out_height'], layer['out_width']] + + return layer, output_shape + + +global_pooling_layers = ['GlobalMaxPooling1D', 'GlobalMaxPooling2D', 'GlobalAveragePooling1D', 'GlobalAveragePooling2D'] + + +@keras_handler(*global_pooling_layers) +def parse_global_pooling_layer(keras_layer, input_names, input_shapes, data_reader): + assert 'Pooling' in keras_layer['class_name'] + + layer = parse_default_keras_layer(keras_layer, input_names) + layer['keepdims'] = keras_layer['config']['keepdims'] + + if int(layer['class_name'][-2]) == 1: + (layer['n_in'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) + + if layer['keepdims']: + if layer['data_format'] == 'channels_last': + output_shape = [input_shapes[0][0], 1, layer['n_filt']] + elif layer['data_format'] == 'channels_first': + output_shape = [input_shapes[0][0], layer['n_filt'], 1] + else: + output_shape = [input_shapes[0][0], layer['n_filt']] + elif int(layer['class_name'][-2]) == 2: + (layer['in_height'], layer['in_width'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) + + if layer['keepdims']: + if layer['data_format'] == 'channels_last': + output_shape = [input_shapes[0][0], 1, 1, layer['n_filt']] + elif layer['data_format'] == 'channels_first': + output_shape = [input_shapes[0][0], layer['n_filt'], 1, 1] + else: + output_shape = [input_shapes[0][0], layer['n_filt']] + + return layer, output_shape diff --git a/hls4ml/converters/keras/qkeras.py b/hls4ml/converters/keras/qkeras.py index 7357d95aed..d101f7972f 100644 --- a/hls4ml/converters/keras/qkeras.py +++ b/hls4ml/converters/keras/qkeras.py @@ -1,7 +1,8 @@ from qkeras.quantizers import get_quantizer from hls4ml.converters.keras.convolution import parse_conv1d_layer, parse_conv2d_layer -from hls4ml.converters.keras.core import parse_batchnorm_layer, parse_dense_layer +from hls4ml.converters.keras.core import parse_batchnorm_layer, parse_dense_layer, parse_layernorm_layer +from hls4ml.converters.keras.multiheadattention import parse_mutiheadattention_layer from hls4ml.converters.keras.recurrent import parse_rnn_layer from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer from hls4ml.model.quantizers import QKerasBinaryQuantizer, QKerasPO2Quantizer, 
QKerasQuantizer @@ -98,6 +99,7 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader) 'quantized_bits', 'binary', 'ternary', + 'quantized_softmax', ] layer = parse_default_keras_layer(keras_layer, input_names) @@ -157,7 +159,10 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader) else: layer['class_name'] = 'Activation' layer['activation'] = activation_config['class_name'].replace('quantized_', '') - + if activation_config['class_name'] == 'quantized_softmax': + # activation_config['class_name'] = 'softmax' + layer['class_name'] = 'Softmax' + layer['axis'] = keras_layer['config'].get('axis', -1) layer['activation_quantizer'] = activation_config return layer, [shape for shape in input_shapes[0]] @@ -182,3 +187,31 @@ def parse_qconv2dbatchnorm_layer(keras_layer, input_names, input_shapes, data_re temp_shape = intermediate_shape batch_layer, out_shape = parse_batchnorm_layer(keras_layer, input_names, temp_shape, data_reader) return {**conv_layer, **batch_layer}, out_shape + + +@keras_handler('QMultiHeadAttention') +def parse_qmultiheadattention_layer(keras_layer, input_names, input_shapes, data_reader, config): + assert 'QMultiHeadAttention' in keras_layer['class_name'] + assert input_shapes[0] == keras_layer['config']['query_shape'] + + layer, output_shape = parse_mutiheadattention_layer(keras_layer, input_names, input_shapes, data_reader, config) + + layer['weight_quantizer'] = get_quantizer_from_config(keras_layer, 'kernel') + if keras_layer['config']['bias_quantizer'] is not None: + layer['bias_quantizer'] = get_quantizer_from_config(keras_layer, 'bias') + else: + layer['bias_quantizer'] = None + + return layer, output_shape + + +@keras_handler('QLayerNormalization') +def parse_qlayernorm_layer(keras_layer, input_names, input_shapes, data_reader, config): + layer, output_shape = parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader, config) + + layer['mean_quantizer'] = get_quantizer_from_config(keras_layer, 'mean') + layer['variance_quantizer'] = get_quantizer_from_config(keras_layer, 'variance') + layer['beta_quantizer'] = get_quantizer_from_config(keras_layer, 'beta') + layer['gamma_quantizer'] = get_quantizer_from_config(keras_layer, 'gamma') + + return layer, output_shape diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index c501a31d95..4d008dba9f 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -1,341 +1,343 @@ -import json - -import h5py - -from hls4ml.model import ModelGraph - -MAXMULT = 4096 - - -class KerasReader: - def get_weights_data(self, layer_name, var_name): - raise NotImplementedError - - -class KerasFileReader(KerasReader): - def __init__(self, config): - self.config = config - self.h5file = h5py.File(config['KerasH5'], mode='r') - - def __del__(self): - if self.h5file: - self.h5file.close() - - def _find_data(self, layer_name, var_name): - def h5_visitor_func(name): - if var_name in name: - return name - - if 'model_weights' in list(self.h5file.keys()): # h5 file comes from model.save() - layer_path = f'model_weights/{layer_name}' - else: - layer_path = layer_name - - data_path = self.h5file[layer_path].visit(h5_visitor_func) - if data_path: - return self.h5file[f'/{layer_path}/{data_path}'] - else: - return None - - def get_weights_data(self, layer_name, var_name): - data = self._find_data(layer_name, var_name) - if data: - return data[()] - else: - return None - - -class KerasNestedFileReader(KerasFileReader): - 
def __init__(self, data_reader, nested_path): - super().__init__(data_reader.config) - self.nested_path = nested_path - - def _find_data(self, layer_name, var_name): - def h5_visitor_func(name): - if var_name in name: - return name - - layer_path = f'model_weights/{self.nested_path}/{layer_name}' - - data_path = self.h5file[layer_path].visit(h5_visitor_func) - if data_path: - return self.h5file[f'/{layer_path}/{data_path}'] - else: - return None - - -class KerasModelReader(KerasReader): - def __init__(self, keras_model): - self.model = keras_model - - def get_weights_data(self, layer_name, var_name): - layer = self.model.get_layer(layer_name) - for i, w in enumerate(layer.weights): - if var_name in w.name: - try: - return w.numpy() # TF 2.x - except Exception: - return layer.get_weights()[i] # TF 1.x - - return None - - -def get_weights_data(data_reader, layer_name, var_name): - if not isinstance(var_name, (list, tuple)): - var_name = [var_name] - - data = [data_reader.get_weights_data(layer_name, var) for var in var_name] - - if len(data) == 1: - return data[0] - else: - return (*data,) - - -layer_handlers = {} - - -def register_keras_layer_handler(layer_cname, handler_func): - """Register a handler function for the given layer class name. - - The handler function should have the following signature: - parse_func(keras_layer, input_names, input_shapes, data_reader, config): - - Args: - layer_cname (str): The name of Keras layer (the 'class_name' property in the layer's config) - handler_func (callable): The handler function - - Raises: - Exception: If the layer class has already been registered. - """ - if layer_cname in layer_handlers: - raise Exception(f'Layer {layer_cname} already registered') - else: - layer_handlers[layer_cname] = handler_func - - -def get_supported_keras_layers(): - """Returns the list of Keras layers that the converter can parse. - - The returned list contains all Keras layers that can be parsed into the hls4ml internal representation. Support for - computation of these layers may vary across hls4ml backends and conversion configuration. - - Returns: - list: The names of supported Keras layers. 
- """ - return list(layer_handlers.keys()) - - -def keras_handler(*args): - def decorator(function): - function.handles = [arg for arg in args] - return function - - return decorator - - -def parse_default_keras_layer(keras_layer, input_names): - layer = {} - - # Extract name for finding weights and biases - layer['name'] = keras_layer['config']['name'] - layer['class_name'] = keras_layer['class_name'] - if input_names is not None: - layer['inputs'] = input_names - - layer['data_format'] = keras_layer['config'].get('data_format', 'channels_last') - - if 'activation' in keras_layer['config']: - layer['activation'] = keras_layer['config']['activation'] - if 'epsilon' in keras_layer['config']: - layer['epsilon'] = keras_layer['config']['epsilon'] - if 'use_bias' in keras_layer['config']: - layer['use_bias'] = keras_layer['config']['use_bias'] - - return layer - - -def get_model_arch(config): - if 'KerasModel' in config: - # Model instance passed in config from API - keras_model = config['KerasModel'] - if isinstance(keras_model, str): - from tensorflow.keras.models import load_model - - keras_model = load_model(keras_model) - model_arch = json.loads(keras_model.to_json()) - reader = KerasModelReader(keras_model) - elif 'KerasJson' in config: - # Extract model architecture from json - with open(config['KerasJson']) as json_file: - model_arch = json.load(json_file) - reader = KerasFileReader(config) - elif 'KerasH5' in config: - # Model arch and weights are in H5 file (from model.save() function) - with h5py.File(config['KerasH5'], mode='r') as h5file: - # Load the configuration from h5 using json's decode - model_arch = h5file.attrs.get('model_config') - if model_arch is None: - raise ValueError('No model found in config file.') - else: - # model_arch is string by default since h5py 3.0.0, keeping this condition for compatibility. 
- if isinstance(model_arch, bytes): - model_arch = model_arch.decode('utf-8') - model_arch = json.loads(model_arch) - reader = KerasFileReader(config) - else: - raise ValueError('No model found in config file.') - - return model_arch, reader - - -def parse_keras_model(model_arch, reader): - # This is a list of dictionaries to hold all the layer info we need to generate HLS - layer_list = [] - - # Define layers to skip for conversion to HLS - skip_layers = ['Dropout'] - # Activation layers - activation_layers = [ - 'Activation', - 'LeakyReLU', - 'ThresholdedReLU', - 'ELU', - 'PReLU', - 'Softmax', - 'TernaryTanh', - 'HardActivation', - 'UnaryLUT', - 'HGQ>UnaryLUT', - ] - # Recurrent layers - recurrent_layers = ['SimpleRNN', 'LSTM', 'GRU'] - # All supported layers - supported_layers = get_supported_keras_layers() + skip_layers - - # Map inputs of skipped and split (activation) layers - inputs_map = {} - - # Loop through layers - layer_counter = 0 - - input_layers = None - output_layers = None - - layer_config = None - if model_arch['class_name'] == 'Sequential': - print('Interpreting Sequential') - layer_config = model_arch['config'] - if 'layers' in layer_config: # Newer Keras versions have 'layers' in 'config' key - layer_config = layer_config['layers'] - # Sequential doesn't have InputLayer in TF < 2.3 (Keras 2.4.0) - if layer_config[0]['class_name'] != 'InputLayer': - input_layer = {} - input_layer['name'] = 'input1' - input_layer['class_name'] = 'InputLayer' - input_layer['input_shape'] = layer_config[0]['config']['batch_input_shape'][1:] - layer_list.append(input_layer) - print('Input shape:', input_layer['input_shape']) - elif model_arch['class_name'] in ['Model', 'Functional']: # TF >= 2.3 calls it 'Functional' API - print('Interpreting Model') - layer_config = model_arch['config']['layers'] - input_layers = [inp[0] for inp in model_arch['config']['input_layers']] - output_layers = [out[0] for out in model_arch['config']['output_layers']] - - # Get input shape and check for unsupported layer type - for keras_layer in layer_config: - if keras_layer['class_name'] not in supported_layers: - raise Exception('ERROR: Unsupported layer type: {}'.format(keras_layer['class_name'])) - - output_shapes = {} - output_shape = None - - print('Topology:') - for keras_layer in layer_config: - if 'batch_input_shape' in keras_layer['config']: - if 'inbound_nodes' in keras_layer and len(keras_layer['inbound_nodes']) > 0: - input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] - else: - input_shapes = [keras_layer['config']['batch_input_shape']] - else: - if 'inbound_nodes' in keras_layer: - input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] - else: - # Sequential model, so output_shape from the previous layer is still valid - input_shapes = [output_shape] - - keras_class = keras_layer['class_name'] - - if keras_class in skip_layers: - if 'inbound_nodes' in keras_layer: - name = keras_layer['config']['name'] - # Currently supported skipped layers have only one input - parent_input = keras_layer['inbound_nodes'][0][0][0] - # Skipped layers can follow each other (e.g., Dropout -> Flatten) - inputs_map[name] = inputs_map.get(parent_input, parent_input) - - output_shapes[keras_layer['config']['name']] = input_shapes[0] - - continue - - if keras_class in supported_layers: - layer_counter = layer_counter + 1 - - # Extract inbound nodes - if 'inbound_nodes' in keras_layer and len(keras_layer['inbound_nodes']) > 0: - 
input_names = [ inputs_map.get(inp[0], inp[0]) for inp in keras_layer['inbound_nodes'][0] ] - if keras_layer['inbound_nodes'][0][0][-1]: # multi_head_attention has inbound: [[['input_3', 0, 0, {'value': ['dense_3', 0, 0]}]]] - inputname2 = list(keras_layer['inbound_nodes'][0][0][-1].values()) - input_names+=[inp[0] for inp in inputname2] - else: - input_names = None - - layer, output_shape = layer_handlers[keras_class](keras_layer, input_names, input_shapes, reader) - - print( - 'Layer name: {}, layer type: {}, input shapes: {}, output shape: {}'.format( - layer['name'], layer['class_name'], input_shapes, output_shape - ) - ) - layer_list.append(layer) - if 'activation' in layer and layer['class_name'] not in activation_layers + recurrent_layers: # + qkeras_layers: - act_layer = {} - # Workaround for QKeras activations passed as an argument - if isinstance(layer['activation'], dict): - act_details = layer['activation'] - act_layer['class_name'] = 'QActivation' - act_layer['config'] = { - 'name': layer['name'] + '_' + act_details['class_name'], - 'activation': act_details, - } - act_layer, output_shape = layer_handlers['QActivation'](act_layer, None, [output_shape], reader) - else: - act_layer['name'] = layer['name'] + '_' + layer['activation'] - act_layer['activation'] = layer['activation'] - if 'activ_param' in layer: - act_layer['activ_param'] = layer['activ_param'] - act_layer['class_name'] = layer['activation'] - elif layer['activation'] == 'softmax': - act_layer['class_name'] = 'Softmax' - act_layer['axis'] = -1 - else: - act_layer['class_name'] = 'Activation' - inputs_map[layer['name']] = act_layer['name'] - if output_layers is not None and layer['name'] in output_layers: - output_layers = [act_layer['name'] if name == layer['name'] else name for name in output_layers] - output_shapes[act_layer['name']] = output_shape - layer_list.append(act_layer) - - assert output_shape is not None - - output_shapes[layer['name']] = output_shape - - return layer_list, input_layers, output_layers, output_shapes - - -def keras_to_hls(config): - model_arch, reader = get_model_arch(config) - layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader) - print('Creating HLS model') - hls_model = ModelGraph(config, reader, layer_list, input_layers, output_layers) - return hls_model +import json + +import h5py + +from hls4ml.model import ModelGraph + +MAXMULT = 4096 + + +class KerasReader: + def get_weights_data(self, layer_name, var_name): + raise NotImplementedError + + +class KerasFileReader(KerasReader): + def __init__(self, config): + self.config = config + self.h5file = h5py.File(config['KerasH5'], mode='r') + + def __del__(self): + if self.h5file: + self.h5file.close() + + def _find_data(self, layer_name, var_name): + def h5_visitor_func(name): + if var_name in name: + return name + + if 'model_weights' in list(self.h5file.keys()): # h5 file comes from model.save() + layer_path = f'model_weights/{layer_name}' + else: + layer_path = layer_name + + data_path = self.h5file[layer_path].visit(h5_visitor_func) + if data_path: + return self.h5file[f'/{layer_path}/{data_path}'] + else: + return None + + def get_weights_data(self, layer_name, var_name): + data = self._find_data(layer_name, var_name) + if data: + return data[()] + else: + return None + + +class KerasNestedFileReader(KerasFileReader): + def __init__(self, data_reader, nested_path): + super().__init__(data_reader.config) + self.nested_path = nested_path + + def _find_data(self, layer_name, var_name): + def 
h5_visitor_func(name): + if var_name in name: + return name + + layer_path = f'model_weights/{self.nested_path}/{layer_name}' + + data_path = self.h5file[layer_path].visit(h5_visitor_func) + if data_path: + return self.h5file[f'/{layer_path}/{data_path}'] + else: + return None + + +class KerasModelReader(KerasReader): + def __init__(self, keras_model): + self.model = keras_model + + def get_weights_data(self, layer_name, var_name): + layer = self.model.get_layer(layer_name) + for i, w in enumerate(layer.weights): + if var_name in w.name: + try: + return w.numpy() # TF 2.x + except Exception: + return layer.get_weights()[i] # TF 1.x + + return None + + +def get_weights_data(data_reader, layer_name, var_name): + if not isinstance(var_name, (list, tuple)): + var_name = [var_name] + + data = [data_reader.get_weights_data(layer_name, var) for var in var_name] + + if len(data) == 1: + return data[0] + else: + return (*data,) + + +layer_handlers = {} + + +def register_keras_layer_handler(layer_cname, handler_func): + """Register a handler function for the given layer class name. + + The handler function should have the following signature: + parse_func(keras_layer, input_names, input_shapes, data_reader, config): + + Args: + layer_cname (str): The name of Keras layer (the 'class_name' property in the layer's config) + handler_func (callable): The handler function + + Raises: + Exception: If the layer class has already been registered. + """ + if layer_cname in layer_handlers: + raise Exception(f'Layer {layer_cname} already registered') + else: + layer_handlers[layer_cname] = handler_func + + +def get_supported_keras_layers(): + """Returns the list of Keras layers that the converter can parse. + + The returned list contains all Keras layers that can be parsed into the hls4ml internal representation. Support for + computation of these layers may vary across hls4ml backends and conversion configuration. + + Returns: + list: The names of supported Keras layers. 
+ """ + return list(layer_handlers.keys()) + + +def keras_handler(*args): + def decorator(function): + function.handles = [arg for arg in args] + return function + + return decorator + + +def parse_default_keras_layer(keras_layer, input_names): + layer = {} + + # Extract name for finding weights and biases + layer['name'] = keras_layer['config']['name'] + layer['class_name'] = keras_layer['class_name'] + if input_names is not None: + layer['inputs'] = input_names + + layer['data_format'] = keras_layer['config'].get('data_format', 'channels_last') + + if 'activation' in keras_layer['config']: + layer['activation'] = keras_layer['config']['activation'] + if 'epsilon' in keras_layer['config']: + layer['epsilon'] = keras_layer['config']['epsilon'] + if 'use_bias' in keras_layer['config']: + layer['use_bias'] = keras_layer['config']['use_bias'] + + return layer + + +def get_model_arch(config): + if 'KerasModel' in config: + # Model instance passed in config from API + keras_model = config['KerasModel'] + if isinstance(keras_model, str): + from tensorflow.keras.models import load_model + + keras_model = load_model(keras_model) + model_arch = json.loads(keras_model.to_json()) + reader = KerasModelReader(keras_model) + elif 'KerasJson' in config: + # Extract model architecture from json + with open(config['KerasJson']) as json_file: + model_arch = json.load(json_file) + reader = KerasFileReader(config) + elif 'KerasH5' in config: + # Model arch and weights are in H5 file (from model.save() function) + with h5py.File(config['KerasH5'], mode='r') as h5file: + # Load the configuration from h5 using json's decode + model_arch = h5file.attrs.get('model_config') + if model_arch is None: + raise ValueError('No model found in config file.') + else: + # model_arch is string by default since h5py 3.0.0, keeping this condition for compatibility. 
+ if isinstance(model_arch, bytes): + model_arch = model_arch.decode('utf-8') + model_arch = json.loads(model_arch) + reader = KerasFileReader(config) + else: + raise ValueError('No model found in config file.') + + return model_arch, reader + + +def parse_keras_model(model_arch, reader): + # This is a list of dictionaries to hold all the layer info we need to generate HLS + layer_list = [] + + # Define layers to skip for conversion to HLS + skip_layers = ['Dropout'] + # Activation layers + activation_layers = [ + 'Activation', + 'LeakyReLU', + 'ThresholdedReLU', + 'ELU', + 'PReLU', + 'Softmax', + 'TernaryTanh', + 'HardActivation', + 'UnaryLUT', + 'HGQ>UnaryLUT', + ] + # Recurrent layers + recurrent_layers = ['SimpleRNN', 'LSTM', 'GRU'] + # All supported layers + supported_layers = get_supported_keras_layers() + skip_layers + + # Map inputs of skipped and split (activation) layers + inputs_map = {} + + # Loop through layers + layer_counter = 0 + + input_layers = None + output_layers = None + + layer_config = None + if model_arch['class_name'] == 'Sequential': + print('Interpreting Sequential') + layer_config = model_arch['config'] + if 'layers' in layer_config: # Newer Keras versions have 'layers' in 'config' key + layer_config = layer_config['layers'] + # Sequential doesn't have InputLayer in TF < 2.3 (Keras 2.4.0) + if layer_config[0]['class_name'] != 'InputLayer': + input_layer = {} + input_layer['name'] = 'input1' + input_layer['class_name'] = 'InputLayer' + input_layer['input_shape'] = layer_config[0]['config']['batch_input_shape'][1:] + layer_list.append(input_layer) + print('Input shape:', input_layer['input_shape']) + elif model_arch['class_name'] in ['Model', 'Functional']: # TF >= 2.3 calls it 'Functional' API + print('Interpreting Model') + layer_config = model_arch['config']['layers'] + input_layers = [inp[0] for inp in model_arch['config']['input_layers']] + output_layers = [out[0] for out in model_arch['config']['output_layers']] + + # Get input shape and check for unsupported layer type + for keras_layer in layer_config: + if keras_layer['class_name'] not in supported_layers: + raise Exception('ERROR: Unsupported layer type: {}'.format(keras_layer['class_name'])) + + output_shapes = {} + output_shape = None + + print('Topology:') + for keras_layer in layer_config: + if 'batch_input_shape' in keras_layer['config']: + if 'inbound_nodes' in keras_layer and len(keras_layer['inbound_nodes']) > 0: + input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] + else: + input_shapes = [keras_layer['config']['batch_input_shape']] + else: + if 'inbound_nodes' in keras_layer: + input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] + else: + # Sequential model, so output_shape from the previous layer is still valid + input_shapes = [output_shape] + + keras_class = keras_layer['class_name'] + + if keras_class in skip_layers: + if 'inbound_nodes' in keras_layer: + name = keras_layer['config']['name'] + # Currently supported skipped layers have only one input + parent_input = keras_layer['inbound_nodes'][0][0][0] + # Skipped layers can follow each other (e.g., Dropout -> Flatten) + inputs_map[name] = inputs_map.get(parent_input, parent_input) + + output_shapes[keras_layer['config']['name']] = input_shapes[0] + + continue + + if keras_class in supported_layers: + layer_counter = layer_counter + 1 + + # Extract inbound nodes + if 'inbound_nodes' in keras_layer and len(keras_layer['inbound_nodes']) > 0: + 
input_names = [inputs_map.get(inp[0], inp[0]) for inp in keras_layer['inbound_nodes'][0]] + if keras_layer['inbound_nodes'][0][0][ + -1 + ]: # multi_head_attention has inbound: [[['input_3', 0, 0, {'value': ['dense_3', 0, 0]}]]] + inputname2 = list(keras_layer['inbound_nodes'][0][0][-1].values()) + input_names += [inp[0] for inp in inputname2] + else: + input_names = None + + layer, output_shape = layer_handlers[keras_class](keras_layer, input_names, input_shapes, reader) + + print( + 'Layer name: {}, layer type: {}, input shapes: {}, output shape: {}'.format( + layer['name'], layer['class_name'], input_shapes, output_shape + ) + ) + layer_list.append(layer) + if 'activation' in layer and layer['class_name'] not in activation_layers + recurrent_layers: # + qkeras_layers: + act_layer = {} + # Workaround for QKeras activations passed as an argument + if isinstance(layer['activation'], dict): + act_details = layer['activation'] + act_layer['class_name'] = 'QActivation' + act_layer['config'] = { + 'name': layer['name'] + '_' + act_details['class_name'], + 'activation': act_details, + } + act_layer, output_shape = layer_handlers['QActivation'](act_layer, None, [output_shape], reader) + else: + act_layer['name'] = layer['name'] + '_' + layer['activation'] + act_layer['activation'] = layer['activation'] + if 'activ_param' in layer: + act_layer['activ_param'] = layer['activ_param'] + act_layer['class_name'] = layer['activation'] + elif layer['activation'] == 'softmax': + act_layer['class_name'] = 'Softmax' + act_layer['axis'] = -1 + else: + act_layer['class_name'] = 'Activation' + inputs_map[layer['name']] = act_layer['name'] + if output_layers is not None and layer['name'] in output_layers: + output_layers = [act_layer['name'] if name == layer['name'] else name for name in output_layers] + output_shapes[act_layer['name']] = output_shape + layer_list.append(act_layer) + + assert output_shape is not None + + output_shapes[layer['name']] = output_shape + + return layer_list, input_layers, output_layers, output_shapes + + +def keras_to_hls(config): + model_arch, reader = get_model_arch(config) + layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader) + print('Creating HLS model') + hls_model = ModelGraph(config, reader, layer_list, input_layers, output_layers) + return hls_model diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py index 84a83de23e..2cbb4916cd 100644 --- a/hls4ml/model/profiling.py +++ b/hls4ml/model/profiling.py @@ -1,700 +1,713 @@ -import json -import os -import shutil -import uuid -from collections import defaultdict - -import matplotlib.pyplot as plt -import numpy as np -import pandas -import seaborn as sb - -from hls4ml.model.graph import ModelGraph -from hls4ml.model.layers import GRU, LSTM, SeparableConv1D, SeparableConv2D - -try: - import qkeras - from tensorflow import keras - - __tf_profiling_enabled__ = True -except ImportError: - __tf_profiling_enabled__ = False - -try: - import torch - - __torch_profiling_enabled__ = True -except ImportError: - __torch_profiling_enabled__ = False - - -def get_unoptimized_hlsmodel(model): - from hls4ml.converters import convert_from_config - - new_config = model.config.config.copy() - new_config['HLSConfig'] = json.loads(json.dumps(new_config['HLSConfig'])) - - new_output_dir = uuid.uuid4().hex - - while os.path.exists(new_output_dir): - new_output_dir = uuid.uuid4().hex - - if 'SkipOptimizers' in new_config['HLSConfig']: - del new_config['HLSConfig']['SkipOptimizers'] - - 
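[Editor's note] For orientation, here is the inbound-node walk from the hunk above as a standalone sketch, run on the exact structure quoted in the code comment. The names 'input_3' and 'dense_3' are just the example from that comment, and the inputs_map lookup is omitted:

```python
# Inbound node recorded by Keras for a MultiHeadAttention layer, as quoted in
# the comment above: the query input arrives positionally, the value input
# arrives in the trailing kwargs dict.
inbound_nodes = [[['input_3', 0, 0, {'value': ['dense_3', 0, 0]}]]]

# Step 1: positional inputs (inputs_map lookup omitted for brevity).
input_names = [inp[0] for inp in inbound_nodes[0]]        # ['input_3']

# Step 2: keyword-argument inputs from the trailing dict of the first entry.
kwargs = inbound_nodes[0][0][-1]                          # {'value': ['dense_3', 0, 0]}
if kwargs:
    input_names += [inp[0] for inp in kwargs.values()]    # ['input_3', 'dense_3']

print(input_names)  # query first, then value
```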
new_config['HLSConfig']['Optimizers'] = [] - new_config['OutputDir'] = new_output_dir - - return convert_from_config(new_config), new_output_dir - - -def array_to_summary(x, fmt='boxplot'): - if fmt == 'boxplot': - y = {'med': np.median(x), 'q1': np.percentile(x, 25), 'q3': np.percentile(x, 75), 'whislo': min(x), 'whishi': max(x)} - elif fmt == 'histogram': - # Power of 2 bins covering data range - high = np.ceil(np.log2(max(x))) + 1 - low = np.floor(np.log2(min(x))) - 1 - bits = np.arange(low, high, 1) - bins = 2**bits - h, b = np.histogram(x, bins=bins) - h = h * 1.0 / float(sum(h)) # normalize - y = {'h': h, 'b': np.log2(b)} - return y - - -def boxplot(data, fmt='longform'): - if fmt == 'longform': - f = plt.figure() # figsize=(3, 3)) - hue = 'layer' if 'layer' in data.keys() else None - vp = sb.boxplot(x='x', y='weight', hue=hue, data=data[data['x'] > 0], showfliers=False) - vp.set_yticklabels(vp.get_yticklabels(), rotation=45, ha='right') - if hue is not None: - vp.get_legend().remove() - vp.set_xscale('log', base=2) - return f - elif fmt == 'summary': - from matplotlib.patches import Rectangle - - medianprops = dict(linestyle='-', color='k') - f, ax = plt.subplots(1, 1) - data.reverse() - colors = sb.color_palette("Blues", len(data)) - bp = ax.bxp(data, showfliers=False, vert=False, medianprops=medianprops) - # add colored boxes - for line, color in zip(bp['boxes'], colors): - x = line.get_xdata() - xl, xh = min(x), max(x) - y = line.get_ydata() - yl, yh = min(y), max(y) - rect = Rectangle((xl, yl), (xh - xl), (yh - yl), fill=True, color=color) - ax.add_patch(rect) - ax.set_yticklabels([d['weight'] for d in data]) - ax.set_xscale('log', base=2) - plt.xlabel('x') - return f - else: - return None - - -def histogram(data, fmt='longform'): - f = plt.figure() - from matplotlib.ticker import MaxNLocator - - n = len(data) if fmt == 'summary' else len(data['weight'].unique()) - colors = sb.color_palette("husl", n) - if fmt == 'longform': - for i, weight in enumerate(data['weight'].unique()): - y = array_to_summary(data[data['weight'] == weight]['x'], fmt='histogram') - plt.bar(y['b'][:-1], y['h'], width=1, fill=False, label=weight, edgecolor=colors[i]) - elif fmt == 'summary': - for i, weight in enumerate(data): - plt.bar(weight['b'][:-1], weight['h'], width=1, fill=False, label=weight['weight'], edgecolor=colors[i]) - - plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) - plt.xlabel('log2(x)') - plt.ylabel('frequency') - plt.legend() - return f - - -plots = {'boxplot': boxplot, 'histogram': histogram} - - -def types_boxplot(data, fmt='longform'): - from matplotlib.patches import PathPatch, Rectangle - - ax = plt.gca() - _ = plt.gcf() - # Scale the data - data['low'] = 2.0 ** data['low'] - data['high'] = 2.0 ** data['high'] - - # Plot the custom precisions - ticks = np.array([tick.get_text() for tick in plt.yticks()[1]]) - # Get the coordinates of the boxes to place the markers - if fmt == 'longform': - # seaborn adjusts the box positions slightly in groups - boxes = [c.get_extents().inverse_transformed(ax.transData) for c in ax.get_children() if isinstance(c, PathPatch)] - ys = [(box.y0 + box.y1) / 2 for box in boxes] - ys = [(y, y) for y in ys] - elif fmt == 'summary': - ys = [(y, y) for y in plt.yticks()[0]] - for _irow, row in data[data['layer'] != 'model'].iterrows(): - if row['layer'] in ticks: - iy = np.argwhere(ticks == row['layer'])[0][0] # Determine which layer in the plot - rectangle = Rectangle( - (row['low'], ys[iy][0] - 0.4), row['high'] - row['low'], 0.8, fill=True, 
color='grey', alpha=0.2 - ) - ax.add_patch(rectangle) - - -def types_histogram(data, fmt='longform'): - ax = plt.gca() - layers = np.array(ax.get_legend_handles_labels()[1]) - colors = sb.color_palette("husl", len(layers)) - ylim = ax.get_ylim() - for _irow, row in data[data['layer'] != 'model'].iterrows(): - if row['layer'] in layers: - col = colors[np.argwhere(layers == row['layer'])[0][0]] - plt.plot((row['low'], row['low']), ylim, '--', color=col) - plt.plot((row['high'], row['high']), ylim, '--', color=col) - - -types_plots = {'boxplot': types_boxplot, 'histogram': types_histogram} - - -def ap_fixed_WIFS(dtype): - from hls4ml.backends import VivadoBackend - - dtype = VivadoBackend.convert_precision_string(dtype) - W, I, F, S = dtype.width, dtype.integer, dtype.fractional, dtype.signed - return W, I, F, S - - -def types_hlsmodel(model): - data = {'layer': [], 'low': [], 'high': []} - # Plot the default precision - default_precision = model.config.model_precision['default'] - W, I, F, S = ap_fixed_WIFS(default_precision) - data['layer'].append('model') - data['low'].append(-F) - data['high'].append(I - 1 if S else I) - - for layer in model.get_layers(): - if isinstance(layer, GRU) or isinstance(layer, LSTM): - suffix = ['w', 'rw', 'b', 'rb'] - elif isinstance(layer, SeparableConv1D) or isinstance(layer, SeparableConv2D): - suffix = ['dw', 'pw', 'db', 'pb'] - else: - suffix = ['w', 'b'] - for iw, weight in enumerate(layer.get_weights()): - wname = f'{layer.name}/{suffix[iw]}' - T = weight.type - if T.name != 'model': - W, I, F, S = ap_fixed_WIFS(T.precision) - data['layer'].append(wname) - data['low'].append(-F) - data['high'].append(I - 1 if S else I) - data = pandas.DataFrame(data) - return data - - -def activation_types_hlsmodel(model): - data = {'layer': [], 'low': [], 'high': []} - # Get the default precision - default_precision = model.config.model_precision['default'] - W, I, F, S = ap_fixed_WIFS(default_precision) - data['layer'].append('model') - data['low'].append(-F) - data['high'].append(I - 1 if S else I) - for layer in model.get_layers(): - T = layer.get_output_variable().type.precision - W, I, F, S = ap_fixed_WIFS(T) - data['layer'].append(layer.name) - data['low'].append(-F) - data['high'].append(I - 1 if S else I) - data = pandas.DataFrame(data) - return data - - -def weights_hlsmodel(model, fmt='longform', plot='boxplot'): - if fmt == 'longform': - data = {'x': [], 'layer': [], 'weight': []} - elif fmt == 'summary': - data = [] - - for layer in model.get_layers(): - if isinstance(layer, GRU) or isinstance(layer, LSTM): - suffix = ['w', 'rw', 'b', 'rb'] - elif isinstance(layer, SeparableConv1D) or isinstance(layer, SeparableConv2D): - suffix = ['dw', 'pw', 'db', 'pb'] - else: - suffix = ['w', 'b'] - name = layer.name - for iw, weight in enumerate(layer.get_weights()): - label = f'{name}/{suffix[iw]}' - w = weight.data.flatten() - w = abs(w[w != 0]) - n = len(w) - if n == 0: - print(f'Weights for {name} are only zeros, ignoring.') - break - if fmt == 'longform': - data['x'].extend(w.tolist()) - data['layer'].extend([name] * len(w)) - data['weight'].extend([label] * len(w)) - elif fmt == 'summary': - data.append(array_to_summary(w, fmt=plot)) - data[-1]['layer'] = name - data[-1]['weight'] = label - - if fmt == 'longform': - data = pandas.DataFrame(data) - return data - - -def _keras_batchnorm(layer): - weights = layer.get_weights() - epsilon = layer.epsilon - - gamma = weights[0] - beta = weights[1] - mean = weights[2] - var = weights[3] - - scale = gamma / np.sqrt(var + 
epsilon) - bias = beta - gamma * mean / np.sqrt(var + epsilon) - - return [scale, bias], ['s', 'b'] - - -def _keras_layer(layer): - return layer.get_weights(), ['w', 'b'] - - -def _keras_lstm(layer): - return layer.get_weights(), ['w', 'u', 'b'] - - -keras_process_layer_map = defaultdict( - lambda: _keras_layer, - { - 'BatchNormalization': _keras_batchnorm, - 'QBatchNormalization': _keras_batchnorm, - 'LSTM': _keras_lstm, - 'QLSTM': _keras_lstm, - }, -) - - -def activations_hlsmodel(model, X, fmt='summary', plot='boxplot'): - if fmt == 'longform': - raise NotImplementedError - elif fmt == 'summary': - data = [] - - _, trace = model.trace(np.ascontiguousarray(X)) - - if len(trace) == 0: - raise RuntimeError("ModelGraph must have tracing on for at least 1 layer (this can be set in its config)") - - for layer in trace.keys(): - print(f" {layer}") - - if fmt == 'summary': - y = trace[layer].flatten() - y = abs(y[y != 0]) - - if len(y) == 0: - print(f'Activations for {layer} are only zeros, ignoring.') - continue - - data.append(array_to_summary(y, fmt=plot)) - data[-1]['weight'] = layer - - return data - - -def weights_keras(model, fmt='longform', plot='boxplot'): - if fmt == 'longform': - data = {'x': [], 'layer': [], 'weight': []} - elif fmt == 'summary': - data = [] - for layer in model.layers: - name = layer.name - weights, suffix = keras_process_layer_map[type(layer).__name__](layer) - - for i, w in enumerate(weights): - label = f'{name}/{suffix[i]}' - w = w.flatten() - w = abs(w[w != 0]) - n = len(w) - if n == 0: - print(f'Weights for {name} are only zeros, ignoring.') - break - if fmt == 'longform': - data['x'].extend(w.tolist()) - data['layer'].extend([name] * n) - data['weight'].extend([label] * n) - elif fmt == 'summary': - data.append(array_to_summary(w, fmt=plot)) - data[-1]['layer'] = name - data[-1]['weight'] = label - - if fmt == 'longform': - data = pandas.DataFrame(data) - return data - - -def activations_keras(model, X, fmt='longform', plot='boxplot'): - # test layer by layer on data - if fmt == 'longform': - # return long form pandas dataframe for - # seaborn boxplot - data = {'x': [], 'weight': []} - elif fmt == 'summary': - # return summary statistics for matplotlib.axes.Axes.bxp - # or histogram bin edges and heights - data = [] - outputs = _get_outputs( - [layer for layer in model.layers if not isinstance(layer, keras.layers.InputLayer)], X, model.input - ) - outputs = dict(zip([layer.name for layer in model.layers if not isinstance(layer, keras.layers.InputLayer)], outputs)) - for layer_name, y in outputs.items(): - print(f" {layer_name}") - y = y.flatten() - y = abs(y[y != 0]) - if len(y) == 0: - print(f'Activations for {layer_name} are only zeros, ignoring.') - continue - if fmt == 'longform': - data['x'].extend(y.tolist()) - data['weight'].extend([layer_name for i in range(len(y))]) - elif fmt == 'summary': - data.append(array_to_summary(y, fmt=plot)) - data[-1]['weight'] = layer_name - - if fmt == 'longform': - data = pandas.DataFrame(data) - return data - - -def weights_torch(model, fmt='longform', plot='boxplot'): - suffix = ['w', 'b'] - if fmt == 'longform': - data = {'x': [], 'layer': [], 'weight': []} - elif fmt == 'summary': - data = [] - for layer in model.children(): - if isinstance(layer, torch.nn.Linear): - name = layer.__class__.__name__ - weights = list(layer.parameters()) - for i, w in enumerate(weights): - label = f'{name}/{suffix[i]}' - w = weights[i].detach().numpy() - w = w.flatten() - w = abs(w[w != 0]) - n = len(w) - if n == 0: - print(f'Weights 
for {name} are only zeros, ignoring.') - break - if fmt == 'longform': - data['x'].extend(w.tolist()) - data['layer'].extend([name] * n) - data['weight'].extend([label] * n) - elif fmt == 'summary': - data.append(array_to_summary(w, fmt=plot)) - data[-1]['layer'] = name - data[-1]['weight'] = label - - if fmt == 'longform': - data = pandas.DataFrame(data) - return data - - -def activations_torch(model, X, fmt='longform', plot='boxplot'): - X = torch.Tensor(X) - if fmt == 'longform': - data = {'x': [], 'weight': []} - elif fmt == 'summary': - data = [] - - partial_model = torch.nn.Sequential - layers = [] - for layer in model.children(): - lname = layer.__class__.__name__ - layers.append(layer) - pm = partial_model(*layers) - print(f" {lname}") - y = pm(X).flatten().detach().numpy() - y = abs(y[y != 0]) - if len(y) == 0: - print(f'Activations for {lname} are only zeros, ignoring.') - continue - if fmt == 'longform': - data['x'].extend(y.tolist()) - data['weight'].extend([lname for _ in range(len(y))]) - elif fmt == 'summary': - data.append(array_to_summary(y, fmt=plot)) - data[-1]['weight'] = lname - - if fmt == 'longform': - data = pandas.DataFrame(data) - return data - - -def numerical(model=None, hls_model=None, X=None, plot='boxplot'): - """Perform numerical profiling of a model. - - Args: - model (optional): Keras of PyTorch model. Defaults to None. - hls_model (ModelGraph, optional): The ModelGraph to profile. Defaults to None. - X (ndarray, optional): Test data on which to evaluate the model to profile activations. - Must be formatted suitably for the ``model.predict(X)``. Defaults to None. - plot (str, optional): The type of plot to produce. Options are: 'boxplot' (default), 'violinplot', 'histogram', - 'FacetGrid'. Defaults to 'boxplot'. - - Returns: - tuple: The quadruple of produced figures. First weights and biases - for the pre- and post-optimization models respectively, - then activations for the pre- and post-optimization models - respectively. (Optimizations are applied to an ModelGraph by hls4ml, - a post-optimization ModelGraph is a final model). 
- """ - wp, wph, ap, aph = None, None, None, None - - hls_model_present = hls_model is not None and isinstance(hls_model, ModelGraph) - model_present = model is not None - - if hls_model_present: - before = " (before optimization)" - after = " (final / after optimization)" - hls_model_unoptimized, tmp_output_dir = get_unoptimized_hlsmodel(hls_model) - else: - before = "" - after = "" - hls_model_unoptimized, tmp_output_dir = None, None - - print("Profiling weights" + before) - data = None - - if hls_model_present: - data = weights_hlsmodel(hls_model_unoptimized, fmt='summary', plot=plot) - elif model_present: - if __tf_profiling_enabled__ and isinstance(model, keras.Model): - data = weights_keras(model, fmt='summary', plot=plot) - elif __torch_profiling_enabled__ and isinstance(model, torch.nn.Sequential): - data = weights_torch(model, fmt='summary', plot=plot) - - if data is None: - print("Only keras, PyTorch (Sequential) and ModelGraph models " + "can currently be profiled") - - if hls_model_present and os.path.exists(tmp_output_dir): - shutil.rmtree(tmp_output_dir) - - return wp, wph, ap, aph - - wp = plots[plot](data, fmt='summary') # weight plot - - if hls_model_present and plot in types_plots: - t_data = types_hlsmodel(hls_model_unoptimized) - types_plots[plot](t_data, fmt='summary') - - plt.title("Distribution of (non-zero) weights" + before) - plt.tight_layout() - - if hls_model_present: - print("Profiling weights" + after) - - data = weights_hlsmodel(hls_model, fmt='summary', plot=plot) - wph = plots[plot](data, fmt='summary') # weight plot - - if plot in types_plots: - t_data = types_hlsmodel(hls_model) - types_plots[plot](t_data, fmt='summary') - - plt.title("Distribution of (non-zero) weights" + after) - plt.tight_layout() - - if X is not None: - print("Profiling activations" + before) - data = None - if __tf_profiling_enabled__ and isinstance(model, keras.Model): - data = activations_keras(model, X, fmt='summary', plot=plot) - elif __torch_profiling_enabled__ and isinstance(model, torch.nn.Sequential): - data = activations_torch(model, X, fmt='summary', plot=plot) - - if data is not None: - ap = plots[plot](data, fmt='summary') # activation plot - if hls_model_present and plot in types_plots: - t_data = activation_types_hlsmodel(hls_model_unoptimized) - types_plots[plot](t_data, fmt='summary') - plt.title("Distribution of (non-zero) activations" + before) - plt.tight_layout() - - if hls_model_present: - print("Profiling activations" + after) - data = activations_hlsmodel(hls_model, X, fmt='summary', plot=plot) - aph = plots[plot](data, fmt='summary') - - t_data = activation_types_hlsmodel(hls_model) - types_plots[plot](t_data, fmt='summary') - - plt.title("Distribution of (non-zero) activations (final / after optimization)") - plt.tight_layout() - - if hls_model_present and os.path.exists(tmp_output_dir): - shutil.rmtree(tmp_output_dir) - - return wp, wph, ap, aph - - -######### -# COMPARE OUTPUT IMPLEMENTATION -######### -def _is_ignored_layer(layer): - """Some layers need to be ingored during inference""" - if isinstance(layer, (keras.layers.InputLayer, keras.layers.Dropout)): - return True - return False - - -def _get_outputs(layers, X, model_input): - """Get outputs of intermediate layers""" - partial_models = keras.models.Model(inputs=model_input, outputs=[layer.output for layer in layers]) - y = partial_models.predict(X) - return y - - -def get_ymodel_keras(keras_model, X): - """Calculate each layer's ouput and put them into a dictionary. 
- - Args: - keras_model (_type_): A keras Model - X (ndarray): Test data on which to evaluate the model to profile activations. - Must be formatted suitably for the ``model.predict(X)``. - - Returns: - dict: A dictionary in the form {"layer_name": ouput array of layer}. - """ - ymodel = {} - traced_layers = [] - layer_names = [] - for layer in keras_model.layers: - if _is_ignored_layer(layer): - continue - # If the layer has activation integrated then separate them - # Note that if the layer is a standalone activation layer then skip this - name = layer.name - if ( - hasattr(layer, 'activation') - and layer.activation is not None - and not isinstance(layer, (keras.layers.Activation, qkeras.qlayers.QActivation)) - and layer.activation.__name__ != 'linear' - ): - tmp_activation = layer.activation - layer.activation = None - ymodel.update({layer.name: _get_outputs([layer], X, keras_model.input)}) - layer.activation = tmp_activation - name = layer.name + f"_{tmp_activation.__name__}" - traced_layers.append(layer) - layer_names.append(name) - outputs = _get_outputs(traced_layers, X, keras_model.input) - for name, output in zip(layer_names, outputs): - ymodel[name] = output - print("Done taking outputs for Keras model.") - return ymodel - - -def _norm_diff(ymodel, ysim): - """Calculate the square root of the sum of the squares of the differences""" - diff = {} - - for key in list(ysim.keys()): - diff[key] = np.linalg.norm(ysim[key] - ymodel[key]) - - # ---Bar Plot--- - f, ax = plt.subplots() - plt.bar(list(diff.keys()), list(diff.values())) - plt.title("layer-by-layer output differences") - ax.set_ylabel('Norm of difference vector') - plt.xticks(rotation=90) - plt.tight_layout() - return f - - -def _dist_diff(ymodel, ysim): - """ - Calculate the normalized distribution of the differences of the elements - of the output vectors. - If difference >= original value then the normalized difference will be set to 1, - meaning "very difference". - If difference < original value then the normalized difference would be difference/original. - """ - - diff = {} - - for key in list(ysim.keys()): - flattened_ysim = ysim[key].flatten() - flattened_ymodel = np.array(ymodel[key]).flatten() - - diff[key] = np.absolute(flattened_ymodel - flattened_ysim) / np.linalg.norm(flattened_ymodel - flattened_ysim) - diff_vector = np.absolute(flattened_ymodel - flattened_ysim) - abs_ymodel = np.absolute(flattened_ymodel) - - normalized_diff = np.zeros(diff_vector.shape) - normalized_diff[(diff_vector >= abs_ymodel) & (abs_ymodel > 0) & (diff_vector > 0)] = 1 - - # Fill out the rest - index = diff_vector < abs_ymodel - normalized_diff[index] = diff_vector[index] / abs_ymodel[index] - - diff[key] = normalized_diff - - # ---Box Plot--- - f, ax = plt.subplots() - pos = np.array(range(len(list(diff.values())))) + 1 - ax.boxplot(list(diff.values()), sym='k+', positions=pos) - - # --formatting - plt.title("Layer-by-layer distribution of output differences") - ax.set_xticklabels(list(diff.keys())) - ax.set_ylabel('Normalized difference') - ax.set_ylabel('Percent difference.') - plt.xticks(rotation=90) - plt.tight_layout() - - return f - - -def compare(keras_model, hls_model, X, plot_type="dist_diff"): - """Compare each layer's output in keras and hls model. Note that the hls_model should not be compiled before using this. - - Args: - keras_model: Original keras model. - hls_model (ModelGraph): Converted ModelGraph, with "Trace:True" in the configuration file. - X (ndarray): Input tensor for the model. 
- plot_type (str, optional): Different methods to visualize the y_model and y_sim differences. - Possible options include: - - 'norm_diff':: square root of the sum of the squares of the differences between each output vectors. - - 'dist_diff':: The normalized distribution of the differences of the elements between two output vectors. - Defaults to "dist_diff". - - Returns: - matplotlib figure: Plot object of the histogram depicting the difference in each layer's output. - """ - - # Take in output from both models - # Note that each y is a dictionary with structure {"layer_name": flattened ouput array} - ymodel = get_ymodel_keras(keras_model, X) - _, ysim = hls_model.trace(X) - - print("Plotting difference...") - f = plt.figure() - if plot_type == "norm_diff": - f = _norm_diff(ymodel, ysim) - elif plot_type == "dist_diff": - f = _dist_diff(ymodel, ysim) - - return f +import json +import os +import shutil +import uuid +from collections import defaultdict + +import matplotlib.pyplot as plt +import numpy as np +import pandas +import seaborn as sb + +from hls4ml.model.graph import ModelGraph +from hls4ml.model.layers import GRU, LSTM, SeparableConv1D, SeparableConv2D + +try: + import qkeras + from tensorflow import keras + + __tf_profiling_enabled__ = True +except ImportError: + __tf_profiling_enabled__ = False + +try: + import torch + + __torch_profiling_enabled__ = True +except ImportError: + __torch_profiling_enabled__ = False + + +def get_unoptimized_hlsmodel(model): + from hls4ml.converters import convert_from_config + + new_config = model.config.config.copy() + new_config['HLSConfig'] = json.loads(json.dumps(new_config['HLSConfig'])) + + new_output_dir = uuid.uuid4().hex + + while os.path.exists(new_output_dir): + new_output_dir = uuid.uuid4().hex + + if 'SkipOptimizers' in new_config['HLSConfig']: + del new_config['HLSConfig']['SkipOptimizers'] + + new_config['HLSConfig']['Optimizers'] = [] + new_config['OutputDir'] = new_output_dir + + return convert_from_config(new_config), new_output_dir + + +def array_to_summary(x, fmt='boxplot'): + if fmt == 'boxplot': + y = {'med': np.median(x), 'q1': np.percentile(x, 25), 'q3': np.percentile(x, 75), 'whislo': min(x), 'whishi': max(x)} + elif fmt == 'histogram': + # Power of 2 bins covering data range + high = np.ceil(np.log2(max(x))) + 1 + low = np.floor(np.log2(min(x))) - 1 + bits = np.arange(low, high, 1) + bins = 2**bits + h, b = np.histogram(x, bins=bins) + h = h * 1.0 / float(sum(h)) # normalize + y = {'h': h, 'b': np.log2(b)} + return y + + +def boxplot(data, fmt='longform'): + if fmt == 'longform': + f = plt.figure() # figsize=(3, 3)) + hue = 'layer' if 'layer' in data.keys() else None + vp = sb.boxplot(x='x', y='weight', hue=hue, data=data[data['x'] > 0], showfliers=False) + vp.set_yticklabels(vp.get_yticklabels(), rotation=45, ha='right') + if hue is not None: + vp.get_legend().remove() + vp.set_xscale('log', base=2) + return f + elif fmt == 'summary': + from matplotlib.patches import Rectangle + + medianprops = dict(linestyle='-', color='k') + f, ax = plt.subplots(1, 1) + data.reverse() + colors = sb.color_palette("Blues", len(data)) + bp = ax.bxp(data, showfliers=False, vert=False, medianprops=medianprops) + # add colored boxes + for line, color in zip(bp['boxes'], colors): + x = line.get_xdata() + xl, xh = min(x), max(x) + y = line.get_ydata() + yl, yh = min(y), max(y) + rect = Rectangle((xl, yl), (xh - xl), (yh - yl), fill=True, color=color) + ax.add_patch(rect) + ax.set_yticklabels([d['weight'] for d in data]) + ax.set_xscale('log', 
base=2) + plt.xlabel('x') + return f + else: + return None + + +def histogram(data, fmt='longform'): + f = plt.figure() + from matplotlib.ticker import MaxNLocator + + n = len(data) if fmt == 'summary' else len(data['weight'].unique()) + colors = sb.color_palette("husl", n) + if fmt == 'longform': + for i, weight in enumerate(data['weight'].unique()): + y = array_to_summary(data[data['weight'] == weight]['x'], fmt='histogram') + plt.bar(y['b'][:-1], y['h'], width=1, fill=False, label=weight, edgecolor=colors[i]) + elif fmt == 'summary': + for i, weight in enumerate(data): + plt.bar(weight['b'][:-1], weight['h'], width=1, fill=False, label=weight['weight'], edgecolor=colors[i]) + + plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) + plt.xlabel('log2(x)') + plt.ylabel('frequency') + plt.legend() + return f + + +plots = {'boxplot': boxplot, 'histogram': histogram} + + +def types_boxplot(data, fmt='longform'): + from matplotlib.patches import PathPatch, Rectangle + + ax = plt.gca() + _ = plt.gcf() + # Scale the data + data['low'] = 2.0 ** data['low'] + data['high'] = 2.0 ** data['high'] + + # Plot the custom precisions + ticks = np.array([tick.get_text() for tick in plt.yticks()[1]]) + # Get the coordinates of the boxes to place the markers + if fmt == 'longform': + # seaborn adjusts the box positions slightly in groups + boxes = [c.get_extents().inverse_transformed(ax.transData) for c in ax.get_children() if isinstance(c, PathPatch)] + ys = [(box.y0 + box.y1) / 2 for box in boxes] + ys = [(y, y) for y in ys] + elif fmt == 'summary': + ys = [(y, y) for y in plt.yticks()[0]] + for _irow, row in data[data['layer'] != 'model'].iterrows(): + if row['layer'] in ticks: + iy = np.argwhere(ticks == row['layer'])[0][0] # Determine which layer in the plot + rectangle = Rectangle( + (row['low'], ys[iy][0] - 0.4), row['high'] - row['low'], 0.8, fill=True, color='grey', alpha=0.2 + ) + ax.add_patch(rectangle) + + +def types_histogram(data, fmt='longform'): + ax = plt.gca() + layers = np.array(ax.get_legend_handles_labels()[1]) + colors = sb.color_palette("husl", len(layers)) + ylim = ax.get_ylim() + for _irow, row in data[data['layer'] != 'model'].iterrows(): + if row['layer'] in layers: + col = colors[np.argwhere(layers == row['layer'])[0][0]] + plt.plot((row['low'], row['low']), ylim, '--', color=col) + plt.plot((row['high'], row['high']), ylim, '--', color=col) + + +types_plots = {'boxplot': types_boxplot, 'histogram': types_histogram} + + +def ap_fixed_WIFS(dtype): + from hls4ml.backends import VivadoBackend + + dtype = VivadoBackend.convert_precision_string(dtype) + W, I, F, S = dtype.width, dtype.integer, dtype.fractional, dtype.signed + return W, I, F, S + + +def types_hlsmodel(model): + data = {'layer': [], 'low': [], 'high': []} + # Plot the default precision + default_precision = model.config.model_precision['default'] + W, I, F, S = ap_fixed_WIFS(default_precision) + data['layer'].append('model') + data['low'].append(-F) + data['high'].append(I - 1 if S else I) + + for layer in model.get_layers(): + if isinstance(layer, GRU) or isinstance(layer, LSTM): + suffix = ['w', 'rw', 'b', 'rb'] + elif isinstance(layer, SeparableConv1D) or isinstance(layer, SeparableConv2D): + suffix = ['dw', 'pw', 'db', 'pb'] + else: + suffix = ['w', 'b'] + for iw, weight in enumerate(layer.get_weights()): + wname = f'{layer.name}/{suffix[iw]}' + T = weight.type + if T.name != 'model': + W, I, F, S = ap_fixed_WIFS(T.precision) + data['layer'].append(wname) + data['low'].append(-F) + data['high'].append(I - 1 
if S else I)
+    data = pandas.DataFrame(data)
+    return data
+
+
+def activation_types_hlsmodel(model):
+    data = {'layer': [], 'low': [], 'high': []}
+    # Get the default precision
+    default_precision = model.config.model_precision['default']
+    W, I, F, S = ap_fixed_WIFS(default_precision)
+    data['layer'].append('model')
+    data['low'].append(-F)
+    data['high'].append(I - 1 if S else I)
+    for layer in model.get_layers():
+        T = layer.get_output_variable().type.precision
+        W, I, F, S = ap_fixed_WIFS(T)
+        data['layer'].append(layer.name)
+        data['low'].append(-F)
+        data['high'].append(I - 1 if S else I)
+    data = pandas.DataFrame(data)
+    return data
+
+
+def weights_hlsmodel(model, fmt='longform', plot='boxplot'):
+    if fmt == 'longform':
+        data = {'x': [], 'layer': [], 'weight': []}
+    elif fmt == 'summary':
+        data = []
+
+    for layer in model.get_layers():
+        if isinstance(layer, GRU) or isinstance(layer, LSTM):
+            suffix = ['w', 'rw', 'b', 'rb']
+        elif isinstance(layer, SeparableConv1D) or isinstance(layer, SeparableConv2D):
+            suffix = ['dw', 'pw', 'db', 'pb']
+        else:
+            suffix = ['w', 'b']
+        name = layer.name
+        for iw, weight in enumerate(layer.get_weights()):
+            label = f'{name}/{suffix[iw]}'
+            w = weight.data.flatten()
+            w = abs(w[w != 0])
+            n = len(w)
+            if n == 0:
+                print(f'Weights for {name} are only zeros, ignoring.')
+                break
+            if fmt == 'longform':
+                data['x'].extend(w.tolist())
+                data['layer'].extend([name] * n)
+                data['weight'].extend([label] * n)
+            elif fmt == 'summary':
+                data.append(array_to_summary(w, fmt=plot))
+                data[-1]['layer'] = name
+                data[-1]['weight'] = label
+
+    if fmt == 'longform':
+        data = pandas.DataFrame(data)
+    return data
+
+
+def _keras_batchnorm(layer):
+    weights = layer.get_weights()
+    epsilon = layer.epsilon
+
+    gamma = weights[0]
+    beta = weights[1]
+    mean = weights[2]
+    var = weights[3]
+
+    # Fold the normalization into a single affine scale and bias
+    scale = gamma / np.sqrt(var + epsilon)
+    bias = beta - gamma * mean / np.sqrt(var + epsilon)
+
+    return [scale, bias], ['s', 'b']
+
+
+def _keras_layer(layer):
+    return layer.get_weights(), ['w', 'b']
+
+
+def _keras_layernorm(layer):
+    # LayerNormalization stores its affine parameters directly; no folding needed
+    weights = layer.get_weights()
+
+    gamma = weights[0]
+    beta = weights[1]
+
+    scale = gamma
+    bias = beta
+
+    return [scale, bias], ['s', 'b']
+
+
+def _keras_lstm(layer):
+    return layer.get_weights(), ['w', 'u', 'b']
+
+
+keras_process_layer_map = defaultdict(
+    lambda: _keras_layer,
+    {
+        'BatchNormalization': _keras_batchnorm,
+        'QBatchNormalization': _keras_batchnorm,
+        'LayerNormalization': _keras_layernorm,
+        'LSTM': _keras_lstm,
+        'QLSTM': _keras_lstm,
+    },
+)
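[Editor's note] The `_keras_batchnorm` fold above collapses the four stored parameters into one scale and one bias. A quick self-contained check with made-up numbers (not part of the patch) that the folded form matches the BatchNormalization inference formula:

```python
import numpy as np

# gamma * (x - mean) / sqrt(var + eps) + beta  ==  scale * x + bias
gamma, beta = np.array([2.0, 0.5]), np.array([0.5, -1.0])
mean, var, eps = np.array([1.0, 0.0]), np.array([4.0, 1.0]), 1e-3

scale = gamma / np.sqrt(var + eps)
bias = beta - gamma * mean / np.sqrt(var + eps)

x = np.array([3.0, -2.0])
assert np.allclose(scale * x + bias, gamma * (x - mean) / np.sqrt(var + eps) + beta)
```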
+
+
+def activations_hlsmodel(model, X, fmt='summary', plot='boxplot'):
+    if fmt == 'longform':
+        raise NotImplementedError
+    elif fmt == 'summary':
+        data = []
+
+    _, trace = model.trace(np.ascontiguousarray(X))
+
+    if len(trace) == 0:
+        raise RuntimeError("ModelGraph must have tracing on for at least 1 layer (this can be set in its config)")
+
+    for layer in trace.keys():
+        print(f"   {layer}")
+
+        if fmt == 'summary':
+            y = trace[layer].flatten()
+            y = abs(y[y != 0])
+
+            if len(y) == 0:
+                print(f'Activations for {layer} are only zeros, ignoring.')
+                continue
+
+            data.append(array_to_summary(y, fmt=plot))
+            data[-1]['weight'] = layer
+
+    return data
+
+
+def weights_keras(model, fmt='longform', plot='boxplot'):
+    if fmt == 'longform':
+        data = {'x': [], 'layer': [], 'weight': []}
+    elif fmt == 'summary':
+        data = []
+    for layer in model.layers:
+        name = layer.name
+        weights, suffix = keras_process_layer_map[type(layer).__name__](layer)
+
+        for i, w in enumerate(weights):
+            label = f'{name}/{suffix[i]}'
+            w = w.flatten()
+            w = abs(w[w != 0])
+            n = len(w)
+            if n == 0:
+                print(f'Weights for {name} are only zeros, ignoring.')
+                break
+            if fmt == 'longform':
+                data['x'].extend(w.tolist())
+                data['layer'].extend([name] * n)
+                data['weight'].extend([label] * n)
+            elif fmt == 'summary':
+                data.append(array_to_summary(w, fmt=plot))
+                data[-1]['layer'] = name
+                data[-1]['weight'] = label
+
+    if fmt == 'longform':
+        data = pandas.DataFrame(data)
+    return data
+
+
+def activations_keras(model, X, fmt='longform', plot='boxplot'):
+    # test layer by layer on data
+    if fmt == 'longform':
+        # return long form pandas dataframe for
+        # seaborn boxplot
+        data = {'x': [], 'weight': []}
+    elif fmt == 'summary':
+        # return summary statistics for matplotlib.axes.Axes.bxp
+        # or histogram bin edges and heights
+        data = []
+    outputs = _get_outputs(
+        [layer for layer in model.layers if not isinstance(layer, keras.layers.InputLayer)], X, model.input
+    )
+    outputs = dict(zip([layer.name for layer in model.layers if not isinstance(layer, keras.layers.InputLayer)], outputs))
+    for layer_name, y in outputs.items():
+        print(f"   {layer_name}")
+        y = y.flatten()
+        y = abs(y[y != 0])
+        if len(y) == 0:
+            print(f'Activations for {layer_name} are only zeros, ignoring.')
+            continue
+        if fmt == 'longform':
+            data['x'].extend(y.tolist())
+            data['weight'].extend([layer_name for _ in range(len(y))])
+        elif fmt == 'summary':
+            data.append(array_to_summary(y, fmt=plot))
+            data[-1]['weight'] = layer_name
+
+    if fmt == 'longform':
+        data = pandas.DataFrame(data)
+    return data
+
+
+def weights_torch(model, fmt='longform', plot='boxplot'):
+    suffix = ['w', 'b']
+    if fmt == 'longform':
+        data = {'x': [], 'layer': [], 'weight': []}
+    elif fmt == 'summary':
+        data = []
+    for layer in model.children():
+        if isinstance(layer, torch.nn.Linear):
+            name = layer.__class__.__name__
+            weights = list(layer.parameters())
+            for i, w in enumerate(weights):
+                label = f'{name}/{suffix[i]}'
+                w = w.detach().numpy().flatten()
+                w = abs(w[w != 0])
+                n = len(w)
+                if n == 0:
+                    print(f'Weights for {name} are only zeros, ignoring.')
+                    break
+                if fmt == 'longform':
+                    data['x'].extend(w.tolist())
+                    data['layer'].extend([name] * n)
+                    data['weight'].extend([label] * n)
+                elif fmt == 'summary':
+                    data.append(array_to_summary(w, fmt=plot))
+                    data[-1]['layer'] = name
+                    data[-1]['weight'] = label
+
+    if fmt == 'longform':
+        data = pandas.DataFrame(data)
+    return data
+
+
+def activations_torch(model, X, fmt='longform', plot='boxplot'):
+    X = torch.Tensor(X)
+    if fmt == 'longform':
+        data = {'x': [], 'weight': []}
+    elif fmt == 'summary':
+        data = []
+
+    partial_model = torch.nn.Sequential
+    layers = []
+    for layer in model.children():
+        lname = layer.__class__.__name__
+        layers.append(layer)
+        # rebuild the partial model up to (and including) the current layer
+        pm = partial_model(*layers)
+        print(f"   {lname}")
+        y = pm(X).flatten().detach().numpy()
+        y = abs(y[y != 0])
+        if len(y) == 0:
+            print(f'Activations for {lname} are only zeros, ignoring.')
+            continue
+        if fmt == 'longform':
+            data['x'].extend(y.tolist())
+            data['weight'].extend([lname for _ in range(len(y))])
+        elif fmt == 'summary':
+            data.append(array_to_summary(y, fmt=plot))
+            data[-1]['weight'] = lname
+
+    if fmt == 'longform':
+        data = pandas.DataFrame(data)
+    return data
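[Editor's note] Several of the collectors above emit the 'summary' format, so it may help to see what `array_to_summary` produces in its default boxplot mode. A small probe with random data, purely illustrative:

```python
import numpy as np

from hls4ml.model.profiling import array_to_summary

w = np.abs(np.random.randn(1000)) + 1e-3   # non-zero magnitudes, as the collectors produce
s = array_to_summary(w, fmt='boxplot')
# s carries exactly the keys matplotlib's Axes.bxp expects:
# {'med': ..., 'q1': ..., 'q3': ..., 'whislo': ..., 'whishi': ...}
print(sorted(s.keys()))
```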
+
+
+def numerical(model=None, hls_model=None, X=None, plot='boxplot'):
+    """Perform numerical profiling of a model.
+
+    Args:
+        model (optional): Keras or PyTorch model. Defaults to None.
+        hls_model (ModelGraph, optional): The ModelGraph to profile. Defaults to None.
+        X (ndarray, optional): Test data on which to evaluate the model to profile activations.
+            Must be formatted suitably for use with ``model.predict(X)``. Defaults to None.
+        plot (str, optional): The type of plot to produce. Options are: 'boxplot', 'violinplot', 'histogram',
+            'FacetGrid'. Defaults to 'boxplot'.
+
+    Returns:
+        tuple: The quadruple of produced figures. First weights and biases
+            for the pre- and post-optimization models respectively,
+            then activations for the pre- and post-optimization models
+            respectively. (Optimizations are applied to a ModelGraph by hls4ml;
+            a post-optimization ModelGraph is the final model.)
+    """
+    wp, wph, ap, aph = None, None, None, None
+
+    hls_model_present = hls_model is not None and isinstance(hls_model, ModelGraph)
+    model_present = model is not None
+
+    if hls_model_present:
+        before = " (before optimization)"
+        after = " (final / after optimization)"
+        hls_model_unoptimized, tmp_output_dir = get_unoptimized_hlsmodel(hls_model)
+    else:
+        before = ""
+        after = ""
+        hls_model_unoptimized, tmp_output_dir = None, None
+
+    print("Profiling weights" + before)
+    data = None
+
+    if hls_model_present:
+        data = weights_hlsmodel(hls_model_unoptimized, fmt='summary', plot=plot)
+    elif model_present:
+        if __tf_profiling_enabled__ and isinstance(model, keras.Model):
+            data = weights_keras(model, fmt='summary', plot=plot)
+        elif __torch_profiling_enabled__ and isinstance(model, torch.nn.Sequential):
+            data = weights_torch(model, fmt='summary', plot=plot)
+
+    if data is None:
+        print("Only Keras, PyTorch (Sequential) and ModelGraph models can currently be profiled")
+
+        if hls_model_present and os.path.exists(tmp_output_dir):
+            shutil.rmtree(tmp_output_dir)
+
+        return wp, wph, ap, aph
+
+    wp = plots[plot](data, fmt='summary')  # weight plot
+
+    if hls_model_present and plot in types_plots:
+        t_data = types_hlsmodel(hls_model_unoptimized)
+        types_plots[plot](t_data, fmt='summary')
+
+    plt.title("Distribution of (non-zero) weights" + before)
+    plt.tight_layout()
+
+    if hls_model_present:
+        print("Profiling weights" + after)
+
+        data = weights_hlsmodel(hls_model, fmt='summary', plot=plot)
+        wph = plots[plot](data, fmt='summary')  # weight plot
+
+        if plot in types_plots:
+            t_data = types_hlsmodel(hls_model)
+            types_plots[plot](t_data, fmt='summary')
+
+        plt.title("Distribution of (non-zero) weights" + after)
+        plt.tight_layout()
+
+    if X is not None:
+        print("Profiling activations" + before)
+        data = None
+        if __tf_profiling_enabled__ and isinstance(model, keras.Model):
+            data = activations_keras(model, X, fmt='summary', plot=plot)
+        elif __torch_profiling_enabled__ and isinstance(model, torch.nn.Sequential):
+            data = activations_torch(model, X, fmt='summary', plot=plot)
+
+        if data is not None:
+            ap = plots[plot](data, fmt='summary')  # activation plot
+            if hls_model_present and plot in types_plots:
+                t_data = activation_types_hlsmodel(hls_model_unoptimized)
+                types_plots[plot](t_data, fmt='summary')
+            plt.title("Distribution of (non-zero) activations" + before)
+            plt.tight_layout()
+
+        if hls_model_present:
+            print("Profiling activations" + after)
+            data = activations_hlsmodel(hls_model, X, fmt='summary', plot=plot)
+            aph = plots[plot](data, fmt='summary')
+
+            t_data = activation_types_hlsmodel(hls_model)
+            types_plots[plot](t_data, fmt='summary')
+
+            plt.title("Distribution of (non-zero) activations (final / after optimization)")
+            plt.tight_layout()
+
+    if hls_model_present and os.path.exists(tmp_output_dir):
+        shutil.rmtree(tmp_output_dir)
+
+    return wp, wph, ap, aph
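[Editor's note] As a usage sketch for the entry point above, assuming a toy Keras model built on the spot and no ModelGraph, so only the Keras profiling paths are exercised:

```python
import numpy as np
from tensorflow import keras

from hls4ml.model.profiling import numerical

# Toy model; profiling a Keras model alone returns (wp, None, ap, None).
model = keras.Sequential([
    keras.layers.Dense(8, activation='relu', input_shape=(16,)),
    keras.layers.Dense(1),
])
X = np.random.rand(100, 16)

wp, _, ap, _ = numerical(model=model, X=X)  # weight and activation figures
wp.savefig('weights_profile.png')
ap.savefig('activations_profile.png')
```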
+
+
+#########
+# COMPARE OUTPUT IMPLEMENTATION
+#########
+def _is_ignored_layer(layer):
+    """Some layers need to be ignored during inference"""
+    if isinstance(layer, (keras.layers.InputLayer, keras.layers.Dropout)):
+        return True
+    return False
+
+
+def _get_outputs(layers, X, model_input):
+    """Get outputs of intermediate layers"""
+    partial_model = keras.models.Model(inputs=model_input, outputs=[layer.output for layer in layers])
+    y = partial_model.predict(X)
+    return y
+
+
+def get_ymodel_keras(keras_model, X):
+    """Calculate each layer's output and put them into a dictionary.
+
+    Args:
+        keras_model (keras.Model): A Keras model
+        X (ndarray): Test data on which to evaluate the model to profile activations.
+            Must be formatted suitably for use with ``model.predict(X)``.
+
+    Returns:
+        dict: A dictionary in the form {"layer_name": output array of layer}.
+    """
+    ymodel = {}
+    traced_layers = []
+    layer_names = []
+    for layer in keras_model.layers:
+        if _is_ignored_layer(layer):
+            continue
+        # If the layer has an integrated activation then separate it
+        # Note that if the layer is a standalone activation layer then skip this
+        name = layer.name
+        if (
+            hasattr(layer, 'activation')
+            and layer.activation is not None
+            and not isinstance(layer, (keras.layers.Activation, qkeras.qlayers.QActivation))
+            and layer.activation.__name__ != 'linear'
+        ):
+            tmp_activation = layer.activation
+            layer.activation = None
+            ymodel.update({layer.name: _get_outputs([layer], X, keras_model.input)})
+            layer.activation = tmp_activation
+            name = layer.name + f"_{tmp_activation.__name__}"
+        traced_layers.append(layer)
+        layer_names.append(name)
+    outputs = _get_outputs(traced_layers, X, keras_model.input)
+    for name, output in zip(layer_names, outputs):
+        ymodel[name] = output
+    print("Done taking outputs for Keras model.")
+    return ymodel
+
+
+def _norm_diff(ymodel, ysim):
+    """Calculate the square root of the sum of the squares of the differences"""
+    diff = {}
+
+    for key in list(ysim.keys()):
+        diff[key] = np.linalg.norm(ysim[key] - ymodel[key])
+
+    # ---Bar Plot---
+    f, ax = plt.subplots()
+    plt.bar(list(diff.keys()), list(diff.values()))
+    plt.title("Layer-by-layer output differences")
+    ax.set_ylabel('Norm of difference vector')
+    plt.xticks(rotation=90)
+    plt.tight_layout()
+    return f
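[Editor's note] Concretely, `_norm_diff` reduces each layer to a single number: the L2 norm of the element-wise difference between the two outputs. A tiny worked example with invented values:

```python
import numpy as np

# What _norm_diff records per layer: ||ysim - ymodel||_2
ymodel = {'dense': np.array([1.0, 2.0, 3.0])}
ysim = {'dense': np.array([1.1, 1.9, 3.2])}

# sqrt(0.1**2 + 0.1**2 + 0.2**2) = sqrt(0.06) ~= 0.245
print(np.linalg.norm(ysim['dense'] - ymodel['dense']))
```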
+ """ + + diff = {} + + for key in list(ysim.keys()): + flattened_ysim = ysim[key].flatten() + flattened_ymodel = np.array(ymodel[key]).flatten() + + diff[key] = np.absolute(flattened_ymodel - flattened_ysim) / np.linalg.norm(flattened_ymodel - flattened_ysim) + diff_vector = np.absolute(flattened_ymodel - flattened_ysim) + abs_ymodel = np.absolute(flattened_ymodel) + + normalized_diff = np.zeros(diff_vector.shape) + normalized_diff[(diff_vector >= abs_ymodel) & (abs_ymodel > 0) & (diff_vector > 0)] = 1 + + # Fill out the rest + index = diff_vector < abs_ymodel + normalized_diff[index] = diff_vector[index] / abs_ymodel[index] + + diff[key] = normalized_diff + + # ---Box Plot--- + f, ax = plt.subplots() + pos = np.array(range(len(list(diff.values())))) + 1 + ax.boxplot(list(diff.values()), sym='k+', positions=pos) + + # --formatting + plt.title("Layer-by-layer distribution of output differences") + ax.set_xticklabels(list(diff.keys())) + ax.set_ylabel('Normalized difference') + ax.set_ylabel('Percent difference.') + plt.xticks(rotation=90) + plt.tight_layout() + + return f + + +def compare(keras_model, hls_model, X, plot_type="dist_diff"): + """Compare each layer's output in keras and hls model. Note that the hls_model should not be compiled before using this. + + Args: + keras_model: Original keras model. + hls_model (ModelGraph): Converted ModelGraph, with "Trace:True" in the configuration file. + X (ndarray): Input tensor for the model. + plot_type (str, optional): Different methods to visualize the y_model and y_sim differences. + Possible options include: + - 'norm_diff':: square root of the sum of the squares of the differences between each output vectors. + - 'dist_diff':: The normalized distribution of the differences of the elements between two output vectors. + Defaults to "dist_diff". + + Returns: + matplotlib figure: Plot object of the histogram depicting the difference in each layer's output. + """ + + # Take in output from both models + # Note that each y is a dictionary with structure {"layer_name": flattened ouput array} + ymodel = get_ymodel_keras(keras_model, X) + _, ysim = hls_model.trace(X) + + print("Plotting difference...") + f = plt.figure() + if plot_type == "norm_diff": + f = _norm_diff(ymodel, ysim) + elif plot_type == "dist_diff": + f = _dist_diff(ymodel, ysim) + + return f diff --git a/hls4ml/templates/quartus/ac_types/ac_channel.h b/hls4ml/templates/quartus/ac_types/ac_channel.h index 62e0542736..96ff514ce4 100644 --- a/hls4ml/templates/quartus/ac_types/ac_channel.h +++ b/hls4ml/templates/quartus/ac_types/ac_channel.h @@ -1,555 +1,555 @@ -/************************************************************************** - * * - * Algorithmic C (tm) Datatypes * - * * - * Software Version: 4.0 * - * * - * Release Date : Sat Jun 13 12:35:18 PDT 2020 * - * Release Type : Production Release * - * Release Build : 4.0.0 * - * * - * Copyright 2004-2020, Mentor Graphics Corporation, * - * * - * All Rights Reserved. * - * * - ************************************************************************** - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * - * implied. 
* - * See the License for the specific language governing permissions and * - * limitations under the License. * - ************************************************************************** - * * - * The most recent version of this package is available at github. * - * * - *************************************************************************/ - -/* -// Source: ac_channel.h -// Description: templatized channel communication class -// Author: Andres Takach, Ph.D. -*/ - -#ifndef __AC_CHANNEL_H -#define __AC_CHANNEL_H - -#ifndef __cplusplus -# error C++ is required to include this header file -#endif - -#include -#include - -#if !defined(AC_USER_DEFINED_ASSERT) && !defined(AC_ASSERT_THROW_EXCEPTION) -# include -#endif - -// not directly used by this include -#include -#include - -// Macro Definitions (obsolete - provided here for backward compatibility) -#define AC_CHAN_CTOR(varname) varname -#define AC_CHAN_CTOR_INIT(varname,init) varname(init) -#define AC_CHAN_CTOR_VAL(varname,init,val) varname(init,val) - -//////////////////////////////////////////////// -// Struct: ac_exception / ac_channel_exception -//////////////////////////////////////////////// - -#ifndef __INCLUDED_AC_EXCEPTION -# define __INCLUDED_AC_EXCEPTION -struct ac_exception { - const char *const file; - const unsigned int line; - const int code; - const char *const msg; - ac_exception(const char *file_, const unsigned int &line_, const int &code_, const char *msg_) - : file(file_), line(line_), code(code_), msg(msg_) { } -}; -#endif - -struct ac_channel_exception { - enum { code_begin = 1024 }; - enum code { - read_from_empty_channel = code_begin, - fifo_not_empty_when_reset, - no_operator_sb_defined_for_channel_type, - no_insert_defined_for_channel_type, - no_size_in_connections, - no_num_free_in_connections, - no_output_empty_in_connections - }; - static inline const char *msg(const code &code_) { - static const char *const s[] = { - "Read from empty channel", - "fifo not empty when reset", - "No operator[] defined for channel type", - "No insert defined for channel type", - "Connections does not support size()", - "Connections does not support num_free()", - "Connections::Out does not support empty()" - }; - return s[code_-code_begin]; - } -}; - -/////////////////////////////////////////// -// Class: ac_channel -////////////////////////////////////////// - -template -class ac_channel { -public: - typedef T element_type; - - // constructors - ac_channel(); - ac_channel(int init); - ac_channel(int init, T val); - - T read() { return chan.read(); } - void read(T& t) { t = read(); } - bool nb_read(T& t) { return chan.nb_read(t); } - - void write(const T& t) { chan.write(t); } - bool nb_write(T& t) { - chan.incr_size_call_count(); - return chan.nb_write(t); - } - - unsigned int size() { - chan.incr_size_call_count(); - return chan.size(); - } - bool empty() { return chan.empty(); } - - // Return true if channel has at least k entries - bool available(unsigned int k) const { return chan.available(k); } - - void reset() { chan.reset(); } - - unsigned int debug_size() const { return chan.size(); } - - const T &operator[](unsigned int pos) const { return chan[pos]; } - - int get_size_call_count() { return chan.get_size_call_count(); } - -#ifdef SYSTEMC_INCLUDED - void bind(sc_fifo_in &f) { chan.bind(f); } - void bind(sc_fifo_out &f) { chan.bind(f); } -#endif - -#ifdef __CONNECTIONS__CONNECTIONS_H__ - void bind(Connections::Out& c) { chan.bind(c); } - void bind(Connections::In& c) { chan.bind(c); } - void 
bind(Connections::SyncIn &c) { chan.bind(c); } - void bind(Connections::SyncOut &c) { chan.bind(c); } -#endif - -private: -# ifndef AC_CHANNEL_ASSERT -# define AC_CHANNEL_ASSERT(cond, code) ac_assert(cond, __FILE__, __LINE__, code) - static inline void ac_assert(bool condition, const char *file, int line, const ac_channel_exception::code &code) { -# ifndef AC_USER_DEFINED_ASSERT - if(!condition) { - const ac_exception e(file, line, code, ac_channel_exception::msg(code)); -# ifdef AC_ASSERT_THROW_EXCEPTION -# ifdef AC_ASSERT_THROW_EXCEPTION_AS_CONST_CHAR - throw(e.msg); -# else - throw(e); -# endif -# else - std::cerr << "Assert"; - if(e.file) - std::cerr << " in file " << e.file << ":" << e.line; - std::cerr << " " << e.msg << std::endl; - assert(0); -# endif - } -# else - AC_USER_DEFINED_ASSERT(condition, file, line, ac_channel_exception::msg(code)); -# endif - } -# else -# error "private use only - AC_CHANNEL_ASSERT macro already defined" -# endif - -public: - class fifo { - enum fifo_type { - fifo_ac_channel_type, - fifo_sc_fifo_type, - fifo_connections_type, - fifo_connections_sync_type - }; - - struct fifo_abstract { - virtual ~fifo_abstract() {} - virtual fifo_type get_fifo_type() const = 0; - virtual T read() = 0; - virtual bool nb_read(T& t) = 0; - virtual void write(const T& t) = 0; - virtual bool nb_write(T& t) = 0; - virtual bool empty() = 0; - virtual bool available(unsigned int k) const = 0; - virtual unsigned int size() const = 0; - virtual unsigned int num_free() const = 0; - virtual void reset() = 0; - virtual const T &operator_sb(const unsigned int &pos, const T &default_value) const = 0; - }; - - struct fifo_ac_channel : fifo_abstract { - std::deque ch; - - ~fifo_ac_channel() {} - - static inline fifo_type ftype() { return fifo_ac_channel_type; } - - fifo_type get_fifo_type() const { return ftype(); } - - T read() { - { - // If you hit this assert you attempted a read on an empty channel. Perhaps - // you need to guard the execution of the read with a call to the available() - // function: - // if (myInputChan.available(2)) { - // // it is safe to read two values - // cout << myInputChan.read(); - // cout << myInputChan.read(); - // } - AC_CHANNEL_ASSERT(!empty(), ac_channel_exception::read_from_empty_channel); - } - T t = ch.front(); - ch.pop_front(); - return t; - } - bool nb_read(T& t) { return empty() ? false : (t = read(), true); } - - void write(const T& t) { ch.push_back(t); } - bool nb_write(T& t) { return !num_free() ? false : (write(t), true); } - - bool empty() { return size() == 0; } - bool available(unsigned int k) const { return size() >= k; } - unsigned int size() const { return (int)ch.size(); } - unsigned int num_free() const { return ch.max_size() - ch.size(); } - - void reset() { ch.clear(); } - - const T &operator_sb(const unsigned int &pos, const T &) const { - return ch[pos]; - } - }; - -#ifdef SYSTEMC_INCLUDED - struct fifo_sc_fifo : fifo_abstract { - sc_fifo_in *fifo_in; - sc_fifo_out *fifo_out; - - ~fifo_sc_fifo() {} - - static inline fifo_type ftype() { return fifo_sc_fifo_type; } - - fifo_type get_fifo_type() const { return ftype(); } - - T read() { return fifo_in->read(); } - bool nb_read(T& t) { return empty() ? false : (t = read(), true); } - - void write(const T& t) { fifo_out->write(t); } - bool nb_write(T& t) { return !num_free() ? 
false : (write(t), true); } - - bool empty() { return size() == 0; } - bool available(unsigned int k) const { return size() >= k; } - unsigned int size() const { return fifo_in->num_available(); } - unsigned int num_free() const { return fifo_out->num_free(); } - - void reset() { - AC_CHANNEL_ASSERT(empty(), ac_channel_exception::fifo_not_empty_when_reset); - } - - const T &operator_sb(const unsigned int &, const T &default_value) const { - AC_CHANNEL_ASSERT(0, ac_channel_exception::no_operator_sb_defined_for_channel_type); - return default_value; - } - }; -public: - void bind(sc_fifo_in &f) { get_fifo().fifo_in = &f; } - void bind(sc_fifo_out &f) { get_fifo().fifo_out = &f; } -private: -#endif - -#ifdef __CONNECTIONS__CONNECTIONS_H__ - struct fifo_connections : fifo_abstract { - Connections::In *fifo_in; - Connections::Out *fifo_out; - - ~fifo_connections() {} - static inline fifo_type ftype() { return fifo_connections_type; } - fifo_type get_fifo_type() const { return ftype(); } - - T read() { return fifo_in->Pop(); } - bool nb_read(T& t) { return fifo_in->PopNB(t); } - - void write(const T& t) { fifo_out->Push(t); } - bool nb_write(T& t) { return fifo_out->PushNB(t); } - - bool empty() { - if (fifo_in) - return fifo_in->Empty(); - else - AC_CHANNEL_ASSERT(0, ac_channel_exception::no_output_empty_in_connections); - return false; - } - bool available(unsigned int k) const { return true; } - unsigned int size() const { - AC_CHANNEL_ASSERT(0, ac_channel_exception::no_size_in_connections); - return 0; - } - unsigned int num_free() const { - AC_CHANNEL_ASSERT(0, ac_channel_exception::no_num_free_in_connections); - return 0; - } - - void reset() { - AC_CHANNEL_ASSERT(empty(), ac_channel_exception::fifo_not_empty_when_reset); - } - - const T &operator_sb(const unsigned int &, const T &default_value) const { - AC_CHANNEL_ASSERT(0, ac_channel_exception::no_operator_sb_defined_for_channel_type); - return default_value; - } - }; - - struct fifo_connections_sync : fifo_abstract { - Connections::SyncIn *sync_in; - Connections::SyncOut *sync_out; - - ~fifo_connections_sync() {} - static inline fifo_type ftype() { return fifo_connections_sync_type; } - fifo_type get_fifo_type() const { return ftype(); } - - bool read() { sync_in->sync_in(); return true; } - bool nb_read(T& t) { t=true; return(sync_in->nb_sync_in()); } - - void write(const T& t) { sync_out->sync_out(); } - bool nb_write(T& t) { sync_out->sync_out(); return true; } - - bool empty() { - AC_CHANNEL_ASSERT(0, ac_channel_exception::no_output_empty_in_connections); - return(false); - } - bool available(unsigned int k) const { return true; } - unsigned int size() const { - AC_CHANNEL_ASSERT(0, ac_channel_exception::no_size_in_connections); - return 0; - } - unsigned int num_free() const { - AC_CHANNEL_ASSERT(0, ac_channel_exception::no_num_free_in_connections); - return 0; - } - void reset() { - if (sync_in) sync_in->reset_sync_in(); - if (sync_out) sync_out->reset_sync_out(); - } - const T &operator_sb(const unsigned int &, const T &default_value) const { - AC_CHANNEL_ASSERT(0, ac_channel_exception::no_operator_sb_defined_for_channel_type); - return default_value; - } - }; - - - public: - void bind(Connections::In& c) { get_fifo().fifo_in = &c; } - void bind(Connections::Out& c) { get_fifo().fifo_out = &c; } - - void bind(Connections::SyncIn &c) { get_fifo().sync_in = &c; } - void bind(Connections::SyncOut &c) { get_fifo().sync_out = &c; } - - private: -#endif - - template - fifo_T &get_fifo() { - if (!f || f->get_fifo_type() != 
fifo_T::ftype()) { - if (f) { - AC_CHANNEL_ASSERT(f->empty(), ac_channel_exception::fifo_not_empty_when_reset); - delete f; - } - f = new fifo_T; - } - return static_cast(*f); - } - - fifo_abstract *f; - unsigned int rSz; // reset size - T rVal; // resetValue - int size_call_count; - - public: - fifo() : f(0), rSz(0), size_call_count(0) { get_fifo(); } - fifo(int init) : f(0), rSz(init), size_call_count(0) { get_fifo(); } - fifo(int init, T val) : f(0), rSz(init), rVal(val), size_call_count(0) { get_fifo(); } - ~fifo() { delete f; } - - inline T read() { return f->read(); } - inline bool nb_read(T& t) { return f->nb_read(t); } - - inline void write(const T& t) { f->write(t); } - inline bool nb_write(T& t) { return f->nb_write(t); } - - inline bool empty() { return f->empty(); } - inline bool available(unsigned int k) const { return f->available(k); } - inline unsigned int size() const { return f->size(); } - inline unsigned int num_free() const { return f->num_free(); } - - inline void reset() { - f->reset(); - for (int i=0; i<(int)rSz; i++) - write(rVal); - } - - inline const T &operator[](unsigned int pos) const { return f->operator_sb(pos, rVal); } - - void incr_size_call_count() { ++size_call_count; } - int get_size_call_count() { - int tmp=size_call_count; - size_call_count=0; - return tmp; - } - - // obsolete - provided here for backward compatibility with ac_channel - struct iterator { - iterator operator+(unsigned int pos_) const { - return iterator(itr, pos_); - } - private: - friend class fifo; - iterator(const typename std::deque::iterator &itr_, unsigned int pos=0) - : itr(itr_) { if (pos) itr += pos; } - typename std::deque::iterator itr; - }; - iterator begin() { - AC_CHANNEL_ASSERT(f->get_fifo_type() == fifo_ac_channel_type, ac_channel_exception::no_insert_defined_for_channel_type); - return iterator(get_fifo().ch.begin()); - } - void insert(iterator itr, const T& t) { - AC_CHANNEL_ASSERT(f->get_fifo_type() == fifo_ac_channel_type, ac_channel_exception::no_insert_defined_for_channel_type); - get_fifo().ch.insert(itr.itr,t); - } - }; - fifo chan; - -private: - // Prevent the compiler from autogenerating these. - // (This enforces that channels are always passed by reference.) - ac_channel(const ac_channel< T >&); - ac_channel& operator=(const ac_channel< T >&); -}; - -template -ac_channel::ac_channel() : chan() {} - -template -ac_channel::ac_channel(int init) : chan(init) -{ - for (int i=init; i>0; i--) { - T dc; - write(dc); - } -} - -template -ac_channel::ac_channel(int init, T val) : chan(init, val) -{ - for (int i=init; i>0; i--) - write(val); -} - -template -inline std::ostream& operator<< (std::ostream& os, ac_channel &a) -{ - for (unsigned int i=0; i 0) os << " "; - os << a[i]; - } - return os; -} - -// This general case is meant to cover non channel (or array of them) args -// Its result will be ignored -template -bool nb_read_chan_rdy(T &x) { return true; } - -template -bool nb_read_chan_rdy(ac_channel &chan) { return !chan.empty(); } - -template -bool nb_read_chan_rdy(ac_channel (&chan)[N] ) { - bool r = true; - for(int i=0; i 199711L -template -bool nb_read_chan_rdy(Args&... args) { - const int n_args = sizeof...(args); - // only every other arg is a channel (or an array of channels) - bool rdy[n_args] = { (nb_read_chan_rdy(args))... 
}; - bool r = true; - for(int i=0; i < n_args; i+=2) - r &= rdy[i]; - return r; -} -#endif - -template -void nb_read_r(ac_channel &chan, T &var) { - chan.nb_read(var); -} - -template -void nb_read_r(ac_channel (&chan)[N], T (&var)[N]) { - for(int i=0; i 199711L -template -void nb_read_r(ac_channel &chan, T &var, Args&... args) { - chan.nb_read(var); - nb_read_r(args...); -} - -template -void nb_read_r(ac_channel (&chan)[N], T (&var)[N], Args&... args) { - for(int i=0; i -bool nb_read_join(Args&... args) { - if(nb_read_chan_rdy(args...)) { - nb_read_r(args...); - return true; - } - return false; -} -#endif - -/* undo macro adjustments */ -#ifdef AC_CHANNEL_ASSERT -# undef AC_CHANNEL_ASSERT -#endif - -#endif +/************************************************************************** + * * + * Algorithmic C (tm) Datatypes * + * * + * Software Version: 4.0 * + * * + * Release Date : Sat Jun 13 12:35:18 PDT 2020 * + * Release Type : Production Release * + * Release Build : 4.0.0 * + * * + * Copyright 2004-2020, Mentor Graphics Corporation, * + * * + * All Rights Reserved. * + * * + ************************************************************************** + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * + * implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + ************************************************************************** + * * + * The most recent version of this package is available at github. * + * * + *************************************************************************/ + +/* +// Source: ac_channel.h +// Description: templatized channel communication class +// Author: Andres Takach, Ph.D. 
+*/ + +#ifndef __AC_CHANNEL_H +#define __AC_CHANNEL_H + +#ifndef __cplusplus +# error C++ is required to include this header file +#endif + +#include +#include + +#if !defined(AC_USER_DEFINED_ASSERT) && !defined(AC_ASSERT_THROW_EXCEPTION) +# include +#endif + +// not directly used by this include +#include +#include + +// Macro Definitions (obsolete - provided here for backward compatibility) +#define AC_CHAN_CTOR(varname) varname +#define AC_CHAN_CTOR_INIT(varname,init) varname(init) +#define AC_CHAN_CTOR_VAL(varname,init,val) varname(init,val) + +//////////////////////////////////////////////// +// Struct: ac_exception / ac_channel_exception +//////////////////////////////////////////////// + +#ifndef __INCLUDED_AC_EXCEPTION +# define __INCLUDED_AC_EXCEPTION +struct ac_exception { + const char *const file; + const unsigned int line; + const int code; + const char *const msg; + ac_exception(const char *file_, const unsigned int &line_, const int &code_, const char *msg_) + : file(file_), line(line_), code(code_), msg(msg_) { } +}; +#endif + +struct ac_channel_exception { + enum { code_begin = 1024 }; + enum code { + read_from_empty_channel = code_begin, + fifo_not_empty_when_reset, + no_operator_sb_defined_for_channel_type, + no_insert_defined_for_channel_type, + no_size_in_connections, + no_num_free_in_connections, + no_output_empty_in_connections + }; + static inline const char *msg(const code &code_) { + static const char *const s[] = { + "Read from empty channel", + "fifo not empty when reset", + "No operator[] defined for channel type", + "No insert defined for channel type", + "Connections does not support size()", + "Connections does not support num_free()", + "Connections::Out does not support empty()" + }; + return s[code_-code_begin]; + } +}; + +/////////////////////////////////////////// +// Class: ac_channel +////////////////////////////////////////// + +template +class ac_channel { +public: + typedef T element_type; + + // constructors + ac_channel(); + ac_channel(int init); + ac_channel(int init, T val); + + T read() { return chan.read(); } + void read(T& t) { t = read(); } + bool nb_read(T& t) { return chan.nb_read(t); } + + void write(const T& t) { chan.write(t); } + bool nb_write(T& t) { + chan.incr_size_call_count(); + return chan.nb_write(t); + } + + unsigned int size() { + chan.incr_size_call_count(); + return chan.size(); + } + bool empty() { return chan.empty(); } + + // Return true if channel has at least k entries + bool available(unsigned int k) const { return chan.available(k); } + + void reset() { chan.reset(); } + + unsigned int debug_size() const { return chan.size(); } + + const T &operator[](unsigned int pos) const { return chan[pos]; } + + int get_size_call_count() { return chan.get_size_call_count(); } + +#ifdef SYSTEMC_INCLUDED + void bind(sc_fifo_in &f) { chan.bind(f); } + void bind(sc_fifo_out &f) { chan.bind(f); } +#endif + +#ifdef __CONNECTIONS__CONNECTIONS_H__ + void bind(Connections::Out& c) { chan.bind(c); } + void bind(Connections::In& c) { chan.bind(c); } + void bind(Connections::SyncIn &c) { chan.bind(c); } + void bind(Connections::SyncOut &c) { chan.bind(c); } +#endif + +private: +# ifndef AC_CHANNEL_ASSERT +# define AC_CHANNEL_ASSERT(cond, code) ac_assert(cond, __FILE__, __LINE__, code) + static inline void ac_assert(bool condition, const char *file, int line, const ac_channel_exception::code &code) { +# ifndef AC_USER_DEFINED_ASSERT + if(!condition) { + const ac_exception e(file, line, code, ac_channel_exception::msg(code)); +# ifdef 
AC_ASSERT_THROW_EXCEPTION +# ifdef AC_ASSERT_THROW_EXCEPTION_AS_CONST_CHAR + throw(e.msg); +# else + throw(e); +# endif +# else + std::cerr << "Assert"; + if(e.file) + std::cerr << " in file " << e.file << ":" << e.line; + std::cerr << " " << e.msg << std::endl; + assert(0); +# endif + } +# else + AC_USER_DEFINED_ASSERT(condition, file, line, ac_channel_exception::msg(code)); +# endif + } +# else +# error "private use only - AC_CHANNEL_ASSERT macro already defined" +# endif + +public: + class fifo { + enum fifo_type { + fifo_ac_channel_type, + fifo_sc_fifo_type, + fifo_connections_type, + fifo_connections_sync_type + }; + + struct fifo_abstract { + virtual ~fifo_abstract() {} + virtual fifo_type get_fifo_type() const = 0; + virtual T read() = 0; + virtual bool nb_read(T& t) = 0; + virtual void write(const T& t) = 0; + virtual bool nb_write(T& t) = 0; + virtual bool empty() = 0; + virtual bool available(unsigned int k) const = 0; + virtual unsigned int size() const = 0; + virtual unsigned int num_free() const = 0; + virtual void reset() = 0; + virtual const T &operator_sb(const unsigned int &pos, const T &default_value) const = 0; + }; + + struct fifo_ac_channel : fifo_abstract { + std::deque ch; + + ~fifo_ac_channel() {} + + static inline fifo_type ftype() { return fifo_ac_channel_type; } + + fifo_type get_fifo_type() const { return ftype(); } + + T read() { + { + // If you hit this assert you attempted a read on an empty channel. Perhaps + // you need to guard the execution of the read with a call to the available() + // function: + // if (myInputChan.available(2)) { + // // it is safe to read two values + // cout << myInputChan.read(); + // cout << myInputChan.read(); + // } + AC_CHANNEL_ASSERT(!empty(), ac_channel_exception::read_from_empty_channel); + } + T t = ch.front(); + ch.pop_front(); + return t; + } + bool nb_read(T& t) { return empty() ? false : (t = read(), true); } + + void write(const T& t) { ch.push_back(t); } + bool nb_write(T& t) { return !num_free() ? false : (write(t), true); } + + bool empty() { return size() == 0; } + bool available(unsigned int k) const { return size() >= k; } + unsigned int size() const { return (int)ch.size(); } + unsigned int num_free() const { return ch.max_size() - ch.size(); } + + void reset() { ch.clear(); } + + const T &operator_sb(const unsigned int &pos, const T &) const { + return ch[pos]; + } + }; + +#ifdef SYSTEMC_INCLUDED + struct fifo_sc_fifo : fifo_abstract { + sc_fifo_in *fifo_in; + sc_fifo_out *fifo_out; + + ~fifo_sc_fifo() {} + + static inline fifo_type ftype() { return fifo_sc_fifo_type; } + + fifo_type get_fifo_type() const { return ftype(); } + + T read() { return fifo_in->read(); } + bool nb_read(T& t) { return empty() ? false : (t = read(), true); } + + void write(const T& t) { fifo_out->write(t); } + bool nb_write(T& t) { return !num_free() ? 
false : (write(t), true); } + + bool empty() { return size() == 0; } + bool available(unsigned int k) const { return size() >= k; } + unsigned int size() const { return fifo_in->num_available(); } + unsigned int num_free() const { return fifo_out->num_free(); } + + void reset() { + AC_CHANNEL_ASSERT(empty(), ac_channel_exception::fifo_not_empty_when_reset); + } + + const T &operator_sb(const unsigned int &, const T &default_value) const { + AC_CHANNEL_ASSERT(0, ac_channel_exception::no_operator_sb_defined_for_channel_type); + return default_value; + } + }; +public: + void bind(sc_fifo_in &f) { get_fifo().fifo_in = &f; } + void bind(sc_fifo_out &f) { get_fifo().fifo_out = &f; } +private: +#endif + +#ifdef __CONNECTIONS__CONNECTIONS_H__ + struct fifo_connections : fifo_abstract { + Connections::In *fifo_in; + Connections::Out *fifo_out; + + ~fifo_connections() {} + static inline fifo_type ftype() { return fifo_connections_type; } + fifo_type get_fifo_type() const { return ftype(); } + + T read() { return fifo_in->Pop(); } + bool nb_read(T& t) { return fifo_in->PopNB(t); } + + void write(const T& t) { fifo_out->Push(t); } + bool nb_write(T& t) { return fifo_out->PushNB(t); } + + bool empty() { + if (fifo_in) + return fifo_in->Empty(); + else + AC_CHANNEL_ASSERT(0, ac_channel_exception::no_output_empty_in_connections); + return false; + } + bool available(unsigned int k) const { return true; } + unsigned int size() const { + AC_CHANNEL_ASSERT(0, ac_channel_exception::no_size_in_connections); + return 0; + } + unsigned int num_free() const { + AC_CHANNEL_ASSERT(0, ac_channel_exception::no_num_free_in_connections); + return 0; + } + + void reset() { + AC_CHANNEL_ASSERT(empty(), ac_channel_exception::fifo_not_empty_when_reset); + } + + const T &operator_sb(const unsigned int &, const T &default_value) const { + AC_CHANNEL_ASSERT(0, ac_channel_exception::no_operator_sb_defined_for_channel_type); + return default_value; + } + }; + + struct fifo_connections_sync : fifo_abstract { + Connections::SyncIn *sync_in; + Connections::SyncOut *sync_out; + + ~fifo_connections_sync() {} + static inline fifo_type ftype() { return fifo_connections_sync_type; } + fifo_type get_fifo_type() const { return ftype(); } + + bool read() { sync_in->sync_in(); return true; } + bool nb_read(T& t) { t=true; return(sync_in->nb_sync_in()); } + + void write(const T& t) { sync_out->sync_out(); } + bool nb_write(T& t) { sync_out->sync_out(); return true; } + + bool empty() { + AC_CHANNEL_ASSERT(0, ac_channel_exception::no_output_empty_in_connections); + return(false); + } + bool available(unsigned int k) const { return true; } + unsigned int size() const { + AC_CHANNEL_ASSERT(0, ac_channel_exception::no_size_in_connections); + return 0; + } + unsigned int num_free() const { + AC_CHANNEL_ASSERT(0, ac_channel_exception::no_num_free_in_connections); + return 0; + } + void reset() { + if (sync_in) sync_in->reset_sync_in(); + if (sync_out) sync_out->reset_sync_out(); + } + const T &operator_sb(const unsigned int &, const T &default_value) const { + AC_CHANNEL_ASSERT(0, ac_channel_exception::no_operator_sb_defined_for_channel_type); + return default_value; + } + }; + + + public: + void bind(Connections::In& c) { get_fifo().fifo_in = &c; } + void bind(Connections::Out& c) { get_fifo().fifo_out = &c; } + + void bind(Connections::SyncIn &c) { get_fifo().sync_in = &c; } + void bind(Connections::SyncOut &c) { get_fifo().sync_out = &c; } + + private: +#endif + + template + fifo_T &get_fifo() { + if (!f || f->get_fifo_type() != 
fifo_T::ftype()) { + if (f) { + AC_CHANNEL_ASSERT(f->empty(), ac_channel_exception::fifo_not_empty_when_reset); + delete f; + } + f = new fifo_T; + } + return static_cast(*f); + } + + fifo_abstract *f; + unsigned int rSz; // reset size + T rVal; // resetValue + int size_call_count; + + public: + fifo() : f(0), rSz(0), size_call_count(0) { get_fifo(); } + fifo(int init) : f(0), rSz(init), size_call_count(0) { get_fifo(); } + fifo(int init, T val) : f(0), rSz(init), rVal(val), size_call_count(0) { get_fifo(); } + ~fifo() { delete f; } + + inline T read() { return f->read(); } + inline bool nb_read(T& t) { return f->nb_read(t); } + + inline void write(const T& t) { f->write(t); } + inline bool nb_write(T& t) { return f->nb_write(t); } + + inline bool empty() { return f->empty(); } + inline bool available(unsigned int k) const { return f->available(k); } + inline unsigned int size() const { return f->size(); } + inline unsigned int num_free() const { return f->num_free(); } + + inline void reset() { + f->reset(); + for (int i=0; i<(int)rSz; i++) + write(rVal); + } + + inline const T &operator[](unsigned int pos) const { return f->operator_sb(pos, rVal); } + + void incr_size_call_count() { ++size_call_count; } + int get_size_call_count() { + int tmp=size_call_count; + size_call_count=0; + return tmp; + } + + // obsolete - provided here for backward compatibility with ac_channel + struct iterator { + iterator operator+(unsigned int pos_) const { + return iterator(itr, pos_); + } + private: + friend class fifo; + iterator(const typename std::deque::iterator &itr_, unsigned int pos=0) + : itr(itr_) { if (pos) itr += pos; } + typename std::deque::iterator itr; + }; + iterator begin() { + AC_CHANNEL_ASSERT(f->get_fifo_type() == fifo_ac_channel_type, ac_channel_exception::no_insert_defined_for_channel_type); + return iterator(get_fifo().ch.begin()); + } + void insert(iterator itr, const T& t) { + AC_CHANNEL_ASSERT(f->get_fifo_type() == fifo_ac_channel_type, ac_channel_exception::no_insert_defined_for_channel_type); + get_fifo().ch.insert(itr.itr,t); + } + }; + fifo chan; + +private: + // Prevent the compiler from autogenerating these. + // (This enforces that channels are always passed by reference.) + ac_channel(const ac_channel< T >&); + ac_channel& operator=(const ac_channel< T >&); +}; + +template +ac_channel::ac_channel() : chan() {} + +template +ac_channel::ac_channel(int init) : chan(init) +{ + for (int i=init; i>0; i--) { + T dc; + write(dc); + } +} + +template +ac_channel::ac_channel(int init, T val) : chan(init, val) +{ + for (int i=init; i>0; i--) + write(val); +} + +template +inline std::ostream& operator<< (std::ostream& os, ac_channel &a) +{ + for (unsigned int i=0; i 0) os << " "; + os << a[i]; + } + return os; +} + +// This general case is meant to cover non channel (or array of them) args +// Its result will be ignored +template +bool nb_read_chan_rdy(T &x) { return true; } + +template +bool nb_read_chan_rdy(ac_channel &chan) { return !chan.empty(); } + +template +bool nb_read_chan_rdy(ac_channel (&chan)[N] ) { + bool r = true; + for(int i=0; i 199711L +template +bool nb_read_chan_rdy(Args&... args) { + const int n_args = sizeof...(args); + // only every other arg is a channel (or an array of channels) + bool rdy[n_args] = { (nb_read_chan_rdy(args))... 
}; + bool r = true; + for(int i=0; i < n_args; i+=2) + r &= rdy[i]; + return r; +} +#endif + +template +void nb_read_r(ac_channel &chan, T &var) { + chan.nb_read(var); +} + +template +void nb_read_r(ac_channel (&chan)[N], T (&var)[N]) { + for(int i=0; i 199711L +template +void nb_read_r(ac_channel &chan, T &var, Args&... args) { + chan.nb_read(var); + nb_read_r(args...); +} + +template +void nb_read_r(ac_channel (&chan)[N], T (&var)[N], Args&... args) { + for(int i=0; i +bool nb_read_join(Args&... args) { + if(nb_read_chan_rdy(args...)) { + nb_read_r(args...); + return true; + } + return false; +} +#endif + +/* undo macro adjustments */ +#ifdef AC_CHANNEL_ASSERT +# undef AC_CHANNEL_ASSERT +#endif + +#endif diff --git a/hls4ml/templates/quartus/ac_types/ac_complex.h b/hls4ml/templates/quartus/ac_types/ac_complex.h index 56821a053d..555b4c89d2 100644 --- a/hls4ml/templates/quartus/ac_types/ac_complex.h +++ b/hls4ml/templates/quartus/ac_types/ac_complex.h @@ -1,445 +1,445 @@ -/************************************************************************** - * * - * Algorithmic C (tm) Datatypes * - * * - * Software Version: 4.0 * - * * - * Release Date : Sat Jun 13 12:35:18 PDT 2020 * - * Release Type : Production Release * - * Release Build : 4.0.0 * - * * - * Copyright 2008-2019, Mentor Graphics Corporation, * - * * - * All Rights Reserved. * - * * - ************************************************************************** - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * - * implied. * - * See the License for the specific language governing permissions and * - * limitations under the License. * - ************************************************************************** - * * - * The most recent version of this package is available at github. * - * * - *************************************************************************/ - -/* -// Source: ac_complex.h -// Description: complex type with parameterized type that can be: -// - C integer types -// - C floating point types -// - ac_int -// - ac_fixed -// - ac_float -// ac_complex based on C integers, ac_int, ac_fixed and ac_float can -// be mixed -// Author: Andres Takach, Ph.D. 
-*/ - -#ifndef __AC_COMPLEX_H -#define __AC_COMPLEX_H - -#include - -#ifdef __AC_NAMESPACE -namespace __AC_NAMESPACE { -#endif - -template class ac_complex; - -namespace ac_private { - // specializations after definition of ac_complex - template - struct rt_ac_complex_T { - template - struct op1 { - typedef typename T::template rt_T< ac_complex >::mult mult; - typedef typename T::template rt_T< ac_complex >::plus plus; - typedef typename T::template rt_T< ac_complex >::minus2 minus; - typedef typename T::template rt_T< ac_complex >::minus minus2; - typedef typename T::template rt_T< ac_complex >::logic logic; - typedef typename T::template rt_T< ac_complex >::div2 div; - typedef typename T::template rt_T< ac_complex >::div div2; - }; - }; -} // namespace ac_private - -template -class ac_complex { -public: // temporary workaround - T _r; - T _i; - typedef typename ac_private::map::t map_T; - typedef typename map_T::rt_unary::mag_sqr T_sqr; - typedef typename ac_private::map::t map_T_sqr; - typedef typename ac_private::map::t map_T_mag; -public: - typedef T element_type; - template - struct rt_T { - typedef typename ac_private::map::t map_T2; - typedef typename ac_private::rt_ac_complex_T::template op1::mult mult; - typedef typename ac_private::rt_ac_complex_T::template op1::plus plus; - typedef typename ac_private::rt_ac_complex_T::template op1::minus minus; - typedef typename ac_private::rt_ac_complex_T::template op1::minus2 minus2; - typedef typename ac_private::rt_ac_complex_T::template op1::logic logic; - typedef typename ac_private::rt_ac_complex_T::template op1::div div; - typedef typename ac_private::rt_ac_complex_T::template op1::div2 div2; - typedef ac_complex arg1; - }; - - struct rt_unary { - typedef typename map_T_sqr::template rt_T::plus mag_sqr; - typedef typename map_T_mag::template rt_T::plus mag; // overly conservative for signed - typedef ac_complex neg; - template - struct set { - typedef ac_complex::sum> sum; - }; - }; - - ac_complex() { } - template - ac_complex(const ac_complex &c) : _r(c.r()), _i(c.i()) {} - template - ac_complex(const T2 &r) : _r(r), _i(0) {} - template - ac_complex(const T2 &r, const T3 &i) : _r(r), _i(i) {} - const T &r() const { return _r; } - const T &i() const { return _i; } - T &r() { return _r; } - T &i() { return _i; } - const T &real() const { return _r; } - const T &imag() const { return _i; } - T &real() { return _r; } - T &imag() { return _i; } - template - void set_r(const T2 &r) { _r = r;} - template - void set_i(const T2 &i) { _i = i;} - - // const binary operators are global rather than members because of compiler errors due to ambiguity - // (would appear as a compiler bug) - - template - ac_complex &operator +=(const ac_complex &op2) { - _r += op2.r(); - _i += op2.i(); - return *this; - } - - template - ac_complex &operator +=(const T2 &op2) { - _r += op2; - return *this; - } - - template - ac_complex &operator -=(const ac_complex &op2) { - _r -= op2.r(); - _i -= op2.i(); - return *this; - } - - template - ac_complex &operator -=(const T2 &op2) { - _r -= op2; - return *this; - } - - template - ac_complex &operator *=(const ac_complex &op2) { - T r0 = _r*op2.r() - _i*op2.i(); - _i = _r*op2.i() + _i*op2.r(); - _r = r0; - return *this; - } - - template - ac_complex &operator *=(const T2 &op2) { - _r = _r*op2; - _i = _i*op2; - return *this; - } - - template - ac_complex &operator /=(const ac_complex &op2) { - typename ac_complex::rt_unary::mag_sqr d = op2.mag_sqr(); - T r0 = (_r*op2.r() + _i*op2.i())/d; - _i = (_i*op2.r() - 
_r*op2.i())/d; - _r = r0; - return *this; - } - - template - ac_complex &operator /=(const T2 &op2) { - _r = _r/op2; - _i = _i/op2; - return *this; - } - - // Arithmetic Unary -------------------------------------------------------- - ac_complex operator +() { - return *this; - } - typename rt_unary::neg operator -() const { - typename rt_unary::neg res(-_r, -_i); - return res; - } - - // ! ------------------------------------------------------------------------ - bool operator ! () const { - return !_r && !_i; - } - - typename rt_unary::neg conj() const { - typename rt_unary::neg res(_r, -_i); - return res; - } - - typename rt_unary::mag_sqr mag_sqr() const { - return _r*_r + _i*_i; - } - - ac_complex< ac_int<2,true> > sign_conj() const { - return ac_complex< ac_int<2,true> >( - _r ? (_r < 0 ? -1 : 1) : 0, - _i ? (_i < 0 ? 1 : -1) : 0 - ); - } - - inline static std::string type_name() { - typedef typename ac_private::map::t map_T; - std::string r = "ac_complex<"; - r += map_T::type_name(); - r += '>'; - return r; - } - -}; - -namespace ac_private { - // with T2 == ac_complex - template - struct rt_ac_complex_T< ac_complex > { - template - struct op1 { - typedef ac_complex::plus> plus; - typedef ac_complex::minus> minus; - typedef ac_complex::minus2> minus2; - typedef ac_complex::logic> logic; - typedef ac_complex::div> div; - typedef ac_complex::div2> div2; - typedef ac_complex::mult, typename ac::rt_2T::mult>::plus, - typename ac::rt_2T::mult, typename ac::rt_2T::mult>::minus - >::logic> mult; - }; - }; - // with T2 == ac_float - template< AC_FL_T0(2) > - struct rt_ac_complex_T< AC_FL0(2) > { - typedef AC_FL0(2) T2; - template - struct op1 { - typedef ac_complex::plus> plus; - typedef ac_complex::minus> minus; - typedef ac_complex::minus2> minus2; - typedef ac_complex::logic> logic; - typedef ac_complex::div> div; - typedef ac_complex::div2> div2; - typedef ac_complex::mult> mult; - }; - }; - // with T2 == ac_fixed - template - struct rt_ac_complex_T< ac_fixed > { - typedef ac_fixed T2; - template - struct op1 { - typedef ac_complex::plus> plus; - typedef ac_complex::minus> minus; - typedef ac_complex::minus2> minus2; - typedef ac_complex::logic> logic; - typedef ac_complex::div> div; - typedef ac_complex::div2> div2; - typedef ac_complex::mult> mult; - }; - }; - // with T2 == ac_int - template - struct rt_ac_complex_T< ac_int > { - typedef ac_int T2; - template - struct op1 { - typedef ac_complex::plus> plus; - typedef ac_complex::minus> minus; - typedef ac_complex::minus2> minus2; - typedef ac_complex::logic> logic; - typedef ac_complex::div> div; - typedef ac_complex::div2> div2; - typedef ac_complex::mult> mult; - }; - }; - // with T2 == c_type - template - struct rt_ac_complex_T< c_type > { - typedef c_type T2; - template - struct op1 { - typedef ac_complex::plus> plus; - typedef ac_complex::minus> minus; - typedef ac_complex::minus2> minus2; - typedef ac_complex::logic> logic; - typedef ac_complex::div> div; - typedef ac_complex::div2> div2; - typedef ac_complex::mult> mult; - }; - }; -} - -template -inline typename ac_complex::template rt_T >::plus operator +(const ac_complex &op, const ac_complex &op2) { - typename ac_complex::template rt_T >::plus res( op.r() + op2.r(), op.i() + op2.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T::plus operator +(const T &op, const ac_complex &op2) { - typename ac_complex::template rt_T::plus res( op + op2.r(), op2.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T::plus operator +(const 
ac_complex &op, const T2 &op2) { - typename ac_complex::template rt_T::plus res( op.r() + op2, op.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T >::minus operator -(const ac_complex &op, const ac_complex &op2) { - typename ac_complex::template rt_T >::minus res( op.r() - op2.r(), op.i() - op2.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T::minus2 operator -(const T &op, const ac_complex &op2) { - typename ac_complex::template rt_T::minus2 res( op - op2.r(), -op2.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T::minus operator -(const ac_complex &op, const T2 &op2) { - typename ac_complex::template rt_T::minus res( op.r() - op2, op.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T >::mult operator *(const ac_complex &op, const ac_complex &op2) { - typename ac_complex::template rt_T >::mult res( op.r()*op2.r() - op.i()*op2.i(), op.i()*op2.r() + op.r()*op2.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T::mult operator *(const T &op, const ac_complex &op2) { - typename ac_complex::template rt_T::mult res( op*op2.r(), op*op2.i()); - return res; -} - -template -inline typename ac_complex::template rt_T::mult operator *(const ac_complex &op, const T2 &op2) { - typename ac_complex::template rt_T::mult res( op.r()*op2, op.i()*op2 ); - return res; -} - -template -inline typename ac_complex::template rt_T >::div operator /(const ac_complex &op, const ac_complex &op2) { - typename ac_complex::rt_unary::mag_sqr d = op2.mag_sqr(); - typename ac_complex::template rt_T >::div res((op.r()*op2.r() + op.i()*op2.i())/d, (op.i()*op2.r() - op.r()*op2.i())/d); - return res; -} - -template -inline typename ac_complex::template rt_T::div operator /(const ac_complex &op, const T2 &op2) { - typename ac_complex::template rt_T::div res( op.r()/op2, op.i()/op2 ); - return res; -} - -template -inline typename ac_complex::template rt_T::div2 operator /(const T &op, const ac_complex &op2) { - typename ac_complex::rt_unary::mag_sqr d = op2.mag_sqr(); - typename ac_complex::template rt_T::div2 res(op*op2.r()/d, - op*op2.i()/d); - return res; -} - -template -inline bool operator == (const ac_complex &op, const ac_complex &op2) { - return op.r() == op2.r() && op.i() == op2.i(); -} - -template -inline bool operator == (const T &op, const ac_complex &op2) { - return op == op2.r() && op2.i() == 0; -} - -template -inline bool operator == (const ac_complex &op, const T2 &op2) { - return op.r() == op2 && op.i() == 0; -} - -template -inline bool operator != (const ac_complex &op, const ac_complex &op2) { - return op.r() != op2.r() || op.i() != op2.i(); -} - -template -inline bool operator != (const T &op, const ac_complex &op2) { - return op != op2.r() || op2.i() != 0; -} - -template -inline bool operator != (const ac_complex &op, const T2 &op2) { - return op.r() != op2 || op.i() != 0; -} - -// Stream -------------------------------------------------------------------- - -template -inline std::ostream& operator << (std::ostream &os, const ac_complex &x) { -#ifndef __SYNTHESIS__ - os << "(" << x.r() << ", " << x.i() << ")"; -#endif - return os; -} - -template -inline ac_complex value(ac_complex) { - T val = value((T) 0); - ac_complex r(val, val); - return r; -} - -namespace ac { - template - inline bool init_array(ac_complex *a, int n) { - T val = value((T) 0); - ac_complex t(val, val); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } -} - -#ifdef __AC_NAMESPACE -} -#endif 
- -#endif // __AC_COMPLEX_H +/************************************************************************** + * * + * Algorithmic C (tm) Datatypes * + * * + * Software Version: 4.0 * + * * + * Release Date : Sat Jun 13 12:35:18 PDT 2020 * + * Release Type : Production Release * + * Release Build : 4.0.0 * + * * + * Copyright 2008-2019, Mentor Graphics Corporation, * + * * + * All Rights Reserved. * + * * + ************************************************************************** + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * + * implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + ************************************************************************** + * * + * The most recent version of this package is available at github. * + * * + *************************************************************************/ + +/* +// Source: ac_complex.h +// Description: complex type with parameterized type that can be: +// - C integer types +// - C floating point types +// - ac_int +// - ac_fixed +// - ac_float +// ac_complex based on C integers, ac_int, ac_fixed and ac_float can +// be mixed +// Author: Andres Takach, Ph.D. +*/ + +#ifndef __AC_COMPLEX_H +#define __AC_COMPLEX_H + +#include + +#ifdef __AC_NAMESPACE +namespace __AC_NAMESPACE { +#endif + +template class ac_complex; + +namespace ac_private { + // specializations after definition of ac_complex + template + struct rt_ac_complex_T { + template + struct op1 { + typedef typename T::template rt_T< ac_complex >::mult mult; + typedef typename T::template rt_T< ac_complex >::plus plus; + typedef typename T::template rt_T< ac_complex >::minus2 minus; + typedef typename T::template rt_T< ac_complex >::minus minus2; + typedef typename T::template rt_T< ac_complex >::logic logic; + typedef typename T::template rt_T< ac_complex >::div2 div; + typedef typename T::template rt_T< ac_complex >::div div2; + }; + }; +} // namespace ac_private + +template +class ac_complex { +public: // temporary workaround + T _r; + T _i; + typedef typename ac_private::map::t map_T; + typedef typename map_T::rt_unary::mag_sqr T_sqr; + typedef typename ac_private::map::t map_T_sqr; + typedef typename ac_private::map::t map_T_mag; +public: + typedef T element_type; + template + struct rt_T { + typedef typename ac_private::map::t map_T2; + typedef typename ac_private::rt_ac_complex_T::template op1::mult mult; + typedef typename ac_private::rt_ac_complex_T::template op1::plus plus; + typedef typename ac_private::rt_ac_complex_T::template op1::minus minus; + typedef typename ac_private::rt_ac_complex_T::template op1::minus2 minus2; + typedef typename ac_private::rt_ac_complex_T::template op1::logic logic; + typedef typename ac_private::rt_ac_complex_T::template op1::div div; + typedef typename ac_private::rt_ac_complex_T::template op1::div2 div2; + typedef ac_complex arg1; + }; + + struct rt_unary { + typedef typename map_T_sqr::template rt_T::plus mag_sqr; + typedef typename map_T_mag::template rt_T::plus mag; // overly conservative for signed + typedef ac_complex neg; + template + struct set { + 
typedef ac_complex::sum> sum; + }; + }; + + ac_complex() { } + template + ac_complex(const ac_complex &c) : _r(c.r()), _i(c.i()) {} + template + ac_complex(const T2 &r) : _r(r), _i(0) {} + template + ac_complex(const T2 &r, const T3 &i) : _r(r), _i(i) {} + const T &r() const { return _r; } + const T &i() const { return _i; } + T &r() { return _r; } + T &i() { return _i; } + const T &real() const { return _r; } + const T &imag() const { return _i; } + T &real() { return _r; } + T &imag() { return _i; } + template + void set_r(const T2 &r) { _r = r;} + template + void set_i(const T2 &i) { _i = i;} + + // const binary operators are global rather than members because of compiler errors due to ambiguity + // (would appear as a compiler bug) + + template + ac_complex &operator +=(const ac_complex &op2) { + _r += op2.r(); + _i += op2.i(); + return *this; + } + + template + ac_complex &operator +=(const T2 &op2) { + _r += op2; + return *this; + } + + template + ac_complex &operator -=(const ac_complex &op2) { + _r -= op2.r(); + _i -= op2.i(); + return *this; + } + + template + ac_complex &operator -=(const T2 &op2) { + _r -= op2; + return *this; + } + + template + ac_complex &operator *=(const ac_complex &op2) { + T r0 = _r*op2.r() - _i*op2.i(); + _i = _r*op2.i() + _i*op2.r(); + _r = r0; + return *this; + } + + template + ac_complex &operator *=(const T2 &op2) { + _r = _r*op2; + _i = _i*op2; + return *this; + } + + template + ac_complex &operator /=(const ac_complex &op2) { + typename ac_complex::rt_unary::mag_sqr d = op2.mag_sqr(); + T r0 = (_r*op2.r() + _i*op2.i())/d; + _i = (_i*op2.r() - _r*op2.i())/d; + _r = r0; + return *this; + } + + template + ac_complex &operator /=(const T2 &op2) { + _r = _r/op2; + _i = _i/op2; + return *this; + } + + // Arithmetic Unary -------------------------------------------------------- + ac_complex operator +() { + return *this; + } + typename rt_unary::neg operator -() const { + typename rt_unary::neg res(-_r, -_i); + return res; + } + + // ! ------------------------------------------------------------------------ + bool operator ! () const { + return !_r && !_i; + } + + typename rt_unary::neg conj() const { + typename rt_unary::neg res(_r, -_i); + return res; + } + + typename rt_unary::mag_sqr mag_sqr() const { + return _r*_r + _i*_i; + } + + ac_complex< ac_int<2,true> > sign_conj() const { + return ac_complex< ac_int<2,true> >( + _r ? (_r < 0 ? -1 : 1) : 0, + _i ? (_i < 0 ? 
1 : -1) : 0 + ); + } + + inline static std::string type_name() { + typedef typename ac_private::map::t map_T; + std::string r = "ac_complex<"; + r += map_T::type_name(); + r += '>'; + return r; + } + +}; + +namespace ac_private { + // with T2 == ac_complex + template + struct rt_ac_complex_T< ac_complex > { + template + struct op1 { + typedef ac_complex::plus> plus; + typedef ac_complex::minus> minus; + typedef ac_complex::minus2> minus2; + typedef ac_complex::logic> logic; + typedef ac_complex::div> div; + typedef ac_complex::div2> div2; + typedef ac_complex::mult, typename ac::rt_2T::mult>::plus, + typename ac::rt_2T::mult, typename ac::rt_2T::mult>::minus + >::logic> mult; + }; + }; + // with T2 == ac_float + template< AC_FL_T0(2) > + struct rt_ac_complex_T< AC_FL0(2) > { + typedef AC_FL0(2) T2; + template + struct op1 { + typedef ac_complex::plus> plus; + typedef ac_complex::minus> minus; + typedef ac_complex::minus2> minus2; + typedef ac_complex::logic> logic; + typedef ac_complex::div> div; + typedef ac_complex::div2> div2; + typedef ac_complex::mult> mult; + }; + }; + // with T2 == ac_fixed + template + struct rt_ac_complex_T< ac_fixed > { + typedef ac_fixed T2; + template + struct op1 { + typedef ac_complex::plus> plus; + typedef ac_complex::minus> minus; + typedef ac_complex::minus2> minus2; + typedef ac_complex::logic> logic; + typedef ac_complex::div> div; + typedef ac_complex::div2> div2; + typedef ac_complex::mult> mult; + }; + }; + // with T2 == ac_int + template + struct rt_ac_complex_T< ac_int > { + typedef ac_int T2; + template + struct op1 { + typedef ac_complex::plus> plus; + typedef ac_complex::minus> minus; + typedef ac_complex::minus2> minus2; + typedef ac_complex::logic> logic; + typedef ac_complex::div> div; + typedef ac_complex::div2> div2; + typedef ac_complex::mult> mult; + }; + }; + // with T2 == c_type + template + struct rt_ac_complex_T< c_type > { + typedef c_type T2; + template + struct op1 { + typedef ac_complex::plus> plus; + typedef ac_complex::minus> minus; + typedef ac_complex::minus2> minus2; + typedef ac_complex::logic> logic; + typedef ac_complex::div> div; + typedef ac_complex::div2> div2; + typedef ac_complex::mult> mult; + }; + }; +} + +template +inline typename ac_complex::template rt_T >::plus operator +(const ac_complex &op, const ac_complex &op2) { + typename ac_complex::template rt_T >::plus res( op.r() + op2.r(), op.i() + op2.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T::plus operator +(const T &op, const ac_complex &op2) { + typename ac_complex::template rt_T::plus res( op + op2.r(), op2.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T::plus operator +(const ac_complex &op, const T2 &op2) { + typename ac_complex::template rt_T::plus res( op.r() + op2, op.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T >::minus operator -(const ac_complex &op, const ac_complex &op2) { + typename ac_complex::template rt_T >::minus res( op.r() - op2.r(), op.i() - op2.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T::minus2 operator -(const T &op, const ac_complex &op2) { + typename ac_complex::template rt_T::minus2 res( op - op2.r(), -op2.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T::minus operator -(const ac_complex &op, const T2 &op2) { + typename ac_complex::template rt_T::minus res( op.r() - op2, op.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T >::mult operator 
*(const ac_complex &op, const ac_complex &op2) { + typename ac_complex::template rt_T >::mult res( op.r()*op2.r() - op.i()*op2.i(), op.i()*op2.r() + op.r()*op2.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T::mult operator *(const T &op, const ac_complex &op2) { + typename ac_complex::template rt_T::mult res( op*op2.r(), op*op2.i()); + return res; +} + +template +inline typename ac_complex::template rt_T::mult operator *(const ac_complex &op, const T2 &op2) { + typename ac_complex::template rt_T::mult res( op.r()*op2, op.i()*op2 ); + return res; +} + +template +inline typename ac_complex::template rt_T >::div operator /(const ac_complex &op, const ac_complex &op2) { + typename ac_complex::rt_unary::mag_sqr d = op2.mag_sqr(); + typename ac_complex::template rt_T >::div res((op.r()*op2.r() + op.i()*op2.i())/d, (op.i()*op2.r() - op.r()*op2.i())/d); + return res; +} + +template +inline typename ac_complex::template rt_T::div operator /(const ac_complex &op, const T2 &op2) { + typename ac_complex::template rt_T::div res( op.r()/op2, op.i()/op2 ); + return res; +} + +template +inline typename ac_complex::template rt_T::div2 operator /(const T &op, const ac_complex &op2) { + typename ac_complex::rt_unary::mag_sqr d = op2.mag_sqr(); + typename ac_complex::template rt_T::div2 res(op*op2.r()/d, - op*op2.i()/d); + return res; +} + +template +inline bool operator == (const ac_complex &op, const ac_complex &op2) { + return op.r() == op2.r() && op.i() == op2.i(); +} + +template +inline bool operator == (const T &op, const ac_complex &op2) { + return op == op2.r() && op2.i() == 0; +} + +template +inline bool operator == (const ac_complex &op, const T2 &op2) { + return op.r() == op2 && op.i() == 0; +} + +template +inline bool operator != (const ac_complex &op, const ac_complex &op2) { + return op.r() != op2.r() || op.i() != op2.i(); +} + +template +inline bool operator != (const T &op, const ac_complex &op2) { + return op != op2.r() || op2.i() != 0; +} + +template +inline bool operator != (const ac_complex &op, const T2 &op2) { + return op.r() != op2 || op.i() != 0; +} + +// Stream -------------------------------------------------------------------- + +template +inline std::ostream& operator << (std::ostream &os, const ac_complex &x) { +#ifndef __SYNTHESIS__ + os << "(" << x.r() << ", " << x.i() << ")"; +#endif + return os; +} + +template +inline ac_complex value(ac_complex) { + T val = value((T) 0); + ac_complex r(val, val); + return r; +} + +namespace ac { + template + inline bool init_array(ac_complex *a, int n) { + T val = value((T) 0); + ac_complex t(val, val); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } +} + +#ifdef __AC_NAMESPACE +} +#endif + +#endif // __AC_COMPLEX_H diff --git a/hls4ml/templates/quartus/ac_types/ac_fixed.h b/hls4ml/templates/quartus/ac_types/ac_fixed.h index cb95db8d16..458cbddee6 100644 --- a/hls4ml/templates/quartus/ac_types/ac_fixed.h +++ b/hls4ml/templates/quartus/ac_types/ac_fixed.h @@ -1,1546 +1,1546 @@ -/************************************************************************** - * * - * Algorithmic C (tm) Datatypes * - * * - * Software Version: 4.0 * - * * - * Release Date : Sat Jun 13 12:35:18 PDT 2020 * - * Release Type : Production Release * - * Release Build : 4.0.0 * - * * - * Copyright 2005-2020, Mentor Graphics Corporation, * - * * - * All Rights Reserved. 
* - * * - ************************************************************************** - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * - * implied. * - * See the License for the specific language governing permissions and * - * limitations under the License. * - ************************************************************************** - * * - * The most recent version of this package is available at github. * - * * - *************************************************************************/ - -/* -// Source: ac_fixed.h -// Description: class for fixed point operation handling in C++ -// Author: Andres Takach, Ph.D. -*/ - -#ifndef __AC_FIXED_H -#define __AC_FIXED_H - -#include "ac_int.h" - -#if (defined(__GNUC__) && __GNUC__ < 3 && !defined(__EDG__)) -#error GCC version 3 or greater is required to include this header file -#endif - -#if (defined(_MSC_VER) && _MSC_VER < 1400 && !defined(__EDG__)) -#error Microsoft Visual Studio 8 or newer is required to include this header file -#endif - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( push ) -#pragma warning( disable: 4127 4308 4365 4514 4800 ) -#endif - -#ifndef __SYNTHESIS__ -#ifndef __AC_FIXED_UTILITY_BASE -#define __AC_FIXED_UTILITY_BASE -#endif - -#endif - -#ifdef __SYNTHESIS__ -#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE -#undef __AC_FIXED_NUMERICAL_ANALYSIS_BASE -#endif -#endif - -#ifdef __AC_NAMESPACE -namespace __AC_NAMESPACE { -#endif - -namespace ac_private { - template - struct rt_ac_fixed_T { - template - struct op1 { - typedef typename T::template rt_T< ac_fixed >::mult mult; - typedef typename T::template rt_T< ac_fixed >::plus plus; - typedef typename T::template rt_T< ac_fixed >::minus2 minus; - typedef typename T::template rt_T< ac_fixed >::minus minus2; - typedef typename T::template rt_T< ac_fixed >::logic logic; - typedef typename T::template rt_T< ac_fixed >::div2 div; - typedef typename T::template rt_T< ac_fixed >::div div2; - }; - }; - // specializations after definition of ac_fixed -} - -namespace ac { - template - class basic_num_ovf_base { - bool d_enable; - public: - basic_num_ovf_base() : d_enable(true) {} - void enable_ovf(bool a) { d_enable = a; } - bool is_enabled() const { return d_enable; } - template - void update(bool overflow, bool neg, const basic_num_ovf_base &op2) { -#ifndef __AC_OVERRIDE_OVF_UPDATE_BODY - if(d_enable) { - if(overflow) { - std::cerr << (neg ? "-" : "+") << "OVF: "; - std::cerr << type_name() << " ( " << basic_num_ovf_base::type_name(); - std::cerr << " ( " << op2.value().to_double() << " ) )" << std::endl; - } - } -#else - __AC_OVERRIDE_OVF_UPDATE_BODY -#endif - } - void update(bool overflow, bool neg, double op2) { -#ifndef __AC_OVERRIDE_OVF_UPDATE2_BODY - if(d_enable) { - if(overflow) { - std::cerr << (neg ? 
"-" : "+") << "OVF: "; - std::cerr << type_name() << " ( " << "double"; - std::cerr << " ( " << op2 << " ) )" << std::endl; - } - } -#else - __AC_OVERRIDE_OVF_UPDATE2_BODY -#endif - } - const ac_fixed &value() const; - static std::string type_name(); - }; -} - -////////////////////////////////////////////////////////////////////////////// -// ac_fixed -////////////////////////////////////////////////////////////////////////////// - -//enum ac_q_mode { AC_TRN, AC_RND, AC_TRN_ZERO, AC_RND_ZERO, AC_RND_INF, AC_RND_MIN_INF, AC_RND_CONV, AC_RND_CONV_ODD }; -//enum ac_o_mode { AC_WRAP, AC_SAT, AC_SAT_ZERO, AC_SAT_SYM }; - -template -class ac_fixed : private ac_private::iv<(W+31+!S)/32> -#ifndef __SYNTHESIS__ -__AC_FIXED_UTILITY_BASE -#endif -#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE -, public __AC_FIXED_NUMERICAL_ANALYSIS_BASE -#endif -{ -#if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) -#pragma builtin -#endif - - enum {N=(W+31+!S)/32}; - - template - struct rt_priv { - enum {w_shiftl = AC_MAX(W+W2,1) }; - typedef ac_fixed shiftl; - }; - - typedef ac_private::iv Base; - - inline void bit_adjust() { - const unsigned rem = (32-W)&31; - Base::v[N-1] = S ? ((signed) ((unsigned)Base::v[N-1] << rem) >> rem) : (rem ? - ((unsigned) Base::v[N-1] << rem) >> rem : 0); - } - inline Base &base() { return *this; } - inline const Base &base() const { return *this; } - - inline void overflow_adjust(bool overflow, bool neg) { - if(O==AC_WRAP) { - bit_adjust(); - return; - } - else if(O==AC_SAT_ZERO) { - if(overflow) - ac_private::iv_extend(Base::v, 0); - else - bit_adjust(); - } - else if(S) { - if(overflow) { - if(!neg) { - ac_private::iv_extend(Base::v, ~0); - Base::v[N-1] = ~((unsigned)~0 << ((W-1)&31)); - } else { - ac_private::iv_extend(Base::v, 0); - Base::v[N-1] = ((unsigned)~0 << ((W-1)&31)); - if(O==AC_SAT_SYM) - Base::v[0] |= 1; - } - } else - bit_adjust(); - } - else { - if(overflow) { - if(!neg) { - ac_private::iv_extend(Base::v, ~0); - Base::v[N-1] = ~((unsigned)~0 << (W&31)); - } else - ac_private::iv_extend(Base::v, 0); - } else - bit_adjust(); - } - } - - inline bool quantization_adjust(bool qb, bool r, bool s) { - if(Q==AC_TRN) - return false; - if(Q==AC_RND_ZERO) - qb &= s || r; - else if(Q==AC_RND_MIN_INF) - qb &= r; - else if(Q==AC_RND_INF) - qb &= !s || r; - else if(Q==AC_RND_CONV) - qb &= (Base::v[0] & 1) || r; - else if(Q==AC_RND_CONV_ODD) - qb &= (!(Base::v[0] & 1)) || r; - else if(Q==AC_TRN_ZERO) - qb = s && ( qb || r ); - return ac_private::iv_uadd_carry(Base::v, qb, Base::v); - } - - inline bool is_neg() const { return S && Base::v[N-1] < 0; } - -public: - static const int width = W; - static const int i_width = I; - static const bool sign = S; - static const ac_o_mode o_mode = O; - static const ac_q_mode q_mode = Q; - static const int e_width = 0; -#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE - static const bool compute_overflow_for_wrap = true; -#else - static const bool compute_overflow_for_wrap = false; -#endif - - template - struct rt { - enum { - F=W-I, - F2=W2-I2, - mult_w = W+W2, - mult_i = I+I2, - mult_s = S||S2, - plus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), - plus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, - plus_s = S||S2, - minus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), - minus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, - minus_s = true, - div_w = W+AC_MAX(W2-I2,0)+S2, - div_i = I+(W2-I2)+S2, - div_s = S||S2, - logic_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+AC_MAX(F,F2), - logic_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2)), - logic_s = S||S2 - }; - typedef ac_fixed 
mult; - typedef ac_fixed plus; - typedef ac_fixed minus; - typedef ac_fixed logic; - typedef ac_fixed div; - typedef ac_fixed arg1; - }; - - template - struct rt_T { - typedef typename ac_private::map::t map_T; - typedef typename ac_private::rt_ac_fixed_T::template op1::mult mult; - typedef typename ac_private::rt_ac_fixed_T::template op1::plus plus; - typedef typename ac_private::rt_ac_fixed_T::template op1::minus minus; - typedef typename ac_private::rt_ac_fixed_T::template op1::minus2 minus2; - typedef typename ac_private::rt_ac_fixed_T::template op1::logic logic; - typedef typename ac_private::rt_ac_fixed_T::template op1::div div; - typedef typename ac_private::rt_ac_fixed_T::template op1::div2 div2; - typedef ac_fixed arg1; - }; - - struct rt_unary { - enum { - neg_w = W+1, - neg_i = I+1, - neg_s = true, - mag_sqr_w = 2*W-S, - mag_sqr_i = 2*I-S, - mag_sqr_s = false, - mag_w = W+S, - mag_i = I+S, - mag_s = false, - leading_sign_w = ac::log2_ceil::val, - leading_sign_s = false - }; - typedef ac_int leading_sign; - typedef ac_fixed neg; - typedef ac_fixed mag_sqr; - typedef ac_fixed mag; - template - struct set { - enum { sum_w = W + ac::log2_ceil::val, sum_i = (sum_w-W) + I, sum_s = S}; - typedef ac_fixed sum; - }; - }; - - ac_fixed(const ac_fixed &op): Base(op) { } - - template friend class ac_fixed; - ac_fixed() { -#if !defined(__SYNTHESIS__) && defined(AC_DEFAULT_IN_RANGE) - bit_adjust(); - if( O==AC_SAT_SYM && S && Base::v[N-1] < 0 && (W > 1 ? ac_private::iv_equal_zeros_to(Base::v) : true) ) - Base::v[0] |= 1; -#endif - } - template - inline ac_fixed (const ac_fixed &op) { - enum {N2=(W2+31+!S2)/32, F=W-I, F2=W2-I2, QUAN_INC = F2>F && !(Q==AC_TRN || (Q==AC_TRN_ZERO && !S2)) }; - bool carry = false; - // handle quantization - if(F2 == F) - Base::operator =(op); - else if(F2 > F) { - op.template const_shift_r(*this); -// ac_private::iv_const_shift_r(op.v, Base::v); - if(Q!=AC_TRN && !(Q==AC_TRN_ZERO && !S2)) { - bool qb = (F2-F > W2) ? (op.v[N2-1] < 0) : (bool) op[F2-F-1]; - bool r = (F2 > F+1) ? !ac_private::iv_equal_zeros_to(op.v) : false; - carry = quantization_adjust(qb, r, S2 && op.v[N2-1] < 0); - } - } - else // no quantization - op.template const_shift_l(*this); -// ac_private::iv_const_shift_l(op.v, Base::v); - // handle overflow - if((O!=AC_WRAP || compute_overflow_for_wrap) - && ((!S && S2) || I-S < I2-S2+(QUAN_INC || (S2 && O==AC_SAT_SYM && (O2 != AC_SAT_SYM || F2 > F) ))) - ) { // saturation - bool deleted_bits_zero = !(W&31)&S || !(Base::v[N-1] >> (W&31)); - bool deleted_bits_one = !(W&31)&S || !~(Base::v[N-1] >> (W&31)); - bool neg_src; - if(F2-F+32*N < W2) { - bool all_ones = ac_private::iv_equal_ones_from(op.v); - deleted_bits_zero = deleted_bits_zero && (carry ? all_ones : ac_private::iv_equal_zeros_from(op.v)); - deleted_bits_one = deleted_bits_one && (carry ? ac_private::iv_equal_ones_from<1+F2-F+32*N,N2>(op.v) && !op[F2-F+32*N] : all_ones); - neg_src = S2 && op.v[N2-1] < 0 && !(carry & all_ones); - } - else - neg_src = S2 && op.v[N2-1] < 0 && Base::v[N-1] < 0; - bool neg_trg = S && (bool) this->operator[](W-1); - bool overflow = !neg_src && (neg_trg || !deleted_bits_zero); - overflow |= neg_src && (!neg_trg || !deleted_bits_one); - if(O==AC_SAT_SYM && S && S2) - overflow |= neg_src && (W > 1 ? 
ac_private::iv_equal_zeros_to(Base::v) : true); - overflow_adjust(overflow, neg_src); -#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE - __AC_FIXED_NUMERICAL_ANALYSIS_BASE::update(overflow,neg_src,op); -#endif - } - else - bit_adjust(); - } - - template - inline ac_fixed (const ac_int &op) { - ac_fixed f_op; - f_op.base().operator =(op); - *this = f_op; - } - - template - typename rt_priv::shiftl shiftl() const { - typedef typename rt_priv::shiftl shiftl_t; - shiftl_t r; - Base::template const_shift_l(r); - return r; - } - - inline ac_fixed( bool b ) { *this = (ac_int<1,false>) b; } - inline ac_fixed( char b ) { *this = (ac_int<8,true>) b; } - inline ac_fixed( signed char b ) { *this = (ac_int<8,true>) b; } - inline ac_fixed( unsigned char b ) { *this = (ac_int<8,false>) b; } - inline ac_fixed( signed short b ) { *this = (ac_int<16,true>) b; } - inline ac_fixed( unsigned short b ) { *this = (ac_int<16,false>) b; } - inline ac_fixed( signed int b ) { *this = (ac_int<32,true>) b; } - inline ac_fixed( unsigned int b ) { *this = (ac_int<32,false>) b; } - inline ac_fixed( signed long b ) { *this = (ac_int) b; } - inline ac_fixed( unsigned long b ) { *this = (ac_int) b; } - inline ac_fixed( Slong b ) { *this = (ac_int<64,true>) b; } - inline ac_fixed( Ulong b ) { *this = (ac_int<64,false>) b; } - - inline ac_fixed( double d ) { - double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); - bool o, qb, r; - bool neg_src = d < 0; - Base::conv_from_fraction(di, &qb, &r, &o); - quantization_adjust(qb, r, neg_src); - // a neg number may become non neg (0) after quantization - neg_src &= o || Base::v[N-1] < 0; - - if(O!=AC_WRAP || compute_overflow_for_wrap) { // saturation - bool overflow; - bool neg_trg = S && (bool) this->operator[](W-1); - if(o) { - overflow = true; - } else { - bool deleted_bits_zero = !(W&31)&S || !(Base::v[N-1] >> (W&31)); - bool deleted_bits_one = !(W&31)&S || !~(Base::v[N-1] >> (W&31)); - overflow = !neg_src && (neg_trg || !deleted_bits_zero); - overflow |= neg_src && (!neg_trg || !deleted_bits_one); - } - if(O==AC_SAT_SYM && S) - overflow |= neg_src && (W > 1 ? 
ac_private::iv_equal_zeros_to(Base::v) : true); - overflow_adjust(overflow, neg_src); -#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE - __AC_FIXED_NUMERICAL_ANALYSIS_BASE::update(overflow,neg_src,d); -#endif - } else - bit_adjust(); - } - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( push ) -#pragma warning( disable: 4700 ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#endif -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wuninitialized" -#endif - template - inline ac_fixed &set_val() { - if(V == AC_VAL_DC) { - ac_fixed r; - Base::operator =(r); - bit_adjust(); - } - else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - Base::operator =(0); - if(S && V == AC_VAL_MIN) { - const unsigned rem = (W-1)&31; - Base::v[N-1] = ((unsigned)~0 << rem); - if(O == AC_SAT_SYM) { - if(W == 1) - Base::v[0] = 0; - else - Base::v[0] |= 1; - } - } else if(V == AC_VAL_QUANTUM) - Base::v[0] = 1; - } - else { // AC_VAL_MAX - Base::operator =(-1); - const unsigned int rem = (32-W - (unsigned) !S )&31; - Base::v[N-1] = ((unsigned) (-1) >> 1) >> rem; - } - return *this; - } -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( pop ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - - // Explicit conversion functions to ac_int that captures all integer bits (bits are truncated) - inline ac_int to_ac_int() const { return ((ac_fixed) *this).template slc(0); } - - // Explicit conversion functions to C built-in types ------------- - inline int to_int() const { return ((I-W) >= 32) ? 0 : (signed int) to_ac_int(); } - inline unsigned to_uint() const { return ((I-W) >= 32) ? 0 : (unsigned int) to_ac_int(); } - inline long to_long() const { return ((I-W) >= ac_private::long_w) ? 0 : (signed long) to_ac_int(); } - inline unsigned long to_ulong() const { return ((I-W) >= ac_private::long_w) ? 0 : (unsigned long) to_ac_int(); } - inline Slong to_int64() const { return ((I-W) >= 64) ? 0 : (Slong) to_ac_int(); } - inline Ulong to_uint64() const { return ((I-W) >= 64) ? 0 : (Ulong) to_ac_int(); } - inline double to_double() const { return ac_private::ldexpr(Base::to_double()); } - - inline int length() const { return W; } - - inline std::string to_string(ac_base_mode base_rep, bool sign_mag = false) const { - // base_rep == AC_DEC => sign_mag == don't care (always print decimal in sign magnitude) - char r[(W-AC_MIN(AC_MIN(W-I,I),0)+31)/32*32+5] = {0}; - int i = 0; - if(sign_mag) - r[i++] = is_neg() ? '-' : '+'; - else if (base_rep == AC_DEC && is_neg()) - r[i++] = '-'; - if(base_rep != AC_DEC) { - r[i++] = '0'; - r[i++] = base_rep == AC_BIN ? 'b' : (base_rep == AC_OCT ? 
'o' : 'x'); - } - ac_fixed t; - if( (base_rep == AC_DEC || sign_mag) && is_neg() ) - t = operator -(); - else - t = *this; - ac_fixed i_part = t; - ac_fixed f_part = t; - i += ac_private::to_string(i_part.v, AC_MAX(I+1,1), sign_mag, base_rep, false, r+i); - if(W-I > 0) { - r[i++] = '.'; - if(!ac_private::to_string(f_part.v, W-I, false, base_rep, true, r+i)) - r[--i] = 0; - } - if(!i) { - r[0] = '0'; - r[1] = 0; - } - return std::string(r); - } - inline static std::string type_name() { - const char *tf[] = {"false", "true" }; - const char *q[] = {"AC_TRN", "AC_RND", "AC_TRN_ZERO", "AC_RND_ZERO", "AC_RND_INF", "AC_RND_MIN_INF", "AC_RND_CONV", "AC_RND_CONV_ODD" }; - const char *o[] = {"AC_WRAP", "AC_SAT", "AC_SAT_ZERO", "AC_SAT_SYM" }; - std::string r = "ac_fixed<"; - r += ac_int<32,true>(W).to_string(AC_DEC) + ','; - r += ac_int<32,true>(I).to_string(AC_DEC) + ','; - r += tf[S]; - r += ','; - r += q[Q]; - r += ','; - r += o[O]; - r += '>'; - return r; - } - - // Arithmetic : Binary ---------------------------------------------------- - template - typename rt::mult operator *( const ac_fixed &op2) const { - typename rt::mult r; - Base::mult(op2, r); - return r; - } - template - typename rt::plus operator +( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - typename rt::plus r; - if(F == F2) - Base::add(op2, r); - else if(F > F2) - Base::add(op2.template shiftl(), r); - else - shiftl().add(op2, r); - return r; - } - template - typename rt::minus operator -( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - typename rt::minus r; - if(F == F2) - Base::sub(op2, r); - else if(F > F2) - Base::sub(op2.template shiftl(), r); - else - shiftl().sub(op2, r); - return r; - } -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wenum-compare" -#endif - template - typename rt::div operator /( const ac_fixed &op2) const { - typename rt::div r; - enum { Num_w = W+AC_MAX(W2-I2,0), Num_i = I, Num_w_minus = Num_w+S, Num_i_minus = Num_i+S, - N1 = ac_fixed::N, N1minus = ac_fixed::N, - N2 = ac_fixed::N, N2minus = ac_fixed::N, - num_s = S + (N1minus > N1), den_s = S2 + (N2minus > N2), Nr = rt::div::N }; - ac_fixed t = *this; - t.template div(op2, r); - return r; - } -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif - // Arithmetic assign ------------------------------------------------------ - template - ac_fixed &operator *=( const ac_fixed &op2) { - *this = this->operator *(op2); - return *this; - } - template - ac_fixed &operator +=( const ac_fixed &op2) { - *this = this->operator +(op2); - return *this; - } - template - ac_fixed &operator -=( const ac_fixed &op2) { - *this = this->operator -(op2); - return *this; - } - template - ac_fixed &operator /=( const ac_fixed &op2) { - *this = this->operator /(op2); - return *this; - } - // increment/decrement by quantum (smallest difference that can be represented) - // Arithmetic prefix increment, decrement --------------------------------- - ac_fixed &operator ++() { - ac_fixed<1,I-W+1,false> q; - q.template set_val(); - operator += (q); - return *this; - } - ac_fixed &operator --() { - ac_fixed<1,I-W+1,false> q; - q.template set_val(); - operator -= (q); - return *this; - } - // Arithmetic postfix increment, decrement --------------------------------- - const ac_fixed operator ++(int) { - ac_fixed t = *this; - ac_fixed<1,I-W+1,false> q; - 
q.template set_val(); - operator += (q); - return t; - } - const ac_fixed operator --(int) { - ac_fixed t = *this; - ac_fixed<1,I-W+1,false> q; - q.template set_val(); - operator -= (q); - return t; - } - // Arithmetic Unary -------------------------------------------------------- - ac_fixed operator +() { - return *this; - } - typename rt_unary::neg operator -() const { - typename rt_unary::neg r; - Base::neg(r); - r.bit_adjust(); - return r; - } - // ! ------------------------------------------------------------------------ - bool operator ! () const { - return Base::equal_zero(); - } - - // Bitwise (arithmetic) unary: complement ----------------------------- - ac_fixed operator ~() const { - ac_fixed r; - Base::bitwise_complement(r); - return r; - } - // Bitwise (not arithmetic) bit complement ----------------------------- - ac_fixed bit_complement() const { - ac_fixed r; - Base::bitwise_complement(r); - r.bit_adjust(); - return r; - } - // Bitwise (not arithmetic): and, or, xor ---------------------------------- - template - typename rt::logic operator &( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - typename rt::logic r; - if(F == F2) - Base::bitwise_and(op2, r); - else if(F > F2) - Base::bitwise_and(op2.template shiftl(), r); - else - shiftl().bitwise_and(op2, r); - return r; - } - template - typename rt::logic operator |( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - typename rt::logic r; - if(F == F2) - Base::bitwise_or(op2, r); - else if(F > F2) - Base::bitwise_or(op2.template shiftl(), r); - else - shiftl().bitwise_or(op2, r); - return r; - } - template - typename rt::logic operator ^( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - typename rt::logic r; - if(F == F2) - Base::bitwise_xor(op2, r); - else if(F > F2) - Base::bitwise_xor(op2.template shiftl(), r); - else - shiftl().bitwise_xor(op2, r); - return r; - } - // Bitwise assign (not arithmetic): and, or, xor ---------------------------- - template - ac_fixed &operator &= ( const ac_fixed &op2 ) { - *this = this->operator &(op2); - return *this; - } - template - ac_fixed &operator |= ( const ac_fixed &op2 ) { - *this = this->operator |(op2); - return *this; - } - template - ac_fixed &operator ^= ( const ac_fixed &op2 ) { - *this = this->operator ^(op2); - return *this; - } - // Shift (result constrained by left operand) ------------------------------- - template - ac_fixed operator << ( const ac_int &op2 ) const { - // currently not written to overflow or quantize (neg shift) - ac_fixed r; - Base::shift_l2(op2.to_int(), r); - r.bit_adjust(); - return r; - } - template - ac_fixed operator << ( const ac_int &op2 ) const { - // currently not written to overflow - ac_fixed r; - Base::shift_l(op2.to_uint(), r); - r.bit_adjust(); - return r; - } - template - ac_fixed operator >> ( const ac_int &op2 ) const { - // currently not written to quantize or overflow (neg shift) - ac_fixed r; - Base::shift_r2(op2.to_int(), r); - r.bit_adjust(); - return r; - } - template - ac_fixed operator >> ( const ac_int &op2 ) const { - // currently not written to quantize - ac_fixed r; - Base::shift_r(op2.to_uint(), r); - r.bit_adjust(); - return r; - } - // Shift assign ------------------------------------------------------------ - template - ac_fixed operator <<= ( const ac_int &op2 ) { - // currently not written to overflow or quantize (neg shift) - Base r; - Base::shift_l2(op2.to_int(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_fixed operator <<= ( const ac_int &op2 ) { - // 
currently not written to overflow - Base r; - Base::shift_l(op2.to_uint(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_fixed operator >>= ( const ac_int &op2 ) { - // currently not written to quantize or overflow (neg shift) - Base r; - Base::shift_r2(op2.to_int(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_fixed operator >>= ( const ac_int &op2 ) { - // currently not written to quantize - Base r; - Base::shift_r(op2.to_uint(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - // Relational --------------------------------------------------------------- - template - bool operator == ( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - if(F == F2) - return Base::equal(op2); - else if(F > F2) - return Base::equal(op2.template shiftl()); - else - return shiftl().equal(op2); - } - template - bool operator != ( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - if(F == F2) - return ! Base::equal(op2); - else if(F > F2) - return ! Base::equal(op2.template shiftl()); - else - return ! shiftl().equal(op2); - } - template - bool operator < ( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - if(F == F2) - return Base::less_than(op2); - else if(F > F2) - return Base::less_than(op2.template shiftl()); - else - return shiftl().less_than(op2); - } - template - bool operator >= ( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - if(F == F2) - return ! Base::less_than(op2); - else if(F > F2) - return ! Base::less_than(op2.template shiftl()); - else - return ! shiftl().less_than(op2); - } - template - bool operator > ( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - if(F == F2) - return Base::greater_than(op2); - else if(F > F2) - return Base::greater_than(op2.template shiftl()); - else - return shiftl().greater_than(op2); - } - template - bool operator <= ( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - if(F == F2) - return ! Base::greater_than(op2); - else if(F > F2) - return ! Base::greater_than(op2.template shiftl()); - else - return ! 
shiftl().greater_than(op2); - } - bool operator == ( double d) const { - if(is_neg() != (d < 0.0)) - return false; - double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); - bool overflow, qb, r; - ac_fixed t; - t.conv_from_fraction(di, &qb, &r, &overflow); - if(qb || r || overflow) - return false; - return operator == (t); - } - bool operator != ( double d) const { - return !operator == ( d ); - } - bool operator < ( double d) const { - if(is_neg() != (d < 0.0)) - return is_neg(); - double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); - bool overflow, qb, r; - ac_fixed t; - t.conv_from_fraction(di, &qb, &r, &overflow); - if(is_neg() && overflow) - return false; - return (!is_neg() && overflow) || ((qb || r) && operator <= (t)) || operator < (t); - } - bool operator >= ( double d) const { - return !operator < ( d ); - } - bool operator > ( double d) const { - if(is_neg() != (d < 0.0)) - return !is_neg(); - double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); - bool overflow, qb, r; - ac_fixed t; - t.conv_from_fraction(di, &qb, &r, &overflow); - if(!is_neg() && overflow ) - return false; - return (is_neg() && overflow) || operator > (t); - } - bool operator <= ( double d) const { - return !operator > ( d ); - } - - // Bit and Slice Select ----------------------------------------------------- - template - inline const ac_int slc(const ac_int &index) const { - ac_int r; - AC_ASSERT(index.to_int() >= 0, "Attempting to read slc with negative indeces"); - unsigned uindex = ac_int(index).to_uint(); - Base::shift_r(uindex, r); - r.bit_adjust(); - return r; - } - - template - inline const ac_int slc(signed index) const { - ac_int r; - AC_ASSERT(index >= 0, "Attempting to read slc with negative indeces"); - unsigned uindex = index & ((unsigned)~0 >> 1); - Base::shift_r(uindex, r); - r.bit_adjust(); - return r; - } - template - inline const ac_int slc(unsigned uindex) const { - ac_int r; - Base::shift_r(uindex, r); - r.bit_adjust(); - return r; - } - - template - inline ac_fixed &set_slc(const ac_int lsb, const ac_int &slc) { - AC_ASSERT(lsb.to_int() + W2 <= W && lsb.to_int() >= 0, "Out of bounds set_slc"); - if(W == W2) - Base::operator =(slc); - else { - unsigned ulsb = ac_int(lsb).to_uint(); - Base::set_slc(ulsb, W2, (ac_int) slc); - } - bit_adjust(); // in case sign bit was assigned - return *this; - } - template - inline ac_fixed &set_slc(signed lsb, const ac_int &slc) { - AC_ASSERT(lsb + W2 <= W && lsb >= 0, "Out of bounds set_slc"); - if(W == W2) - Base::operator =(slc); - else { - unsigned ulsb = lsb & ((unsigned)~0 >> 1); - Base::set_slc(ulsb, W2, (ac_int) slc); - } - bit_adjust(); // in case sign bit was assigned - return *this; - } - template - inline ac_fixed &set_slc(unsigned ulsb, const ac_int &slc) { - AC_ASSERT(ulsb + W2 <= W, "Out of bounds set_slc"); - if(W == W2) - Base::operator =(slc); - else - Base::set_slc(ulsb, W2, (ac_int) slc); - bit_adjust(); // in case sign bit was assigned - return *this; - } - - template - inline ac::sliceref range() { - #if __cplusplus > 199711L - static_assert(Msb-Lsb+1 > 0, "Range length not positive: MSB < LSB"); - static_assert(Lsb >= 0, "LSB is negative"); - static_assert(Msb < W, "MSB >= W"); - #endif - return ac::sliceref(Base::v); - } - - class ac_bitref { -# if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) -# pragma builtin -# endif - ac_fixed &d_bv; - unsigned d_index; - public: - ac_bitref( ac_fixed *bv, unsigned index=0 ) : d_bv(*bv), d_index(index) {} - operator bool () const { return (d_index < W) ? 
(d_bv.v[d_index>>5]>>(d_index&31) & 1) : 0; } - - inline ac_bitref operator = ( int val ) { - // lsb of int (val&1) is written to bit - if(d_index < W) { - int *pval = &d_bv.v[d_index>>5]; - *pval ^= (*pval ^ ((unsigned) val << (d_index&31) )) & 1 << (d_index&31); - d_bv.bit_adjust(); // in case sign bit was assigned - } - return *this; - } - template - inline ac_bitref operator = ( const ac_int &val ) { - return operator =(val.to_int()); - } - inline ac_bitref operator = ( const ac_bitref &val ) { - return operator =((int) (bool) val); - } - }; - - ac_bitref operator [] ( unsigned int uindex) { - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - ac_bitref bvh( this, uindex ); - return bvh; - } - ac_bitref operator [] ( int index) { - AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); - unsigned uindex = index & ((unsigned)~0 >> 1); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - ac_bitref bvh( this, uindex ); - return bvh; - } - template - ac_bitref operator [] ( const ac_int &index) { - AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); - unsigned uindex = ac_int(index).to_uint(); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - ac_bitref bvh( this, uindex ); - return bvh; - } - - bool operator [] ( unsigned int uindex) const { - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; - } - bool operator [] ( int index) const { - AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); - unsigned uindex = index & ((unsigned)~0 >> 1); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; - } - template - bool operator [] ( const ac_int &index) const { - AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); - unsigned uindex = ac_int(index).to_uint(); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - return (uindex < W) ? 
(Base::v[uindex>>5]>>(uindex&31) & 1) : 0; - } - typename rt_unary::leading_sign leading_sign() const { - unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; - return ls; - } - typename rt_unary::leading_sign leading_sign(bool &all_sign) const { - unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; - all_sign = (ls == W-S); - return ls; - } - // returns false if number is denormal - template - bool normalize(ac_int &exp) { - ac_int m = this->template slc(0); - bool r = m.normalize(exp); - this->set_slc(0,m); - return r; - } - // returns false if number is denormal, minimum exponent is reserved (usually for encoding special values/errors) - template - bool normalize_RME(ac_int &exp) { - ac_int m = this->template slc(0); - bool r = m.normalize_RME(exp); - this->set_slc(0,m); - return r; - } - inline void bit_fill_hex(const char *str) { - // Zero Pads if str is too short, throws ms bits away if str is too long - // Asserts if anything other than 0-9a-fA-F is encountered - ac_int x; - x.bit_fill_hex(str); - set_slc(0, x); - } - template - inline void bit_fill(const int (&ivec)[N], bool bigendian=true) { - // bit_fill from integer vector - // if W > N*32, missing most significant bits are zeroed - // if W < N*32, additional bits in ivec are ignored (no overflow checking) - // - // Example: - // ac_fixed<80,40,false> x; int vec[] = { 0xffffa987, 0x6543210f, 0xedcba987 }; - // x.bit_fill(vec); // vec[0] fill bits 79-64 - ac_int x; - x.bit_fill(ivec, bigendian); - set_slc(0, x); - } -}; - -namespace ac { - template - struct ac_fixed_represent { - enum { t_w = ac_private::c_type_params::W, t_i = t_w, t_s = ac_private::c_type_params::S }; - typedef ac_fixed type; - }; - template<> struct ac_fixed_represent {}; - template<> struct ac_fixed_represent {}; - template - struct ac_fixed_represent< ac_int > { - typedef ac_fixed type; - }; - template - struct ac_fixed_represent< ac_fixed > { - typedef ac_fixed type; - }; -} - -namespace ac_private { - // with T == ac_fixed - template - struct rt_ac_fixed_T< ac_fixed > { - typedef ac_fixed fx2_t; - template - struct op1 { - typedef ac_fixed fx_t; - typedef typename fx_t::template rt::mult mult; - typedef typename fx_t::template rt::plus plus; - typedef typename fx_t::template rt::minus minus; - typedef typename fx2_t::template rt::minus minus2; - typedef typename fx_t::template rt::logic logic; - typedef typename fx_t::template rt::div div; - typedef typename fx2_t::template rt::div div2; - }; - }; - // with T == ac_int - template - struct rt_ac_fixed_T< ac_int > { - typedef ac_fixed fx2_t; - template - struct op1 { - typedef ac_fixed fx_t; - typedef typename fx_t::template rt::mult mult; - typedef typename fx_t::template rt::plus plus; - typedef typename fx_t::template rt::minus minus; - typedef typename fx2_t::template rt::minus minus2; - typedef typename fx_t::template rt::logic logic; - typedef typename fx_t::template rt::div div; - typedef typename fx2_t::template rt::div div2; - }; - }; - - template - struct rt_ac_fixed_T< c_type > { - typedef typename ac::ac_fixed_represent::type fx2_t; - enum { W2 = fx2_t::width, I2 = W2, S2 = fx2_t::sign }; - template - struct op1 { - typedef ac_fixed fx_t; - typedef typename fx_t::template rt::mult mult; - typedef typename fx_t::template rt::plus plus; - typedef typename fx_t::template rt::minus minus; - typedef typename fx2_t::template rt::minus minus2; - typedef typename fx_t::template rt::logic logic; - typedef typename fx_t::template rt::div div; - typedef typename 
fx2_t::template rt::div div2; - }; - }; -} - - -// Specializations for constructors on integers that bypass bit adjusting -// and are therefore more efficient -template<> inline ac_fixed<1,1,true,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b ? -1 : 0; } - -template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; } -template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = b&1; } -template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b&1; } -template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed short b ) { v[0] = b&1; } -template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned short b ) { v[0] = b&1; } -template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed int b ) { v[0] = b&1; } -template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned int b ) { v[0] = b&1; } -template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed long b ) { v[0] = b&1; } -template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned long b ) { v[0] = b&1; } -template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b&1; } -template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b&1; } - -template<> inline ac_fixed<8,8,true,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; } -template<> inline ac_fixed<8,8,false,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; } -template<> inline ac_fixed<8,8,true,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = b; } -template<> inline ac_fixed<8,8,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b; } -template<> inline ac_fixed<8,8,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = (signed char) b; } -template<> inline ac_fixed<8,8,false,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = (unsigned char) b; } - -template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; } -template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; } -template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = b; } -template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b; } -template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b; } -template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = (unsigned short) b; } -template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( signed short b ) { v[0] = b; } -template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned short b ) { v[0] = b; } -template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned short b ) { v[0] = (signed short) b; } -template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( signed short b ) { v[0] = (unsigned short) b; } - -template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( signed int b ) { v[0] = b; } -template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned int b ) { v[0] = b; } -template<> inline ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( signed int b ) { v[0] = b; v[1] = 0;} -template<> inline ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned int b ) { v[0] = b; v[1] = 0;} - -template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; } -template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; } -template<> inline 
ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; v[1] = 0;} -template<> inline ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; v[1] = 0;} - -template<> inline ac_fixed<64,64,true,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); } -template<> inline ac_fixed<64,64,true,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32);} -template<> inline ac_fixed<64,64,false,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; v[1] = (int) ((Ulong) b >> 32); v[2] = 0; } -template<> inline ac_fixed<64,64,false,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); v[2] = 0; } - - -// Stream -------------------------------------------------------------------- - -template -inline std::ostream& operator << (std::ostream &os, const ac_fixed &x) { -#ifndef __SYNTHESIS__ - if ((os.flags() & std::ios::hex) != 0) { - os << x.to_string(AC_HEX); - } else if ((os.flags() & std::ios::oct) != 0) { - os << x.to_string(AC_OCT); - } else { - os << x.to_string(AC_DEC); - } -#endif - return os; -} - - -// Macros for Binary Operators with C Integers -------------------------------------------- - -#define FX_BIN_OP_WITH_INT_2I(BIN_OP, C_TYPE, WI, SI) \ - template \ - inline ac_fixed operator BIN_OP ( const ac_fixed &op, C_TYPE i_op) { \ - return op.operator BIN_OP (ac_int(i_op)); \ - } - -#define FX_BIN_OP_WITH_INT(BIN_OP, C_TYPE, WI, SI, RTYPE) \ - template \ - inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( C_TYPE i_op, const ac_fixed &op) { \ - return ac_fixed(i_op).operator BIN_OP (op); \ - } \ - template \ - inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( const ac_fixed &op, C_TYPE i_op) { \ - return op.operator BIN_OP (ac_fixed(i_op)); \ - } - -#define FX_REL_OP_WITH_INT(REL_OP, C_TYPE, W2, S2) \ - template \ - inline bool operator REL_OP ( const ac_fixed &op, C_TYPE op2) { \ - return op.operator REL_OP (ac_fixed(op2)); \ - } \ - template \ - inline bool operator REL_OP ( C_TYPE op2, const ac_fixed &op) { \ - return ac_fixed(op2).operator REL_OP (op); \ - } - -#define FX_ASSIGN_OP_WITH_INT_2(ASSIGN_OP, C_TYPE, W2, S2) \ - template \ - inline ac_fixed &operator ASSIGN_OP ( ac_fixed &op, C_TYPE op2) { \ - return op.operator ASSIGN_OP (ac_fixed(op2)); \ - } - -#define FX_ASSIGN_OP_WITH_INT_2I(ASSIGN_OP, C_TYPE, W2, S2) \ - template \ - inline ac_fixed operator ASSIGN_OP ( ac_fixed &op, C_TYPE op2) { \ - return op.operator ASSIGN_OP (ac_int(op2)); \ - } - -#define FX_OPS_WITH_INT(C_TYPE, WI, SI) \ - FX_BIN_OP_WITH_INT(*, C_TYPE, WI, SI, mult) \ - FX_BIN_OP_WITH_INT(+, C_TYPE, WI, SI, plus) \ - FX_BIN_OP_WITH_INT(-, C_TYPE, WI, SI, minus) \ - FX_BIN_OP_WITH_INT(/, C_TYPE, WI, SI, div) \ - FX_BIN_OP_WITH_INT_2I(>>, C_TYPE, WI, SI) \ - FX_BIN_OP_WITH_INT_2I(<<, C_TYPE, WI, SI) \ - FX_BIN_OP_WITH_INT(&, C_TYPE, WI, SI, logic) \ - FX_BIN_OP_WITH_INT(|, C_TYPE, WI, SI, logic) \ - FX_BIN_OP_WITH_INT(^, C_TYPE, WI, SI, logic) \ - \ - FX_REL_OP_WITH_INT(==, C_TYPE, WI, SI) \ - FX_REL_OP_WITH_INT(!=, C_TYPE, WI, SI) \ - FX_REL_OP_WITH_INT(>, C_TYPE, WI, SI) \ - FX_REL_OP_WITH_INT(>=, C_TYPE, WI, SI) \ - FX_REL_OP_WITH_INT(<, C_TYPE, WI, SI) \ - FX_REL_OP_WITH_INT(<=, C_TYPE, WI, SI) \ - \ - FX_ASSIGN_OP_WITH_INT_2(+=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2(-=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2(*=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2(/=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2I(>>=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2I(<<=, C_TYPE, WI, 
SI) \ - FX_ASSIGN_OP_WITH_INT_2(&=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2(|=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2(^=, C_TYPE, WI, SI) - -// --------------------------------------- End of Macros for Binary Operators with C Integers - -#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS -namespace ac { - namespace ops_with_other_types { -#endif - // Binary Operators with C Integers -------------------------------------------- - FX_OPS_WITH_INT(bool, 1, false) - FX_OPS_WITH_INT(char, 8, true) - FX_OPS_WITH_INT(signed char, 8, true) - FX_OPS_WITH_INT(unsigned char, 8, false) - FX_OPS_WITH_INT(short, 16, true) - FX_OPS_WITH_INT(unsigned short, 16, false) - FX_OPS_WITH_INT(int, 32, true) - FX_OPS_WITH_INT(unsigned int, 32, false) - FX_OPS_WITH_INT(long, ac_private::long_w, true) - FX_OPS_WITH_INT(unsigned long, ac_private::long_w, false) - FX_OPS_WITH_INT(Slong, 64, true) - FX_OPS_WITH_INT(Ulong, 64, false) - // -------------------------------------- End of Binary Operators with Integers -#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS - } // ops_with_other_types namespace -} // ac namespace -#endif - - -// Macros for Binary Operators with ac_int -------------------------------------------- - -#define FX_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ - template \ - inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( const ac_int &i_op, const ac_fixed &op) { \ - return ac_fixed(i_op).operator BIN_OP (op); \ - } - -#define FX_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) \ - template \ - inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( const ac_fixed &op, const ac_int &i_op) { \ - return op.operator BIN_OP (ac_fixed(i_op)); \ - } - -#define FX_BIN_OP_WITH_AC_INT(BIN_OP, RTYPE) \ - FX_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ - FX_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) - -#define FX_REL_OP_WITH_AC_INT(REL_OP) \ - template \ - inline bool operator REL_OP ( const ac_fixed &op, const ac_int &op2) { \ - return op.operator REL_OP (ac_fixed(op2)); \ - } \ - template \ - inline bool operator REL_OP ( ac_int &op2, const ac_fixed &op) { \ - return ac_fixed(op2).operator REL_OP (op); \ - } - -#define FX_ASSIGN_OP_WITH_AC_INT(ASSIGN_OP) \ - template \ - inline ac_fixed &operator ASSIGN_OP ( ac_fixed &op, const ac_int &op2) { \ - return op.operator ASSIGN_OP (ac_fixed(op2)); \ - } \ - template \ - inline ac_int &operator ASSIGN_OP ( ac_int &op, const ac_fixed &op2) { \ - return op.operator ASSIGN_OP (op2.to_ac_int()); \ - } - -// -------------------------------------------- End of Macros for Binary Operators with ac_int - -#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS -namespace ac { - namespace ops_with_other_types { -#endif - // Binary Operators with ac_int -------------------------------------------- - FX_BIN_OP_WITH_AC_INT(*, mult) - FX_BIN_OP_WITH_AC_INT(+, plus) - FX_BIN_OP_WITH_AC_INT(-, minus) - FX_BIN_OP_WITH_AC_INT(/, div) - FX_BIN_OP_WITH_AC_INT(&, logic) - FX_BIN_OP_WITH_AC_INT(|, logic) - FX_BIN_OP_WITH_AC_INT(^, logic) - - FX_REL_OP_WITH_AC_INT(==) - FX_REL_OP_WITH_AC_INT(!=) - FX_REL_OP_WITH_AC_INT(>) - FX_REL_OP_WITH_AC_INT(>=) - FX_REL_OP_WITH_AC_INT(<) - FX_REL_OP_WITH_AC_INT(<=) - - FX_ASSIGN_OP_WITH_AC_INT(+=) - FX_ASSIGN_OP_WITH_AC_INT(-=) - FX_ASSIGN_OP_WITH_AC_INT(*=) - FX_ASSIGN_OP_WITH_AC_INT(/=) - FX_ASSIGN_OP_WITH_AC_INT(&=) - FX_ASSIGN_OP_WITH_AC_INT(|=) - FX_ASSIGN_OP_WITH_AC_INT(^=) - // -------------------------------------- End of Binary Operators with ac_int - - // Relational Operators with double -------------------------------------- - template - inline bool operator == ( double op, const 
ac_fixed &op2) { - return op2.operator == (op); - } - template - inline bool operator != ( double op, const ac_fixed &op2) { - return op2.operator != (op); - } - template - inline bool operator > ( double op, const ac_fixed &op2) { - return op2.operator < (op); - } - template - inline bool operator < ( double op, const ac_fixed &op2) { - return op2.operator > (op); - } - template - inline bool operator <= ( double op, const ac_fixed &op2) { - return op2.operator >= (op); - } - template - inline bool operator >= ( double op, const ac_fixed &op2) { - return op2.operator <= (op); - } - // -------------------------------------- End of Relational Operators with double -#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS - } // ops_with_other_types namespace -} // ac namespace -using namespace ac::ops_with_other_types; -#endif - - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( disable: 4700 ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#endif -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wuninitialized" -#endif - -// Global templatized functions for easy initialization to special values -template -inline ac_fixed value(ac_fixed) { - ac_fixed r; - return r.template set_val(); -} - -namespace ac { -// PUBLIC FUNCTIONS -// function to initialize (or uninitialize) arrays - template - inline bool init_array(ac_fixed *a, int n) { - ac_fixed t; - t.template set_val(); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } - - inline ac_fixed<54,2,true> frexp_d(double d, ac_int<11,true> &exp) { - enum {Min_Exp = -1022, Max_Exp = 1023, Mant_W = 52, Denorm_Min_Exp = Min_Exp - Mant_W}; - if(!d) { - exp = 0; - return 0; - } - int exp_i; - double f0 = frexp(d, &exp_i); - AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard double-precision float exponent max (+1024). It is probably an extended double"); - AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard double-precision float exponent min (-1021). It is probably an extended double"); - exp_i--; - int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : (exp_i > Min_Exp && f0 < 0 && f0 >= -0.5) ? -1 : 0; - exp = exp_i + rshift; - ac_int f_i = f0 * ((Ulong) 1 << (Mant_W + 1 -rshift)); - ac_fixed r; - r.set_slc(0, f_i); - return r; - } - inline ac_fixed<25,2,true> frexp_f(float f, ac_int<8,true> &exp) { - enum {Min_Exp = -126, Max_Exp = 127, Mant_W = 23, Denorm_Min_Exp = Min_Exp - Mant_W}; - if(!f) { - exp = 0; - return 0; - } - int exp_i; - float f0 = frexpf(f, &exp_i); - AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard single-precision float exponent max (+128). It is probably an extended float"); - AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard single-precision float exponent min (-125). It is probably an extended float"); - exp_i--; - int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : (exp_i >= Min_Exp && f0 < 0 && f0 >= -0.5) ? -1 : 0; - exp = exp_i + rshift; - ac_int f_i = f0 * (1 << (Mant_W + 1 - rshift)); - ac_fixed r; - r.set_slc(0, f_i); - return r; - } - - inline ac_fixed<53,1,false> frexp_sm_d(double d, ac_int<11,true> &exp, bool &sign) { - enum {Min_Exp = -1022, Max_Exp = 1023, Mant_W = 52, Denorm_Min_Exp = Min_Exp - Mant_W}; - if(!d) { - exp = 0; - sign = false; - return 0; - } - int exp_i; - bool s = d < 0; - double f0 = frexp(s ? 
-d : d, &exp_i); - AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard double-precision float exponent max (+1024). It is probably an extended double"); - AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard double-precision float exponent min (-1021). It is probably an extended double"); - exp_i--; - int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : 0; - exp = exp_i + rshift; - ac_int f_i = f0 * ((Ulong) 1 << (Mant_W + 1 -rshift)); - ac_fixed r; - r.set_slc(0, f_i); - sign = s; - return r; - } - inline ac_fixed<24,1,false> frexp_sm_f(float f, ac_int<8,true> &exp, bool &sign) { - enum {Min_Exp = -126, Max_Exp = 127, Mant_W = 23, Denorm_Min_Exp = Min_Exp - Mant_W}; - if(!f) { - exp = 0; - sign = false; - return 0; - } - int exp_i; - bool s = f < 0; - float f0 = frexp(s ? -f : f, &exp_i); - AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard single-precision float exponent max (+128). It is probably an extended float"); - AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard single-precision float exponent min (-125). It is probably an extended float"); - exp_i--; - int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : 0; - exp = exp_i + rshift; - ac_int<24,false> f_i = f0 * (1 << (Mant_W + 1 - rshift)); - ac_fixed<24,1,false> r; - r.set_slc(0, f_i); - sign = s; - return r; - } - - template - const ac_fixed &basic_num_ovf_base::value() const { - return (const ac_fixed &) *this; - } - - template std::string basic_num_ovf_base::type_name() { - return ac_fixed::type_name(); - } -} - - -/////////////////////////////////////////////////////////////////////////////// - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( pop ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - -#ifdef __AC_NAMESPACE -} -#endif - -#endif // __AC_FIXED_H +/************************************************************************** + * * + * Algorithmic C (tm) Datatypes * + * * + * Software Version: 4.0 * + * * + * Release Date : Sat Jun 13 12:35:18 PDT 2020 * + * Release Type : Production Release * + * Release Build : 4.0.0 * + * * + * Copyright 2005-2020, Mentor Graphics Corporation, * + * * + * All Rights Reserved. * + * * + ************************************************************************** + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * + * implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + ************************************************************************** + * * + * The most recent version of this package is available at github. * + * * + *************************************************************************/ + +/* +// Source: ac_fixed.h +// Description: class for fixed point operation handling in C++ +// Author: Andres Takach, Ph.D. 
+*/ + +#ifndef __AC_FIXED_H +#define __AC_FIXED_H + +#include "ac_int.h" + +#if (defined(__GNUC__) && __GNUC__ < 3 && !defined(__EDG__)) +#error GCC version 3 or greater is required to include this header file +#endif + +#if (defined(_MSC_VER) && _MSC_VER < 1400 && !defined(__EDG__)) +#error Microsoft Visual Studio 8 or newer is required to include this header file +#endif + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( push ) +#pragma warning( disable: 4127 4308 4365 4514 4800 ) +#endif + +#ifndef __SYNTHESIS__ +#ifndef __AC_FIXED_UTILITY_BASE +#define __AC_FIXED_UTILITY_BASE +#endif + +#endif + +#ifdef __SYNTHESIS__ +#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE +#undef __AC_FIXED_NUMERICAL_ANALYSIS_BASE +#endif +#endif + +#ifdef __AC_NAMESPACE +namespace __AC_NAMESPACE { +#endif + +namespace ac_private { + template + struct rt_ac_fixed_T { + template + struct op1 { + typedef typename T::template rt_T< ac_fixed >::mult mult; + typedef typename T::template rt_T< ac_fixed >::plus plus; + typedef typename T::template rt_T< ac_fixed >::minus2 minus; + typedef typename T::template rt_T< ac_fixed >::minus minus2; + typedef typename T::template rt_T< ac_fixed >::logic logic; + typedef typename T::template rt_T< ac_fixed >::div2 div; + typedef typename T::template rt_T< ac_fixed >::div div2; + }; + }; + // specializations after definition of ac_fixed +} + +namespace ac { + template + class basic_num_ovf_base { + bool d_enable; + public: + basic_num_ovf_base() : d_enable(true) {} + void enable_ovf(bool a) { d_enable = a; } + bool is_enabled() const { return d_enable; } + template + void update(bool overflow, bool neg, const basic_num_ovf_base &op2) { +#ifndef __AC_OVERRIDE_OVF_UPDATE_BODY + if(d_enable) { + if(overflow) { + std::cerr << (neg ? "-" : "+") << "OVF: "; + std::cerr << type_name() << " ( " << basic_num_ovf_base::type_name(); + std::cerr << " ( " << op2.value().to_double() << " ) )" << std::endl; + } + } +#else + __AC_OVERRIDE_OVF_UPDATE_BODY +#endif + } + void update(bool overflow, bool neg, double op2) { +#ifndef __AC_OVERRIDE_OVF_UPDATE2_BODY + if(d_enable) { + if(overflow) { + std::cerr << (neg ? "-" : "+") << "OVF: "; + std::cerr << type_name() << " ( " << "double"; + std::cerr << " ( " << op2 << " ) )" << std::endl; + } + } +#else + __AC_OVERRIDE_OVF_UPDATE2_BODY +#endif + } + const ac_fixed &value() const; + static std::string type_name(); + }; +} + +////////////////////////////////////////////////////////////////////////////// +// ac_fixed +////////////////////////////////////////////////////////////////////////////// + +//enum ac_q_mode { AC_TRN, AC_RND, AC_TRN_ZERO, AC_RND_ZERO, AC_RND_INF, AC_RND_MIN_INF, AC_RND_CONV, AC_RND_CONV_ODD }; +//enum ac_o_mode { AC_WRAP, AC_SAT, AC_SAT_ZERO, AC_SAT_SYM }; + +template +class ac_fixed : private ac_private::iv<(W+31+!S)/32> +#ifndef __SYNTHESIS__ +__AC_FIXED_UTILITY_BASE +#endif +#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE +, public __AC_FIXED_NUMERICAL_ANALYSIS_BASE +#endif +{ +#if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) +#pragma builtin +#endif + + enum {N=(W+31+!S)/32}; + + template + struct rt_priv { + enum {w_shiftl = AC_MAX(W+W2,1) }; + typedef ac_fixed shiftl; + }; + + typedef ac_private::iv Base; + + inline void bit_adjust() { + const unsigned rem = (32-W)&31; + Base::v[N-1] = S ? ((signed) ((unsigned)Base::v[N-1] << rem) >> rem) : (rem ? 
+ ((unsigned) Base::v[N-1] << rem) >> rem : 0); + } + inline Base &base() { return *this; } + inline const Base &base() const { return *this; } + + inline void overflow_adjust(bool overflow, bool neg) { + if(O==AC_WRAP) { + bit_adjust(); + return; + } + else if(O==AC_SAT_ZERO) { + if(overflow) + ac_private::iv_extend(Base::v, 0); + else + bit_adjust(); + } + else if(S) { + if(overflow) { + if(!neg) { + ac_private::iv_extend(Base::v, ~0); + Base::v[N-1] = ~((unsigned)~0 << ((W-1)&31)); + } else { + ac_private::iv_extend(Base::v, 0); + Base::v[N-1] = ((unsigned)~0 << ((W-1)&31)); + if(O==AC_SAT_SYM) + Base::v[0] |= 1; + } + } else + bit_adjust(); + } + else { + if(overflow) { + if(!neg) { + ac_private::iv_extend(Base::v, ~0); + Base::v[N-1] = ~((unsigned)~0 << (W&31)); + } else + ac_private::iv_extend(Base::v, 0); + } else + bit_adjust(); + } + } + + inline bool quantization_adjust(bool qb, bool r, bool s) { + if(Q==AC_TRN) + return false; + if(Q==AC_RND_ZERO) + qb &= s || r; + else if(Q==AC_RND_MIN_INF) + qb &= r; + else if(Q==AC_RND_INF) + qb &= !s || r; + else if(Q==AC_RND_CONV) + qb &= (Base::v[0] & 1) || r; + else if(Q==AC_RND_CONV_ODD) + qb &= (!(Base::v[0] & 1)) || r; + else if(Q==AC_TRN_ZERO) + qb = s && ( qb || r ); + return ac_private::iv_uadd_carry(Base::v, qb, Base::v); + } + + inline bool is_neg() const { return S && Base::v[N-1] < 0; } + +public: + static const int width = W; + static const int i_width = I; + static const bool sign = S; + static const ac_o_mode o_mode = O; + static const ac_q_mode q_mode = Q; + static const int e_width = 0; +#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE + static const bool compute_overflow_for_wrap = true; +#else + static const bool compute_overflow_for_wrap = false; +#endif + + template + struct rt { + enum { + F=W-I, + F2=W2-I2, + mult_w = W+W2, + mult_i = I+I2, + mult_s = S||S2, + plus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), + plus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, + plus_s = S||S2, + minus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), + minus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, + minus_s = true, + div_w = W+AC_MAX(W2-I2,0)+S2, + div_i = I+(W2-I2)+S2, + div_s = S||S2, + logic_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+AC_MAX(F,F2), + logic_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2)), + logic_s = S||S2 + }; + typedef ac_fixed mult; + typedef ac_fixed plus; + typedef ac_fixed minus; + typedef ac_fixed logic; + typedef ac_fixed div; + typedef ac_fixed arg1; + }; + + template + struct rt_T { + typedef typename ac_private::map::t map_T; + typedef typename ac_private::rt_ac_fixed_T::template op1::mult mult; + typedef typename ac_private::rt_ac_fixed_T::template op1::plus plus; + typedef typename ac_private::rt_ac_fixed_T::template op1::minus minus; + typedef typename ac_private::rt_ac_fixed_T::template op1::minus2 minus2; + typedef typename ac_private::rt_ac_fixed_T::template op1::logic logic; + typedef typename ac_private::rt_ac_fixed_T::template op1::div div; + typedef typename ac_private::rt_ac_fixed_T::template op1::div2 div2; + typedef ac_fixed arg1; + }; + + struct rt_unary { + enum { + neg_w = W+1, + neg_i = I+1, + neg_s = true, + mag_sqr_w = 2*W-S, + mag_sqr_i = 2*I-S, + mag_sqr_s = false, + mag_w = W+S, + mag_i = I+S, + mag_s = false, + leading_sign_w = ac::log2_ceil::val, + leading_sign_s = false + }; + typedef ac_int leading_sign; + typedef ac_fixed neg; + typedef ac_fixed mag_sqr; + typedef ac_fixed mag; + template + struct set { + enum { sum_w = W + ac::log2_ceil::val, sum_i = (sum_w-W) + I, sum_s = S}; + typedef ac_fixed sum; + }; + 
}; + + ac_fixed(const ac_fixed &op): Base(op) { } + + template friend class ac_fixed; + ac_fixed() { +#if !defined(__SYNTHESIS__) && defined(AC_DEFAULT_IN_RANGE) + bit_adjust(); + if( O==AC_SAT_SYM && S && Base::v[N-1] < 0 && (W > 1 ? ac_private::iv_equal_zeros_to(Base::v) : true) ) + Base::v[0] |= 1; +#endif + } + template + inline ac_fixed (const ac_fixed &op) { + enum {N2=(W2+31+!S2)/32, F=W-I, F2=W2-I2, QUAN_INC = F2>F && !(Q==AC_TRN || (Q==AC_TRN_ZERO && !S2)) }; + bool carry = false; + // handle quantization + if(F2 == F) + Base::operator =(op); + else if(F2 > F) { + op.template const_shift_r(*this); +// ac_private::iv_const_shift_r(op.v, Base::v); + if(Q!=AC_TRN && !(Q==AC_TRN_ZERO && !S2)) { + bool qb = (F2-F > W2) ? (op.v[N2-1] < 0) : (bool) op[F2-F-1]; + bool r = (F2 > F+1) ? !ac_private::iv_equal_zeros_to(op.v) : false; + carry = quantization_adjust(qb, r, S2 && op.v[N2-1] < 0); + } + } + else // no quantization + op.template const_shift_l(*this); +// ac_private::iv_const_shift_l(op.v, Base::v); + // handle overflow + if((O!=AC_WRAP || compute_overflow_for_wrap) + && ((!S && S2) || I-S < I2-S2+(QUAN_INC || (S2 && O==AC_SAT_SYM && (O2 != AC_SAT_SYM || F2 > F) ))) + ) { // saturation + bool deleted_bits_zero = !(W&31)&S || !(Base::v[N-1] >> (W&31)); + bool deleted_bits_one = !(W&31)&S || !~(Base::v[N-1] >> (W&31)); + bool neg_src; + if(F2-F+32*N < W2) { + bool all_ones = ac_private::iv_equal_ones_from(op.v); + deleted_bits_zero = deleted_bits_zero && (carry ? all_ones : ac_private::iv_equal_zeros_from(op.v)); + deleted_bits_one = deleted_bits_one && (carry ? ac_private::iv_equal_ones_from<1+F2-F+32*N,N2>(op.v) && !op[F2-F+32*N] : all_ones); + neg_src = S2 && op.v[N2-1] < 0 && !(carry & all_ones); + } + else + neg_src = S2 && op.v[N2-1] < 0 && Base::v[N-1] < 0; + bool neg_trg = S && (bool) this->operator[](W-1); + bool overflow = !neg_src && (neg_trg || !deleted_bits_zero); + overflow |= neg_src && (!neg_trg || !deleted_bits_one); + if(O==AC_SAT_SYM && S && S2) + overflow |= neg_src && (W > 1 ? 
ac_private::iv_equal_zeros_to(Base::v) : true); + overflow_adjust(overflow, neg_src); +#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE + __AC_FIXED_NUMERICAL_ANALYSIS_BASE::update(overflow,neg_src,op); +#endif + } + else + bit_adjust(); + } + + template + inline ac_fixed (const ac_int &op) { + ac_fixed f_op; + f_op.base().operator =(op); + *this = f_op; + } + + template + typename rt_priv::shiftl shiftl() const { + typedef typename rt_priv::shiftl shiftl_t; + shiftl_t r; + Base::template const_shift_l(r); + return r; + } + + inline ac_fixed( bool b ) { *this = (ac_int<1,false>) b; } + inline ac_fixed( char b ) { *this = (ac_int<8,true>) b; } + inline ac_fixed( signed char b ) { *this = (ac_int<8,true>) b; } + inline ac_fixed( unsigned char b ) { *this = (ac_int<8,false>) b; } + inline ac_fixed( signed short b ) { *this = (ac_int<16,true>) b; } + inline ac_fixed( unsigned short b ) { *this = (ac_int<16,false>) b; } + inline ac_fixed( signed int b ) { *this = (ac_int<32,true>) b; } + inline ac_fixed( unsigned int b ) { *this = (ac_int<32,false>) b; } + inline ac_fixed( signed long b ) { *this = (ac_int) b; } + inline ac_fixed( unsigned long b ) { *this = (ac_int) b; } + inline ac_fixed( Slong b ) { *this = (ac_int<64,true>) b; } + inline ac_fixed( Ulong b ) { *this = (ac_int<64,false>) b; } + + inline ac_fixed( double d ) { + double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); + bool o, qb, r; + bool neg_src = d < 0; + Base::conv_from_fraction(di, &qb, &r, &o); + quantization_adjust(qb, r, neg_src); + // a neg number may become non neg (0) after quantization + neg_src &= o || Base::v[N-1] < 0; + + if(O!=AC_WRAP || compute_overflow_for_wrap) { // saturation + bool overflow; + bool neg_trg = S && (bool) this->operator[](W-1); + if(o) { + overflow = true; + } else { + bool deleted_bits_zero = !(W&31)&S || !(Base::v[N-1] >> (W&31)); + bool deleted_bits_one = !(W&31)&S || !~(Base::v[N-1] >> (W&31)); + overflow = !neg_src && (neg_trg || !deleted_bits_zero); + overflow |= neg_src && (!neg_trg || !deleted_bits_one); + } + if(O==AC_SAT_SYM && S) + overflow |= neg_src && (W > 1 ? 
ac_private::iv_equal_zeros_to(Base::v) : true); + overflow_adjust(overflow, neg_src); +#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE + __AC_FIXED_NUMERICAL_ANALYSIS_BASE::update(overflow,neg_src,d); +#endif + } else + bit_adjust(); + } + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( push ) +#pragma warning( disable: 4700 ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wuninitialized" +#endif + template + inline ac_fixed &set_val() { + if(V == AC_VAL_DC) { + ac_fixed r; + Base::operator =(r); + bit_adjust(); + } + else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + Base::operator =(0); + if(S && V == AC_VAL_MIN) { + const unsigned rem = (W-1)&31; + Base::v[N-1] = ((unsigned)~0 << rem); + if(O == AC_SAT_SYM) { + if(W == 1) + Base::v[0] = 0; + else + Base::v[0] |= 1; + } + } else if(V == AC_VAL_QUANTUM) + Base::v[0] = 1; + } + else { // AC_VAL_MAX + Base::operator =(-1); + const unsigned int rem = (32-W - (unsigned) !S )&31; + Base::v[N-1] = ((unsigned) (-1) >> 1) >> rem; + } + return *this; + } +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( pop ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + + // Explicit conversion functions to ac_int that captures all integer bits (bits are truncated) + inline ac_int to_ac_int() const { return ((ac_fixed) *this).template slc(0); } + + // Explicit conversion functions to C built-in types ------------- + inline int to_int() const { return ((I-W) >= 32) ? 0 : (signed int) to_ac_int(); } + inline unsigned to_uint() const { return ((I-W) >= 32) ? 0 : (unsigned int) to_ac_int(); } + inline long to_long() const { return ((I-W) >= ac_private::long_w) ? 0 : (signed long) to_ac_int(); } + inline unsigned long to_ulong() const { return ((I-W) >= ac_private::long_w) ? 0 : (unsigned long) to_ac_int(); } + inline Slong to_int64() const { return ((I-W) >= 64) ? 0 : (Slong) to_ac_int(); } + inline Ulong to_uint64() const { return ((I-W) >= 64) ? 0 : (Ulong) to_ac_int(); } + inline double to_double() const { return ac_private::ldexpr(Base::to_double()); } + + inline int length() const { return W; } + + inline std::string to_string(ac_base_mode base_rep, bool sign_mag = false) const { + // base_rep == AC_DEC => sign_mag == don't care (always print decimal in sign magnitude) + char r[(W-AC_MIN(AC_MIN(W-I,I),0)+31)/32*32+5] = {0}; + int i = 0; + if(sign_mag) + r[i++] = is_neg() ? '-' : '+'; + else if (base_rep == AC_DEC && is_neg()) + r[i++] = '-'; + if(base_rep != AC_DEC) { + r[i++] = '0'; + r[i++] = base_rep == AC_BIN ? 'b' : (base_rep == AC_OCT ? 
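+      // radix prefix for non-decimal output: "0b", "0o" or "0x"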
'o' : 'x'); + } + ac_fixed t; + if( (base_rep == AC_DEC || sign_mag) && is_neg() ) + t = operator -(); + else + t = *this; + ac_fixed i_part = t; + ac_fixed f_part = t; + i += ac_private::to_string(i_part.v, AC_MAX(I+1,1), sign_mag, base_rep, false, r+i); + if(W-I > 0) { + r[i++] = '.'; + if(!ac_private::to_string(f_part.v, W-I, false, base_rep, true, r+i)) + r[--i] = 0; + } + if(!i) { + r[0] = '0'; + r[1] = 0; + } + return std::string(r); + } + inline static std::string type_name() { + const char *tf[] = {"false", "true" }; + const char *q[] = {"AC_TRN", "AC_RND", "AC_TRN_ZERO", "AC_RND_ZERO", "AC_RND_INF", "AC_RND_MIN_INF", "AC_RND_CONV", "AC_RND_CONV_ODD" }; + const char *o[] = {"AC_WRAP", "AC_SAT", "AC_SAT_ZERO", "AC_SAT_SYM" }; + std::string r = "ac_fixed<"; + r += ac_int<32,true>(W).to_string(AC_DEC) + ','; + r += ac_int<32,true>(I).to_string(AC_DEC) + ','; + r += tf[S]; + r += ','; + r += q[Q]; + r += ','; + r += o[O]; + r += '>'; + return r; + } + + // Arithmetic : Binary ---------------------------------------------------- + template + typename rt::mult operator *( const ac_fixed &op2) const { + typename rt::mult r; + Base::mult(op2, r); + return r; + } + template + typename rt::plus operator +( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + typename rt::plus r; + if(F == F2) + Base::add(op2, r); + else if(F > F2) + Base::add(op2.template shiftl(), r); + else + shiftl().add(op2, r); + return r; + } + template + typename rt::minus operator -( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + typename rt::minus r; + if(F == F2) + Base::sub(op2, r); + else if(F > F2) + Base::sub(op2.template shiftl(), r); + else + shiftl().sub(op2, r); + return r; + } +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wenum-compare" +#endif + template + typename rt::div operator /( const ac_fixed &op2) const { + typename rt::div r; + enum { Num_w = W+AC_MAX(W2-I2,0), Num_i = I, Num_w_minus = Num_w+S, Num_i_minus = Num_i+S, + N1 = ac_fixed::N, N1minus = ac_fixed::N, + N2 = ac_fixed::N, N2minus = ac_fixed::N, + num_s = S + (N1minus > N1), den_s = S2 + (N2minus > N2), Nr = rt::div::N }; + ac_fixed t = *this; + t.template div(op2, r); + return r; + } +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif + // Arithmetic assign ------------------------------------------------------ + template + ac_fixed &operator *=( const ac_fixed &op2) { + *this = this->operator *(op2); + return *this; + } + template + ac_fixed &operator +=( const ac_fixed &op2) { + *this = this->operator +(op2); + return *this; + } + template + ac_fixed &operator -=( const ac_fixed &op2) { + *this = this->operator -(op2); + return *this; + } + template + ac_fixed &operator /=( const ac_fixed &op2) { + *this = this->operator /(op2); + return *this; + } + // increment/decrement by quantum (smallest difference that can be represented) + // Arithmetic prefix increment, decrement --------------------------------- + ac_fixed &operator ++() { + ac_fixed<1,I-W+1,false> q; + q.template set_val(); + operator += (q); + return *this; + } + ac_fixed &operator --() { + ac_fixed<1,I-W+1,false> q; + q.template set_val(); + operator -= (q); + return *this; + } + // Arithmetic postfix increment, decrement --------------------------------- + const ac_fixed operator ++(int) { + ac_fixed t = *this; + ac_fixed<1,I-W+1,false> q; + 
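+    // q is one quantum of this type: a 1-bit ac_fixed whose LSB weighs 2^(I-W).
+    // Illustrative sketch (not part of the header): for ac_fixed<8,4,true> x = 1.5,
+    // ++x adds 2^-4 = 0.0625, giving x == 1.5625.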
q.template set_val(); + operator += (q); + return t; + } + const ac_fixed operator --(int) { + ac_fixed t = *this; + ac_fixed<1,I-W+1,false> q; + q.template set_val(); + operator -= (q); + return t; + } + // Arithmetic Unary -------------------------------------------------------- + ac_fixed operator +() { + return *this; + } + typename rt_unary::neg operator -() const { + typename rt_unary::neg r; + Base::neg(r); + r.bit_adjust(); + return r; + } + // ! ------------------------------------------------------------------------ + bool operator ! () const { + return Base::equal_zero(); + } + + // Bitwise (arithmetic) unary: complement ----------------------------- + ac_fixed operator ~() const { + ac_fixed r; + Base::bitwise_complement(r); + return r; + } + // Bitwise (not arithmetic) bit complement ----------------------------- + ac_fixed bit_complement() const { + ac_fixed r; + Base::bitwise_complement(r); + r.bit_adjust(); + return r; + } + // Bitwise (not arithmetic): and, or, xor ---------------------------------- + template + typename rt::logic operator &( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + typename rt::logic r; + if(F == F2) + Base::bitwise_and(op2, r); + else if(F > F2) + Base::bitwise_and(op2.template shiftl(), r); + else + shiftl().bitwise_and(op2, r); + return r; + } + template + typename rt::logic operator |( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + typename rt::logic r; + if(F == F2) + Base::bitwise_or(op2, r); + else if(F > F2) + Base::bitwise_or(op2.template shiftl(), r); + else + shiftl().bitwise_or(op2, r); + return r; + } + template + typename rt::logic operator ^( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + typename rt::logic r; + if(F == F2) + Base::bitwise_xor(op2, r); + else if(F > F2) + Base::bitwise_xor(op2.template shiftl(), r); + else + shiftl().bitwise_xor(op2, r); + return r; + } + // Bitwise assign (not arithmetic): and, or, xor ---------------------------- + template + ac_fixed &operator &= ( const ac_fixed &op2 ) { + *this = this->operator &(op2); + return *this; + } + template + ac_fixed &operator |= ( const ac_fixed &op2 ) { + *this = this->operator |(op2); + return *this; + } + template + ac_fixed &operator ^= ( const ac_fixed &op2 ) { + *this = this->operator ^(op2); + return *this; + } + // Shift (result constrained by left operand) ------------------------------- + template + ac_fixed operator << ( const ac_int &op2 ) const { + // currently not written to overflow or quantize (neg shift) + ac_fixed r; + Base::shift_l2(op2.to_int(), r); + r.bit_adjust(); + return r; + } + template + ac_fixed operator << ( const ac_int &op2 ) const { + // currently not written to overflow + ac_fixed r; + Base::shift_l(op2.to_uint(), r); + r.bit_adjust(); + return r; + } + template + ac_fixed operator >> ( const ac_int &op2 ) const { + // currently not written to quantize or overflow (neg shift) + ac_fixed r; + Base::shift_r2(op2.to_int(), r); + r.bit_adjust(); + return r; + } + template + ac_fixed operator >> ( const ac_int &op2 ) const { + // currently not written to quantize + ac_fixed r; + Base::shift_r(op2.to_uint(), r); + r.bit_adjust(); + return r; + } + // Shift assign ------------------------------------------------------------ + template + ac_fixed operator <<= ( const ac_int &op2 ) { + // currently not written to overflow or quantize (neg shift) + Base r; + Base::shift_l2(op2.to_int(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_fixed operator <<= ( const ac_int &op2 ) { + // 
currently not written to overflow + Base r; + Base::shift_l(op2.to_uint(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_fixed operator >>= ( const ac_int &op2 ) { + // currently not written to quantize or overflow (neg shift) + Base r; + Base::shift_r2(op2.to_int(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_fixed operator >>= ( const ac_int &op2 ) { + // currently not written to quantize + Base r; + Base::shift_r(op2.to_uint(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + // Relational --------------------------------------------------------------- + template + bool operator == ( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + if(F == F2) + return Base::equal(op2); + else if(F > F2) + return Base::equal(op2.template shiftl()); + else + return shiftl().equal(op2); + } + template + bool operator != ( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + if(F == F2) + return ! Base::equal(op2); + else if(F > F2) + return ! Base::equal(op2.template shiftl()); + else + return ! shiftl().equal(op2); + } + template + bool operator < ( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + if(F == F2) + return Base::less_than(op2); + else if(F > F2) + return Base::less_than(op2.template shiftl()); + else + return shiftl().less_than(op2); + } + template + bool operator >= ( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + if(F == F2) + return ! Base::less_than(op2); + else if(F > F2) + return ! Base::less_than(op2.template shiftl()); + else + return ! shiftl().less_than(op2); + } + template + bool operator > ( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + if(F == F2) + return Base::greater_than(op2); + else if(F > F2) + return Base::greater_than(op2.template shiftl()); + else + return shiftl().greater_than(op2); + } + template + bool operator <= ( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + if(F == F2) + return ! Base::greater_than(op2); + else if(F > F2) + return ! Base::greater_than(op2.template shiftl()); + else + return ! 
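+      // the relational operators align both operands to the wider fractional
+      // width (via shiftl) before comparing, so mixed-format comparisons are exact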
shiftl().greater_than(op2); + } + bool operator == ( double d) const { + if(is_neg() != (d < 0.0)) + return false; + double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); + bool overflow, qb, r; + ac_fixed t; + t.conv_from_fraction(di, &qb, &r, &overflow); + if(qb || r || overflow) + return false; + return operator == (t); + } + bool operator != ( double d) const { + return !operator == ( d ); + } + bool operator < ( double d) const { + if(is_neg() != (d < 0.0)) + return is_neg(); + double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); + bool overflow, qb, r; + ac_fixed t; + t.conv_from_fraction(di, &qb, &r, &overflow); + if(is_neg() && overflow) + return false; + return (!is_neg() && overflow) || ((qb || r) && operator <= (t)) || operator < (t); + } + bool operator >= ( double d) const { + return !operator < ( d ); + } + bool operator > ( double d) const { + if(is_neg() != (d < 0.0)) + return !is_neg(); + double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); + bool overflow, qb, r; + ac_fixed t; + t.conv_from_fraction(di, &qb, &r, &overflow); + if(!is_neg() && overflow ) + return false; + return (is_neg() && overflow) || operator > (t); + } + bool operator <= ( double d) const { + return !operator > ( d ); + } + + // Bit and Slice Select ----------------------------------------------------- + template + inline const ac_int slc(const ac_int &index) const { + ac_int r; + AC_ASSERT(index.to_int() >= 0, "Attempting to read slc with negative indeces"); + unsigned uindex = ac_int(index).to_uint(); + Base::shift_r(uindex, r); + r.bit_adjust(); + return r; + } + + template + inline const ac_int slc(signed index) const { + ac_int r; + AC_ASSERT(index >= 0, "Attempting to read slc with negative indeces"); + unsigned uindex = index & ((unsigned)~0 >> 1); + Base::shift_r(uindex, r); + r.bit_adjust(); + return r; + } + template + inline const ac_int slc(unsigned uindex) const { + ac_int r; + Base::shift_r(uindex, r); + r.bit_adjust(); + return r; + } + + template + inline ac_fixed &set_slc(const ac_int lsb, const ac_int &slc) { + AC_ASSERT(lsb.to_int() + W2 <= W && lsb.to_int() >= 0, "Out of bounds set_slc"); + if(W == W2) + Base::operator =(slc); + else { + unsigned ulsb = ac_int(lsb).to_uint(); + Base::set_slc(ulsb, W2, (ac_int) slc); + } + bit_adjust(); // in case sign bit was assigned + return *this; + } + template + inline ac_fixed &set_slc(signed lsb, const ac_int &slc) { + AC_ASSERT(lsb + W2 <= W && lsb >= 0, "Out of bounds set_slc"); + if(W == W2) + Base::operator =(slc); + else { + unsigned ulsb = lsb & ((unsigned)~0 >> 1); + Base::set_slc(ulsb, W2, (ac_int) slc); + } + bit_adjust(); // in case sign bit was assigned + return *this; + } + template + inline ac_fixed &set_slc(unsigned ulsb, const ac_int &slc) { + AC_ASSERT(ulsb + W2 <= W, "Out of bounds set_slc"); + if(W == W2) + Base::operator =(slc); + else + Base::set_slc(ulsb, W2, (ac_int) slc); + bit_adjust(); // in case sign bit was assigned + return *this; + } + + template + inline ac::sliceref range() { + #if __cplusplus > 199711L + static_assert(Msb-Lsb+1 > 0, "Range length not positive: MSB < LSB"); + static_assert(Lsb >= 0, "LSB is negative"); + static_assert(Msb < W, "MSB >= W"); + #endif + return ac::sliceref(Base::v); + } + + class ac_bitref { +# if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) +# pragma builtin +# endif + ac_fixed &d_bv; + unsigned d_index; + public: + ac_bitref( ac_fixed *bv, unsigned index=0 ) : d_bv(*bv), d_index(index) {} + operator bool () const { return (d_index < W) ? 
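+      // bit d_index is stored in 32-bit word v[d_index>>5], at offset (d_index&31)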
(d_bv.v[d_index>>5]>>(d_index&31) & 1) : 0; } + + inline ac_bitref operator = ( int val ) { + // lsb of int (val&1) is written to bit + if(d_index < W) { + int *pval = &d_bv.v[d_index>>5]; + *pval ^= (*pval ^ ((unsigned) val << (d_index&31) )) & 1 << (d_index&31); + d_bv.bit_adjust(); // in case sign bit was assigned + } + return *this; + } + template + inline ac_bitref operator = ( const ac_int &val ) { + return operator =(val.to_int()); + } + inline ac_bitref operator = ( const ac_bitref &val ) { + return operator =((int) (bool) val); + } + }; + + ac_bitref operator [] ( unsigned int uindex) { + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + ac_bitref bvh( this, uindex ); + return bvh; + } + ac_bitref operator [] ( int index) { + AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); + unsigned uindex = index & ((unsigned)~0 >> 1); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + ac_bitref bvh( this, uindex ); + return bvh; + } + template + ac_bitref operator [] ( const ac_int &index) { + AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); + unsigned uindex = ac_int(index).to_uint(); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + ac_bitref bvh( this, uindex ); + return bvh; + } + + bool operator [] ( unsigned int uindex) const { + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; + } + bool operator [] ( int index) const { + AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); + unsigned uindex = index & ((unsigned)~0 >> 1); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; + } + template + bool operator [] ( const ac_int &index) const { + AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); + unsigned uindex = ac_int(index).to_uint(); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + return (uindex < W) ? 
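+    // same word/offset addressing as the mutable ac_bitref path above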
(Base::v[uindex>>5]>>(uindex&31) & 1) : 0; + } + typename rt_unary::leading_sign leading_sign() const { + unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; + return ls; + } + typename rt_unary::leading_sign leading_sign(bool &all_sign) const { + unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; + all_sign = (ls == W-S); + return ls; + } + // returns false if number is denormal + template + bool normalize(ac_int &exp) { + ac_int m = this->template slc(0); + bool r = m.normalize(exp); + this->set_slc(0,m); + return r; + } + // returns false if number is denormal, minimum exponent is reserved (usually for encoding special values/errors) + template + bool normalize_RME(ac_int &exp) { + ac_int m = this->template slc(0); + bool r = m.normalize_RME(exp); + this->set_slc(0,m); + return r; + } + inline void bit_fill_hex(const char *str) { + // Zero Pads if str is too short, throws ms bits away if str is too long + // Asserts if anything other than 0-9a-fA-F is encountered + ac_int x; + x.bit_fill_hex(str); + set_slc(0, x); + } + template + inline void bit_fill(const int (&ivec)[N], bool bigendian=true) { + // bit_fill from integer vector + // if W > N*32, missing most significant bits are zeroed + // if W < N*32, additional bits in ivec are ignored (no overflow checking) + // + // Example: + // ac_fixed<80,40,false> x; int vec[] = { 0xffffa987, 0x6543210f, 0xedcba987 }; + // x.bit_fill(vec); // vec[0] fill bits 79-64 + ac_int x; + x.bit_fill(ivec, bigendian); + set_slc(0, x); + } +}; + +namespace ac { + template + struct ac_fixed_represent { + enum { t_w = ac_private::c_type_params::W, t_i = t_w, t_s = ac_private::c_type_params::S }; + typedef ac_fixed type; + }; + template<> struct ac_fixed_represent {}; + template<> struct ac_fixed_represent {}; + template + struct ac_fixed_represent< ac_int > { + typedef ac_fixed type; + }; + template + struct ac_fixed_represent< ac_fixed > { + typedef ac_fixed type; + }; +} + +namespace ac_private { + // with T == ac_fixed + template + struct rt_ac_fixed_T< ac_fixed > { + typedef ac_fixed fx2_t; + template + struct op1 { + typedef ac_fixed fx_t; + typedef typename fx_t::template rt::mult mult; + typedef typename fx_t::template rt::plus plus; + typedef typename fx_t::template rt::minus minus; + typedef typename fx2_t::template rt::minus minus2; + typedef typename fx_t::template rt::logic logic; + typedef typename fx_t::template rt::div div; + typedef typename fx2_t::template rt::div div2; + }; + }; + // with T == ac_int + template + struct rt_ac_fixed_T< ac_int > { + typedef ac_fixed fx2_t; + template + struct op1 { + typedef ac_fixed fx_t; + typedef typename fx_t::template rt::mult mult; + typedef typename fx_t::template rt::plus plus; + typedef typename fx_t::template rt::minus minus; + typedef typename fx2_t::template rt::minus minus2; + typedef typename fx_t::template rt::logic logic; + typedef typename fx_t::template rt::div div; + typedef typename fx2_t::template rt::div div2; + }; + }; + + template + struct rt_ac_fixed_T< c_type > { + typedef typename ac::ac_fixed_represent::type fx2_t; + enum { W2 = fx2_t::width, I2 = W2, S2 = fx2_t::sign }; + template + struct op1 { + typedef ac_fixed fx_t; + typedef typename fx_t::template rt::mult mult; + typedef typename fx_t::template rt::plus plus; + typedef typename fx_t::template rt::minus minus; + typedef typename fx2_t::template rt::minus minus2; + typedef typename fx_t::template rt::logic logic; + typedef typename fx_t::template rt::div div; + typedef typename 
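+      // div2 is the result type with the operand order reversed (c_type / ac_fixed)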
fx2_t::template rt<W,I,S>::div div2;
+    };
+  };
+}
+
+
+// Specializations for constructors on integers that bypass bit adjusting
+// and are therefore more efficient
+template<> inline ac_fixed<1,1,true,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b ? -1 : 0; }
+
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed short b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned short b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed int b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned int b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed long b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned long b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b&1; }
+
+template<> inline ac_fixed<8,8,true,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
+template<> inline ac_fixed<8,8,false,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
+template<> inline ac_fixed<8,8,true,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = b; }
+template<> inline ac_fixed<8,8,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b; }
+template<> inline ac_fixed<8,8,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = (signed char) b; }
+template<> inline ac_fixed<8,8,false,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = (unsigned char) b; }
+
+template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = (unsigned short) b; }
+template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( signed short b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned short b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned short b ) { v[0] = (signed short) b; }
+template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( signed short b ) { v[0] = (unsigned short) b; }
+
+template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( signed int b ) { v[0] = b; }
+template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned int b ) { v[0] = b; }
+template<> inline ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( signed int b ) { v[0] = b; v[1] = 0;}
+template<> inline ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned int b ) { v[0] = b; v[1] = 0;}
+
+template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; }
+template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; }
+template<> inline
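+// unsigned 32-bit values need a second storage word for zero extension, hence v[1] = 0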
ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; v[1] = 0;} +template<> inline ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; v[1] = 0;} + +template<> inline ac_fixed<64,64,true,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); } +template<> inline ac_fixed<64,64,true,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32);} +template<> inline ac_fixed<64,64,false,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; v[1] = (int) ((Ulong) b >> 32); v[2] = 0; } +template<> inline ac_fixed<64,64,false,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); v[2] = 0; } + + +// Stream -------------------------------------------------------------------- + +template +inline std::ostream& operator << (std::ostream &os, const ac_fixed &x) { +#ifndef __SYNTHESIS__ + if ((os.flags() & std::ios::hex) != 0) { + os << x.to_string(AC_HEX); + } else if ((os.flags() & std::ios::oct) != 0) { + os << x.to_string(AC_OCT); + } else { + os << x.to_string(AC_DEC); + } +#endif + return os; +} + + +// Macros for Binary Operators with C Integers -------------------------------------------- + +#define FX_BIN_OP_WITH_INT_2I(BIN_OP, C_TYPE, WI, SI) \ + template \ + inline ac_fixed operator BIN_OP ( const ac_fixed &op, C_TYPE i_op) { \ + return op.operator BIN_OP (ac_int(i_op)); \ + } + +#define FX_BIN_OP_WITH_INT(BIN_OP, C_TYPE, WI, SI, RTYPE) \ + template \ + inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( C_TYPE i_op, const ac_fixed &op) { \ + return ac_fixed(i_op).operator BIN_OP (op); \ + } \ + template \ + inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( const ac_fixed &op, C_TYPE i_op) { \ + return op.operator BIN_OP (ac_fixed(i_op)); \ + } + +#define FX_REL_OP_WITH_INT(REL_OP, C_TYPE, W2, S2) \ + template \ + inline bool operator REL_OP ( const ac_fixed &op, C_TYPE op2) { \ + return op.operator REL_OP (ac_fixed(op2)); \ + } \ + template \ + inline bool operator REL_OP ( C_TYPE op2, const ac_fixed &op) { \ + return ac_fixed(op2).operator REL_OP (op); \ + } + +#define FX_ASSIGN_OP_WITH_INT_2(ASSIGN_OP, C_TYPE, W2, S2) \ + template \ + inline ac_fixed &operator ASSIGN_OP ( ac_fixed &op, C_TYPE op2) { \ + return op.operator ASSIGN_OP (ac_fixed(op2)); \ + } + +#define FX_ASSIGN_OP_WITH_INT_2I(ASSIGN_OP, C_TYPE, W2, S2) \ + template \ + inline ac_fixed operator ASSIGN_OP ( ac_fixed &op, C_TYPE op2) { \ + return op.operator ASSIGN_OP (ac_int(op2)); \ + } + +#define FX_OPS_WITH_INT(C_TYPE, WI, SI) \ + FX_BIN_OP_WITH_INT(*, C_TYPE, WI, SI, mult) \ + FX_BIN_OP_WITH_INT(+, C_TYPE, WI, SI, plus) \ + FX_BIN_OP_WITH_INT(-, C_TYPE, WI, SI, minus) \ + FX_BIN_OP_WITH_INT(/, C_TYPE, WI, SI, div) \ + FX_BIN_OP_WITH_INT_2I(>>, C_TYPE, WI, SI) \ + FX_BIN_OP_WITH_INT_2I(<<, C_TYPE, WI, SI) \ + FX_BIN_OP_WITH_INT(&, C_TYPE, WI, SI, logic) \ + FX_BIN_OP_WITH_INT(|, C_TYPE, WI, SI, logic) \ + FX_BIN_OP_WITH_INT(^, C_TYPE, WI, SI, logic) \ + \ + FX_REL_OP_WITH_INT(==, C_TYPE, WI, SI) \ + FX_REL_OP_WITH_INT(!=, C_TYPE, WI, SI) \ + FX_REL_OP_WITH_INT(>, C_TYPE, WI, SI) \ + FX_REL_OP_WITH_INT(>=, C_TYPE, WI, SI) \ + FX_REL_OP_WITH_INT(<, C_TYPE, WI, SI) \ + FX_REL_OP_WITH_INT(<=, C_TYPE, WI, SI) \ + \ + FX_ASSIGN_OP_WITH_INT_2(+=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2(-=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2(*=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2(/=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2I(>>=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2I(<<=, C_TYPE, WI, 
SI) \ + FX_ASSIGN_OP_WITH_INT_2(&=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2(|=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2(^=, C_TYPE, WI, SI) + +// --------------------------------------- End of Macros for Binary Operators with C Integers + +#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS +namespace ac { + namespace ops_with_other_types { +#endif + // Binary Operators with C Integers -------------------------------------------- + FX_OPS_WITH_INT(bool, 1, false) + FX_OPS_WITH_INT(char, 8, true) + FX_OPS_WITH_INT(signed char, 8, true) + FX_OPS_WITH_INT(unsigned char, 8, false) + FX_OPS_WITH_INT(short, 16, true) + FX_OPS_WITH_INT(unsigned short, 16, false) + FX_OPS_WITH_INT(int, 32, true) + FX_OPS_WITH_INT(unsigned int, 32, false) + FX_OPS_WITH_INT(long, ac_private::long_w, true) + FX_OPS_WITH_INT(unsigned long, ac_private::long_w, false) + FX_OPS_WITH_INT(Slong, 64, true) + FX_OPS_WITH_INT(Ulong, 64, false) + // -------------------------------------- End of Binary Operators with Integers +#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS + } // ops_with_other_types namespace +} // ac namespace +#endif + + +// Macros for Binary Operators with ac_int -------------------------------------------- + +#define FX_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ + template \ + inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( const ac_int &i_op, const ac_fixed &op) { \ + return ac_fixed(i_op).operator BIN_OP (op); \ + } + +#define FX_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) \ + template \ + inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( const ac_fixed &op, const ac_int &i_op) { \ + return op.operator BIN_OP (ac_fixed(i_op)); \ + } + +#define FX_BIN_OP_WITH_AC_INT(BIN_OP, RTYPE) \ + FX_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ + FX_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) + +#define FX_REL_OP_WITH_AC_INT(REL_OP) \ + template \ + inline bool operator REL_OP ( const ac_fixed &op, const ac_int &op2) { \ + return op.operator REL_OP (ac_fixed(op2)); \ + } \ + template \ + inline bool operator REL_OP ( ac_int &op2, const ac_fixed &op) { \ + return ac_fixed(op2).operator REL_OP (op); \ + } + +#define FX_ASSIGN_OP_WITH_AC_INT(ASSIGN_OP) \ + template \ + inline ac_fixed &operator ASSIGN_OP ( ac_fixed &op, const ac_int &op2) { \ + return op.operator ASSIGN_OP (ac_fixed(op2)); \ + } \ + template \ + inline ac_int &operator ASSIGN_OP ( ac_int &op, const ac_fixed &op2) { \ + return op.operator ASSIGN_OP (op2.to_ac_int()); \ + } + +// -------------------------------------------- End of Macros for Binary Operators with ac_int + +#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS +namespace ac { + namespace ops_with_other_types { +#endif + // Binary Operators with ac_int -------------------------------------------- + FX_BIN_OP_WITH_AC_INT(*, mult) + FX_BIN_OP_WITH_AC_INT(+, plus) + FX_BIN_OP_WITH_AC_INT(-, minus) + FX_BIN_OP_WITH_AC_INT(/, div) + FX_BIN_OP_WITH_AC_INT(&, logic) + FX_BIN_OP_WITH_AC_INT(|, logic) + FX_BIN_OP_WITH_AC_INT(^, logic) + + FX_REL_OP_WITH_AC_INT(==) + FX_REL_OP_WITH_AC_INT(!=) + FX_REL_OP_WITH_AC_INT(>) + FX_REL_OP_WITH_AC_INT(>=) + FX_REL_OP_WITH_AC_INT(<) + FX_REL_OP_WITH_AC_INT(<=) + + FX_ASSIGN_OP_WITH_AC_INT(+=) + FX_ASSIGN_OP_WITH_AC_INT(-=) + FX_ASSIGN_OP_WITH_AC_INT(*=) + FX_ASSIGN_OP_WITH_AC_INT(/=) + FX_ASSIGN_OP_WITH_AC_INT(&=) + FX_ASSIGN_OP_WITH_AC_INT(|=) + FX_ASSIGN_OP_WITH_AC_INT(^=) + // -------------------------------------- End of Binary Operators with ac_int + + // Relational Operators with double -------------------------------------- + template + inline bool operator == ( double op, const 
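+// double on the left-hand side: forward to the member operator with the operands
+// swapped, flipping the direction of the ordering comparisons (< becomes >, etc.)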
ac_fixed &op2) { + return op2.operator == (op); + } + template + inline bool operator != ( double op, const ac_fixed &op2) { + return op2.operator != (op); + } + template + inline bool operator > ( double op, const ac_fixed &op2) { + return op2.operator < (op); + } + template + inline bool operator < ( double op, const ac_fixed &op2) { + return op2.operator > (op); + } + template + inline bool operator <= ( double op, const ac_fixed &op2) { + return op2.operator >= (op); + } + template + inline bool operator >= ( double op, const ac_fixed &op2) { + return op2.operator <= (op); + } + // -------------------------------------- End of Relational Operators with double +#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS + } // ops_with_other_types namespace +} // ac namespace +using namespace ac::ops_with_other_types; +#endif + + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( disable: 4700 ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wuninitialized" +#endif + +// Global templatized functions for easy initialization to special values +template +inline ac_fixed value(ac_fixed) { + ac_fixed r; + return r.template set_val(); +} + +namespace ac { +// PUBLIC FUNCTIONS +// function to initialize (or uninitialize) arrays + template + inline bool init_array(ac_fixed *a, int n) { + ac_fixed t; + t.template set_val(); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } + + inline ac_fixed<54,2,true> frexp_d(double d, ac_int<11,true> &exp) { + enum {Min_Exp = -1022, Max_Exp = 1023, Mant_W = 52, Denorm_Min_Exp = Min_Exp - Mant_W}; + if(!d) { + exp = 0; + return 0; + } + int exp_i; + double f0 = frexp(d, &exp_i); + AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard double-precision float exponent max (+1024). It is probably an extended double"); + AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard double-precision float exponent min (-1021). It is probably an extended double"); + exp_i--; + int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : (exp_i > Min_Exp && f0 < 0 && f0 >= -0.5) ? -1 : 0; + exp = exp_i + rshift; + ac_int f_i = f0 * ((Ulong) 1 << (Mant_W + 1 -rshift)); + ac_fixed r; + r.set_slc(0, f_i); + return r; + } + inline ac_fixed<25,2,true> frexp_f(float f, ac_int<8,true> &exp) { + enum {Min_Exp = -126, Max_Exp = 127, Mant_W = 23, Denorm_Min_Exp = Min_Exp - Mant_W}; + if(!f) { + exp = 0; + return 0; + } + int exp_i; + float f0 = frexpf(f, &exp_i); + AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard single-precision float exponent max (+128). It is probably an extended float"); + AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard single-precision float exponent min (-125). It is probably an extended float"); + exp_i--; + int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : (exp_i >= Min_Exp && f0 < 0 && f0 >= -0.5) ? -1 : 0; + exp = exp_i + rshift; + ac_int f_i = f0 * (1 << (Mant_W + 1 - rshift)); + ac_fixed r; + r.set_slc(0, f_i); + return r; + } + + inline ac_fixed<53,1,false> frexp_sm_d(double d, ac_int<11,true> &exp, bool &sign) { + enum {Min_Exp = -1022, Max_Exp = 1023, Mant_W = 52, Denorm_Min_Exp = Min_Exp - Mant_W}; + if(!d) { + exp = 0; + sign = false; + return 0; + } + int exp_i; + bool s = d < 0; + double f0 = frexp(s ? 
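+    // sign-magnitude variant: factor out the sign first, then decompose |d|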
-d : d, &exp_i); + AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard double-precision float exponent max (+1024). It is probably an extended double"); + AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard double-precision float exponent min (-1021). It is probably an extended double"); + exp_i--; + int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : 0; + exp = exp_i + rshift; + ac_int f_i = f0 * ((Ulong) 1 << (Mant_W + 1 -rshift)); + ac_fixed r; + r.set_slc(0, f_i); + sign = s; + return r; + } + inline ac_fixed<24,1,false> frexp_sm_f(float f, ac_int<8,true> &exp, bool &sign) { + enum {Min_Exp = -126, Max_Exp = 127, Mant_W = 23, Denorm_Min_Exp = Min_Exp - Mant_W}; + if(!f) { + exp = 0; + sign = false; + return 0; + } + int exp_i; + bool s = f < 0; + float f0 = frexp(s ? -f : f, &exp_i); + AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard single-precision float exponent max (+128). It is probably an extended float"); + AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard single-precision float exponent min (-125). It is probably an extended float"); + exp_i--; + int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : 0; + exp = exp_i + rshift; + ac_int<24,false> f_i = f0 * (1 << (Mant_W + 1 - rshift)); + ac_fixed<24,1,false> r; + r.set_slc(0, f_i); + sign = s; + return r; + } + + template + const ac_fixed &basic_num_ovf_base::value() const { + return (const ac_fixed &) *this; + } + + template std::string basic_num_ovf_base::type_name() { + return ac_fixed::type_name(); + } +} + + +/////////////////////////////////////////////////////////////////////////////// + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( pop ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +#ifdef __AC_NAMESPACE +} +#endif + +#endif // __AC_FIXED_H diff --git a/hls4ml/templates/quartus/ac_types/ac_float.h b/hls4ml/templates/quartus/ac_types/ac_float.h index 6174528d73..9229b54702 100644 --- a/hls4ml/templates/quartus/ac_types/ac_float.h +++ b/hls4ml/templates/quartus/ac_types/ac_float.h @@ -1,1196 +1,1196 @@ -/************************************************************************** - * * - * Algorithmic C (tm) Datatypes * - * * - * Software Version: 4.0 * - * * - * Release Date : Sat Jun 13 12:35:18 PDT 2020 * - * Release Type : Production Release * - * Release Build : 4.0.0 * - * * - * Copyright 2013-2019, Mentor Graphics Corporation, * - * * - * All Rights Reserved. * - * * - ************************************************************************** - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * - * implied. * - * See the License for the specific language governing permissions and * - * limitations under the License. * - ************************************************************************** - * * - * The most recent version of this package is available at github. 
* - * * - *************************************************************************/ - -// Source: ac_float.h -// Description: class for floating point operation handling in C++ -// Author: Andres Takach, Ph.D. - -#ifndef __AC_FLOAT_H -#define __AC_FLOAT_H - -#include - -#ifndef __SYNTHESIS__ -#include -#endif - -#if (defined(__GNUC__) && __GNUC__ < 3 && !defined(__EDG__)) -#error GCC version 3 or greater is required to include this header file -#endif - -#if (defined(_MSC_VER) && _MSC_VER < 1400 && !defined(__EDG__)) -#error Microsoft Visual Studio 8 or newer is required to include this header file -#endif - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( push ) -#pragma warning( disable: 4003 4127 4308 4365 4514 4800 ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wparentheses" -#endif -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wparentheses" -#pragma clang diagnostic ignored "-Wlogical-op-parentheses" -#pragma clang diagnostic ignored "-Wbitwise-op-parentheses" -#endif - -// for safety -#if (defined(E) || defined(WF) || defined(IF) || defined(SF)) -#error One or more of the following is defined: E, WF, IF, SF. Definition conflicts with their usage as template parameters. -#error DO NOT use defines before including third party header files. -#endif - -#define AC_FL(v) ac_float -#define AC_FL0(v) ac_float -#define AC_FL_T(v) int W##v, int I##v, int E##v, ac_q_mode Q##v -#define AC_FL_TV(v) W##v, I##v, E##v, Q##v -#define AC_FL_T0(v) int W##v, int I##v, int E##v -#define AC_FL_TV0(v) W##v, I##v, E##v - -#ifdef __AC_NAMESPACE -namespace __AC_NAMESPACE { -#endif - -template class ac_float; - -namespace ac_private { - - typedef ac_float<54,2,11> ac_float_cdouble_t; - typedef ac_float<25,2,8> ac_float_cfloat_t; - - template - struct rt_ac_float_T { - template< AC_FL_T0() > - struct op1 { - typedef AC_FL0() fl_t; - typedef typename T::template rt_T::mult mult; - typedef typename T::template rt_T::plus plus; - typedef typename T::template rt_T::minus2 minus; - typedef typename T::template rt_T::minus minus2; - typedef typename T::template rt_T::logic logic; - typedef typename T::template rt_T::div2 div; - typedef typename T::template rt_T::div div2; - }; - }; - // specializations after definition of ac_float - - inline ac_float_cdouble_t double_to_ac_float(double d); - inline ac_float_cfloat_t float_to_ac_float(float f); -} - -////////////////////////////////////////////////////////////////////////////// -// ac_float -////////////////////////////////////////////////////////////////////////////// - -template< AC_FL_T() > -class ac_float { - enum { NO_UN = true, S = true, S2 = true, SR = true }; -public: - typedef ac_fixed mant_t; - typedef ac_int exp_t; - mant_t m; - exp_t e; - - void set_mantissa(const ac_fixed &man) { m = man; } - void set_exp(const ac_int &exp) { if(E) e = exp; } - -private: - inline bool is_neg() const { return m < 0; } // is_neg would be more efficient - - enum {NZ_E = !!E, MIN_EXP = -(NZ_E << (E-NZ_E)), MAX_EXP = (1 << (E-NZ_E))-1}; - -public: - static const int width = W; - static const int i_width = I; - static const int e_width = E; - static const bool sign = S; - static const ac_q_mode q_mode = Q; - static const ac_o_mode o_mode = AC_SAT; - - template< AC_FL_T0(2) > - struct rt { - enum { - // need to validate - F=W-I, - F2=W2-I2, - mult_w = W+W2, - mult_i = I+I2, - mult_e = 
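+      // exponents add under multiplication, so the product type carries one extra exponent bit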
AC_MAX(E,E2)+1, - mult_s = S||S2, - plus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), - plus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, - plus_e = AC_MAX(E,E2), - plus_s = S||S2, - minus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), - minus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, - minus_e = AC_MAX(E,E2), - minus_s = true, - div_w = W+AC_MAX(W2-I2,0)+S2, - div_i = I+(W2-I2)+S2, - div_e = AC_MAX(E,E2)+1, - div_s = S||S2, - logic_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+AC_MAX(F,F2), - logic_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2)), - logic_s = S||S2, - logic_e = AC_MAX(E,E2) - }; - typedef ac_float mult; - typedef ac_float plus; - typedef ac_float minus; - typedef ac_float logic; - typedef ac_float div; - typedef ac_float arg1; - - }; - - template - struct rt_i { - enum { - lshift_w = W, - lshift_i = I, - lshift_s = S, - lshift_e_0 = exp_t::template rt::plus::width, - lshift_e = AC_MIN(lshift_e_0, 24), - rshift_w = W, - rshift_i = I, - rshift_s = S, - rshift_e_0 = exp_t::template rt::minus::width, - rshift_e = AC_MIN(rshift_e_0, 24) - }; - typedef ac_float lshift; - typedef ac_float rshift; - }; - - template - struct rt_T { - typedef typename ac_private::map::t map_T; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::mult mult; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::plus plus; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus minus; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus2 minus2; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::logic logic; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div div; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div2 div2; - typedef ac_float arg1; - }; - - template - struct rt_T2 { - typedef typename ac_private::map::t map_T; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::mult mult; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::plus plus; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus2 minus; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus minus2; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::logic logic; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div2 div; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div div2; - typedef ac_float arg1; - }; - - struct rt_unary { - enum { - neg_w = W+1, - neg_i = I+1, - neg_e = E, - neg_s = true, - mag_sqr_w = 2*W-S + NO_UN, - mag_sqr_i = 2*I-S + NO_UN, - mag_sqr_e = E, - mag_sqr_s = false | NO_UN, - mag_w = W+S + NO_UN, - mag_i = I+S + NO_UN, - mag_e = E, - mag_s = false | NO_UN, - to_fx_i = I + MAX_EXP, - to_fx_w = W + MAX_EXP - MIN_EXP, - to_fx_s = S, - to_i_w = AC_MAX(to_fx_i,1), - to_i_s = S - }; - typedef ac_float neg; - typedef ac_float mag_sqr; - typedef ac_float mag; - template - struct set { - enum { sum_w = W + ac::log2_ceil::val, sum_i = (sum_w-W) + I, sum_e = E, sum_s = S}; - typedef ac_float sum; - }; - typedef ac_fixed to_ac_fixed_t; - typedef ac_int to_ac_int_t; - }; - - template friend class ac_float; - - ac_float() { -#if defined(AC_DEFAULT_IN_RANGE) -#endif - } - ac_float(const ac_float &op) { - m = op.m; - e = op.e; - } - -private: - template - bool round(const ac_fixed &op2, bool assert_on_rounding=false) { - const bool rnd = Q!=AC_TRN && Q!=AC_TRN_ZERO && W2 > W; - bool rnd_ovfl = false; - m 
= 0; - if(rnd) { - ac_fixed m_1 = op2; - // overflow because of rounding would lead to go from 001111 to 01000 (extra bit prevents it) - // change from 01000 to 00100 and store 0100 in m - rnd_ovfl = !m_1[W] & m_1[W-1]; - m_1[W-1] = m_1[W-1] & !rnd_ovfl; - m_1[W-2] = m_1[W-2] | rnd_ovfl; - m.set_slc(0, m_1.template slc(0)); - if(assert_on_rounding) - AC_ASSERT(m == op2, "Loss of precision due to Rounding"); - return rnd_ovfl; - } else { - ac_fixed m_0 = op2; - m.set_slc(0, m_0.template slc(0)); - return false; - } - } - - template - void assign_from(const ac_fixed &m2, int e2, bool sticky_bit, bool normalize, bool assert_on_rounding=false) { - const bool rnd = Q!=AC_TRN & Q!=AC_TRN_ZERO & W2 > W; - const bool need_rnd_bit = Q != AC_TRN; - const bool need_rem_bits = need_rnd_bit && Q != AC_RND; - - const int msb_min_power = I-1 + MIN_EXP; - const int msb_min_power2 = I2-1 + min_exp2; - const int msb_min_power_dif = msb_min_power - msb_min_power2; - // if > 0: target has additional negative exponent range - // subnormal maybe be further normalized (done even if normalize==false) - // if < 0: target has less negative exponent range - // mantissa may need to be shifted right - // in either case if source is unnormalized - // normalization could take place - - const int msb_max_power = I-1 + MAX_EXP; - const int msb_max_power2 = I2-1 + max_exp2 + rnd; - const int msb_max_power_dif = msb_max_power - msb_max_power2; - - const bool may_shift_right = msb_min_power_dif > 0; - const int max_right_shift = may_shift_right ? msb_min_power_dif : 0; - const int t_width = W2 + (W >= W2 ? AC_MIN(W-W2+may_shift_right, max_right_shift) : 0); - - int e_t = e2; - e_t += I2-I; - typedef ac_fixed op2_t; - op2_t op2 = m2; - int ls = 0; - bool r_zero; - if(normalize) { - bool all_sign; - ls = m2.leading_sign(all_sign); - r_zero = all_sign & !m2[0]; - } else if(msb_min_power_dif < 0 || msb_max_power_dif < 0 || W2 > W) { - // msb_min_power_dif < 0: src exponent less negative than trg exp represents - // oportunity to further normalize value in trg representation - // msb_max_power_dif < 0: max target exp is less than max src exp - // if un-normalized exp may overflow resulting in incorrect saturation - // normalization is needed for correctness - // W2 > W - // if un-normalized, extra bits may be incorrectly quantized away - const int msb_range_dif = AC_MAX(-msb_min_power_dif, -msb_max_power_dif); - const int msb_range_dif_norm_w = AC_MIN(msb_range_dif,W2-1); - const int extra_bits = AC_MAX(W2-W,0); - const int norm_w = AC_MAX(msb_range_dif_norm_w, extra_bits) + 1; - bool all_sign; - ls = m2.template slc(W2-norm_w).leading_sign(all_sign); - r_zero = all_sign & !m2[W2-1] & !(m2 << norm_w); - } else { - r_zero = !m2; - } - int actual_max_shift_left = (1 << (E-1)) + e_t; - if(may_shift_right && actual_max_shift_left < 0) { - const int shift_r_w = ac::nbits::val; - ac_int shift_r = -actual_max_shift_left; - if((1 << (E-1)) + min_exp2 + I2-I < 0 && need_rem_bits) { - op2_t shifted_out_bits = op2; - shifted_out_bits &= ~((~op2_t(0)) << shift_r); - sticky_bit |= !!shifted_out_bits; - } - op2 >>= shift_r; - e_t += shift_r; - } else { - bool shift_exponent_limited = ls >= actual_max_shift_left; - int shift_l = shift_exponent_limited ? actual_max_shift_left : (int) ls; - op2 <<= shift_l; - e_t = shift_exponent_limited ? MIN_EXP : e_t - ls; - } - ac_fixed r_pre_rnd = 0; - r_pre_rnd.set_slc(need_rem_bits, op2.template slc(0)); - if(need_rem_bits) - r_pre_rnd[0] = sticky_bit; - - bool shift_r1 = round(r_pre_rnd); - e_t = r_zero ? 
0 : e_t + shift_r1; - if(!(e_t < 0) & !!(e_t >> E-1)) { - e = MAX_EXP; - m = m < 0 ? value(m) : value(m); - } else { - e = e_t; - } - } - -public: - template - ac_float(const AC_FL(2) &op, bool assert_on_overflow=false, bool assert_on_rounding=false) { - typedef AC_FL(2) fl2_t; - const int min_exp2 = fl2_t::MIN_EXP; - const int max_exp2 = fl2_t::MAX_EXP; - assign_from(op.m, op.e, false, false); - } - - ac_float(const ac_fixed &m2, const ac_int &e2, bool normalize=true) { - m = m2; - e = e2; - if(normalize) - this->normalize(); - else - e &= ac_int<1,true>(!!m); - } - - template - ac_float(const ac_fixed &m2, const ac_int &e2, bool normalize=true) { - enum { WF2 = WFX+!SFX, IF2 = IFX+!SFX }; - ac_float f(ac_fixed(m2), e2, normalize); - *this = f; - } - - template - ac_float(const ac_fixed &op) { - assign_from<0,0>(ac_fixed(op), 0, false, true); - } - - template - ac_float(const ac_int &op) { - *this = ac_fixed(op); - } - - inline ac_float( bool b ) { *this = (ac_int<1,false>) b; } - inline ac_float( char b ) { *this = (ac_int<8,true>) b; } - inline ac_float( signed char b ) { *this = (ac_int<8,true>) b; } - inline ac_float( unsigned char b ) { *this = (ac_int<8,false>) b; } - inline ac_float( signed short b ) { *this = (ac_int<16,true>) b; } - inline ac_float( unsigned short b ) { *this = (ac_int<16,false>) b; } - inline ac_float( signed int b ) { *this = (ac_int<32,true>) b; } - inline ac_float( unsigned int b ) { *this = (ac_int<32,false>) b; } - inline ac_float( signed long b ) { *this = (ac_int) b; } - inline ac_float( unsigned long b ) { *this = (ac_int) b; } - inline ac_float( Slong b ) { *this = (ac_int<64,true>) b; } - inline ac_float( Ulong b ) { *this = (ac_int<64,false>) b; } - - // Explicit conversion functions to ac_int and ac_fixed - inline typename rt_unary::to_ac_fixed_t to_ac_fixed() const { - typename rt_unary::to_ac_fixed_t r = m; - r <<= e; - return r; - } - inline typename rt_unary::to_ac_int_t to_ac_int() const { - return to_ac_fixed().to_ac_int(); - } - - // Explicit conversion functions to C built-in types ------------- - inline int to_int() const { return to_ac_int().to_int(); } - inline unsigned to_uint() const { return to_ac_int().to_uint(); } - inline long to_long() const { return (signed long) to_ac_int().to_int64(); } - inline unsigned long to_ulong() const { return (unsigned long) to_ac_int().to_uint64(); } - inline Slong to_int64() const { return to_ac_int().to_int64(); } - inline Ulong to_uint64() const { return to_ac_int().to_uint64(); } - inline float to_float() const { return ldexpf(m.to_double(), exp()); } - inline double to_double() const { return ldexp(m.to_double(), exp()); } - - const ac_fixed mantissa() const { return m; } - const ac_int exp() const { return e; } - bool normalize() { - bool all_sign; - int ls = m.leading_sign(all_sign); - bool m_zero = all_sign & !m[0]; - const int max_shift_left = (1 << (E-1)) + e; - bool normal = ls <= max_shift_left; - int shift_l = normal ? 
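+    // shift by the full leading-sign count when possible; otherwise clamp the shift
+    // so the exponent bottoms out at MIN_EXP (the value is then left denormal)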
ls : max_shift_left; - m <<= shift_l; - e = ac_int<1,true>(!m_zero) & (e - shift_l); - return normal; - } - - ac_float( double d, bool assert_on_overflow=false, bool assert_on_rounding=false ) { - enum { I_EXT = AC_MAX(I,1), W_EXT = ac_private::ac_float_cdouble_t::width + I_EXT - 1, }; - ac_private::ac_float_cdouble_t t = ac_private::double_to_ac_float(d); - ac_float r(t, assert_on_overflow, assert_on_rounding); - *this = r; - } - - ac_float( float f, bool assert_on_overflow=false, bool assert_on_rounding=false ) { - enum { I_EXT = AC_MAX(I,1), W_EXT = ac_private::ac_float_cfloat_t::width + I_EXT - 1, }; - ac_private::ac_float_cfloat_t t = ac_private::float_to_ac_float(f); - ac_float r(t, assert_on_overflow, assert_on_rounding); - *this = r; - } - - template - bool compare(const AC_FL(2) &op2, bool *gt) const { - typedef ac_fixed fx2_t; - typedef typename ac_fixed::template rt_T< fx2_t >::logic fx_t; - typedef ac_fixed fxu_t; - - fx2_t op2_m_0; - op2_m_0.set_slc(0, op2.m.template slc(0)); - - fx_t op1_m = m; - fx_t op2_m = op2_m_0; - int e_dif = exp() - op2.exp() + I - I2; - bool op2_m_neg = op2_m[fx_t::width-1]; - fx_t out_bits = op2_m ^ ((op2_m_neg & e_dif < 0) ? ~fx_t(0) : fx_t(0)); - out_bits &= ~(fxu_t(~fxu_t(0)) << e_dif); - op2_m >>= e_dif; - bool overflow = e_dif < 0 & !!out_bits | op2_m_neg ^ op2_m[fx_t::width-1]; - - *gt = overflow & op2_m_neg | !overflow & op1_m > op2_m; - bool eq = op1_m == op2_m & !overflow & !out_bits; - return eq; - } - - template - void plus_minus(const AC_FL(2) &op2, AC_FL(R) &r, bool sub=false) const { - typedef AC_FL(2) op2_t; - enum { IT = AC_MAX(I,I2) }; - typedef ac_fixed fx1_t; - typedef ac_fixed fx2_t; - // covers fx1_t and r mantissas (adds additional LSBs if WR > W) - typedef typename fx1_t::template rt_T< ac_fixed >::logic fx1r_t; - // covers fx2_t and r mantissas (adds additional LSBs if WR > W2) - typedef typename fx2_t::template rt_T< ac_fixed >::logic fx2r_t; - // mt_t adds one integer bit for the plus - // op1_m, op2_m, op_sl, sticky_bits - typedef typename fx1r_t::template rt_T::plus mt_t; - - const bool round_bit_needed = QR != AC_TRN; - const bool remaining_bits_needed = !(QR == AC_TRN || QR == AC_RND); - - const int w_r_with_round_bits = WR + round_bit_needed; - - // naming: sn = subnormal, n = normal, wc = worst case - // worst case (wc) normalize is when one operand has smallest subnormal - // and other operand is shifted right so that its MSB lines up with LSB of subnormal - const int power_smallest_sn1 = I - W - (1 << (E-1)); - const int power_smallest_sn2 = I2 - W2 - (1 << (E2-1)); - const int power_smallest_sn_dif1 = AC_MAX(0,power_smallest_sn2 - power_smallest_sn1); - const int power_smallest_sn_dif2 = AC_MAX(0,power_smallest_sn1 - power_smallest_sn2); - const int wc_norm_shift1 = W2-1 + AC_MIN(power_smallest_sn_dif1, W-1); - const int wc_norm_shift2 = W-1 + AC_MIN(power_smallest_sn_dif2, W2-1); - const int wc_sn_norm_shift = AC_MAX(wc_norm_shift1, wc_norm_shift2); - const int w_sn_overlap = wc_sn_norm_shift + 1; - - // cases when one operand is subnormal and other is shifted right and does not overlap bits - // subnormal op could be normalized by width-1 bits - const int w_sn_no_overlap1 = W + AC_MIN(w_r_with_round_bits, power_smallest_sn_dif2); - const int w_sn_no_overlap2 = W2 + AC_MIN(w_r_with_round_bits, power_smallest_sn_dif1); - const int w_sn_no_overlap = AC_MAX(w_sn_no_overlap1, w_sn_no_overlap2); - - const int w_sn = AC_MAX(w_sn_overlap, w_sn_no_overlap); - - // For example 0100 + (1000 0001 >> 1) = 0000 0000 1, 
wc_n_norm_shift = max(4,8) - const int msb0h1 = I-1 + (int) MAX_EXP; - const int msb1h1 = msb0h1-1; - const int msb0l1 = I-1 + (int) MIN_EXP; - const int msb1l1 = msb0h1-1; - const int msb0h2 = I2-1 + (int) op2_t::MAX_EXP; - const int msb1h2 = msb0h2-1; - const int msb0l2 = I2-1 + (int) op2_t::MIN_EXP; - const int msb1l2 = msb0h2-1; - // bit W-1 overlap with bit W2-2 - const bool msb_overlap1 = msb1h2 >= msb0h1 && msb0h1 <= msb1l2 - || msb1h2 >= msb0l1 && msb0l1 <= msb1l2 - || msb0h1 >= msb1h2 && msb1h2 >= msb0l1; - // bit W2-1 overlap with bit W1-2 - const bool msb_overlap2 = msb1h1 >= msb0h2 && msb0h2 <= msb1l1 - || msb1h1 >= msb0l2 && msb0l2 <= msb1l1 - || msb0h2 >= msb1h1 && msb1h1 >= msb0l2; - const bool msb_overlap = msb_overlap1 || msb_overlap2; - const int wc_n_norm_shift = AC_MAX(W,W2); - const int w_n_msb_overlap = msb_overlap ? wc_n_norm_shift + 1 : 0; - // addition of two numbers of different sign can result in a normalization by 1 (therefore + 1) - const int w_n_no_msb_overlap = w_r_with_round_bits + 1; - const int w_n = AC_MAX(w_n_msb_overlap, w_n_no_msb_overlap); - - // +1 is to prevent overflow during addition - const int tr_t_width = AC_MAX(w_n, w_sn) + 1; - typedef ac_fixed add_t; - - const int min_E = (int) MIN_EXP + I-IT; - const int min_E2 = (int) AC_FL(2)::MIN_EXP + I2-IT; - const int min_ET = AC_MIN(min_E, min_E2); - - const int max_E = (int) MAX_EXP + I-IT; - const int max_E2 = (int) AC_FL(2)::MAX_EXP + I2-IT; - const int max_ET = AC_MAX(max_E, max_E2); - - ac_fixed op1_m_0 = m; - mt_t op1_m = 0; - op1_m.set_slc(0, op1_m_0.template slc(0)); - int op1_e = exp() + I-IT; - - ac_fixed op2_m_0 = op2.m; - mt_t op2_m = 0; - op2_m.set_slc(0, op2_m_0.template slc(0)); - if(sub) - op2_m = -op2_m; - int op2_e = op2.exp() + I2-IT; - - bool op1_zero = operator !(); - bool op2_zero = !op2; - int e_dif = op1_e - op2_e; - bool e1_lt_e2 = e_dif < 0; - e_dif = (op1_zero | op2_zero) ? 0 : e1_lt_e2 ? -e_dif : e_dif; - - add_t op_lshift = e1_lt_e2 ? op1_m : op2_m; - mt_t op_no_shift = e1_lt_e2 ? op2_m : op1_m; - - bool sticky_bit = false; - if(remaining_bits_needed) { - mt_t shifted_out_bits = op_lshift; - // bits that are shifted out of a add_t (does not include potential 3 spare bits) - shifted_out_bits &= ~((~add_t(0)) << e_dif); - sticky_bit = !!shifted_out_bits; - } - op_lshift >>= e_dif; - - add_t add_r = op_lshift + op_no_shift; - int e_t = (e1_lt_e2 & !op2_zero | op1_zero ? op2_e : op1_e); - - r.template assign_from(add_r, e_t, sticky_bit, true); - } - - template - ac_float add(const AC_FL(1) &op1, const AC_FL(2) &op2) { - op1.plus_minus(op2, *this); - return *this; - } - - template - ac_float sub(const AC_FL(1) &op1, const AC_FL(2) &op2) { - op1.plus_minus(op2, *this, true); - return *this; - } - - typename rt_unary::neg abs() const { - typedef typename rt_unary::neg r_t; - r_t r; - r.m = is_neg() ? -m : r_t::mant_t(m); - r.e = e; - return r; - } - -#ifdef __AC_FLOAT_ENABLE_ALPHA - // These will be changed!!! 
For now only enable to explore integration with ac_complex - template - typename rt< AC_FL_TV0(2) >::plus operator +(const AC_FL(2) &op2) const { - typename rt< AC_FL_TV0(2) >::plus r; - plus_minus(op2, r); - return r; - } - template - typename rt< AC_FL_TV0(2) >::minus operator -(const AC_FL(2) &op2) const { - typename rt< AC_FL_TV0(2) >::minus r; - plus_minus(op2, r, true); - return r; - } -#endif - - template - typename rt< AC_FL_TV0(2) >::mult operator *(const AC_FL(2) &op2) const { - typedef typename rt< AC_FL_TV0(2) >::mult r_t; - r_t r(m*op2.m, exp()+op2.exp(), false); - return r; - } - - template - typename rt< AC_FL_TV0(2) >::div operator /(const AC_FL(2) &op2) const { - typename rt< AC_FL_TV0(2) >::div r(m/op2.m, exp()-op2.exp()); - return r; - } - template - ac_float &operator +=(const AC_FL(2) &op2) { - ac_float r; - plus_minus(op2, r); - *this = r; - return *this; - } - template - ac_float &operator -=(const AC_FL(2) &op2) { - ac_float r; - plus_minus(op2, r, true); - *this = r; - return *this; - } - template - ac_float &operator *=(const AC_FL(2) &op2) { - *this = *this * op2; - return *this; - } - template - ac_float &operator /=(const AC_FL(2) &op2) { - *this = *this / op2; - return *this; - } - ac_float operator + () const { - return *this; - } - typename rt_unary::neg operator - () const { - typename rt_unary::neg r; - r.m = -m; - r.e = e; - return r; - } - bool operator ! () const { - return !m; - } - - // Shift -------------------------------------------------------------------- - template - typename rt_i::lshift operator << ( const ac_int &op2 ) const { - typename rt_i::lshift r; - r.m = m; - r.e = e + op2; - return r; - } - template - typename rt_i::rshift operator >> ( const ac_int &op2 ) const { - typename rt_i::rshift r; - r.m = m; - r.e = e - op2; - return r; - } - // Shift assign ------------------------------------------------------------- - template - ac_float &operator <<= ( const ac_int &op2 ) { - *this = operator << (op2); - return *this; - } - template - ac_float &operator >>= ( const ac_int &op2 ) { - *this = operator >> (op2); - return *this; - } - - template - bool operator == (const AC_FL(2) &f) const { - bool gt; - return compare(f, >); - } - template - bool operator != (const AC_FL(2) &f) const { - return !operator == (f); - } - template - bool operator < (const AC_FL(2) &f) const { - bool gt; - bool eq = compare(f, >); - return !(eq | gt); - } - template - bool operator >= (const AC_FL(2) &f) const { - return !operator < (f); - } - template - bool operator > (const AC_FL(2) &f) const { - bool gt; - compare(f, >); - return gt; - } - template - bool operator <= (const AC_FL(2) &f) const { - return !operator > (f); - } - - inline std::string to_string(ac_base_mode base_rep, bool sign_mag = false, bool hw=true) const { - // TODO: printing decimal with exponent - if(!hw) { - ac_fixed mantissa; - mantissa.set_slc(0, m.template slc(0)); - std::string r = mantissa.to_string(base_rep, sign_mag); - r += "e2"; - r += (e + I).to_string(base_rep, sign_mag | base_rep == AC_DEC); - return r; - } else { - std::string r = m.to_string(base_rep, sign_mag); - if(base_rep != AC_DEC) - r += "_"; - r += "e2"; - if(base_rep != AC_DEC) - r += "_"; - if(E) - r += e.to_string(base_rep, sign_mag | base_rep == AC_DEC); - else - r += "0"; - return r; - } - } - - inline static std::string type_name() { - const char *tf[] = {"false", "true" }; - const char *q[] = {"AC_TRN", "AC_RND", "AC_TRN_ZERO", "AC_RND_ZERO", "AC_RND_INF", "AC_RND_MIN_INF", "AC_RND_CONV" }; - std::string r = 
"ac_float<"; - r += ac_int<32,true>(W).to_string(AC_DEC) + ','; - r += ac_int<32,true>(I).to_string(AC_DEC) + ','; - r += ac_int<32,true>(E).to_string(AC_DEC) + ','; - r += tf[S]; - r += ','; - r += q[Q]; - r += '>'; - return r; - } - - template - inline ac_float &set_val() { - m.template set_val(); - if(V == AC_VAL_MIN) - e.template set_val(); - else if(V == AC_VAL_QUANTUM) - e.template set_val(); - else - e.template set_val(); - return *this; - } -}; - -namespace ac_private { - template - bool ac_fpclassify(T x, bool &inf) { - bool nan = !(x==x); - if(!nan) { - T d = x - x; - inf = !(d==d); - } - return nan; - } - - inline ac_float_cdouble_t double_to_ac_float(double d) { - typedef ac_float_cdouble_t r_t; -#ifndef __SYNTHESIS__ - bool inf; - bool nan = ac_fpclassify(d, inf); - if(nan) - AC_ASSERT(0, "In conversion from double to ac_float: double is NaN"); - else if(inf) - AC_ASSERT(0, "In conversion from double to ac_float: double is Infinite"); -#endif - r_t::exp_t exp; - r_t::mant_t mant = ac::frexp_d(d, exp); - return r_t(mant, exp, false); - } - - inline ac_float_cfloat_t float_to_ac_float(float f) { - typedef ac_float_cfloat_t r_t; -#ifndef __SYNTHESIS__ - bool inf; - bool nan = ac_fpclassify(f, inf); - if(nan) - AC_ASSERT(0, "In conversion from float to ac_float: float is NaN"); - else if(inf) - AC_ASSERT(0, "In conversion from float to ac_float: float is Infinite"); -#endif - r_t::exp_t exp; - r_t::mant_t mant = ac::frexp_f(f, exp); - return r_t(mant, exp, false); - } -}; - -namespace ac { - template - struct ac_float_represent { - typedef typename ac_fixed_represent::type fx_t; - typedef ac_float type; - }; - template<> struct ac_float_represent { - typedef ac_private::ac_float_cfloat_t type; - }; - template<> struct ac_float_represent { - typedef ac_private::ac_float_cdouble_t type; - }; -} - -namespace ac_private { - // with T == ac_float - template< AC_FL_T0(2) > - struct rt_ac_float_T< AC_FL0(2) > { - typedef AC_FL0(2) fl2_t; - template< AC_FL_T0() > - struct op1 { - typedef AC_FL0() fl_t; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::plus plus; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; - typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; - typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; - }; - }; - // with T == ac_fixed - template - struct rt_ac_float_T< ac_fixed > { - // For now E2 > 0 - enum { E2 = 1, S2 = true, W2 = WFX + !SFX, I2 = IFX + !SFX }; - typedef AC_FL0(2) fl2_t; - template< AC_FL_T0() > - struct op1 { - typedef AC_FL0() fl_t; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::plus plus; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; - typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; - typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; - }; - }; - // with T == ac_int - template - struct rt_ac_float_T< ac_int > { - // For now E2 > 0 - enum { E2 = 1, S2 = true, I2 = WI + !SI, W2 = I2 }; - typedef AC_FL0(2) fl2_t; - template< AC_FL_T0() > - struct op1 { - typedef AC_FL0() fl_t; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; - typedef typename 
fl_t::template rt< AC_FL_TV0(2) >::plus plus; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; - typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; - typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; - }; - }; - - // Multiplication is optimizable, general operator +/- is not yet supported - template - struct rt_ac_float_T< c_type > { - // For now E2 > 0 - enum { SCT = c_type_params::S, S2 = true, W2 = c_type_params::W + !SCT, I2 = c_type_params::I + !SCT, E2 = AC_MAX(1, c_type_params::E) }; - typedef AC_FL0(2) fl2_t; - template< AC_FL_T0() > - struct op1 { - typedef AC_FL0() fl_t; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::plus plus; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; - typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; - typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; - }; - }; -} - -// Stream -------------------------------------------------------------------- - -#ifndef __SYNTHESIS__ -template -inline std::ostream& operator << (std::ostream &os, const AC_FL() &x) { - os << x.to_string(AC_DEC); - return os; -} -#endif - -#define FL_BIN_OP_WITH_CTYPE(BIN_OP, C_TYPE, RTYPE) \ - template< AC_FL_T() > \ - inline typename AC_FL()::template rt_T2::RTYPE operator BIN_OP ( C_TYPE c_op, const AC_FL() &op) { \ - typedef typename ac::template ac_float_represent::type fl2_t; \ - return fl2_t(c_op).operator BIN_OP (op); \ - } \ - template< AC_FL_T() > \ - inline typename AC_FL()::template rt_T::RTYPE operator BIN_OP ( const AC_FL() &op, C_TYPE c_op) { \ - typedef typename ac::template ac_float_represent::type fl2_t; \ - return op.operator BIN_OP (fl2_t(c_op)); \ - } - -#define FL_REL_OP_WITH_CTYPE(REL_OP, C_TYPE) \ - template< AC_FL_T() > \ - inline bool operator REL_OP ( const AC_FL() &op, C_TYPE op2) { \ - typedef typename ac::template ac_float_represent::type fl2_t; \ - return op.operator REL_OP (fl2_t(op2)); \ - } \ - template< AC_FL_T() > \ - inline bool operator REL_OP ( C_TYPE op2, const AC_FL() &op) { \ - typedef typename ac::template ac_float_represent::type fl2_t; \ - return fl2_t(op2).operator REL_OP (op); \ - } - -#define FL_ASSIGN_OP_WITH_CTYPE_2(ASSIGN_OP, C_TYPE) \ - template< AC_FL_T() > \ - inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, C_TYPE op2) { \ - typedef typename ac::template ac_float_represent::type fl2_t; \ - return op.operator ASSIGN_OP (fl2_t(op2)); \ - } - -#ifdef __AC_FLOAT_ENABLE_ALPHA -#define FL_BIN_OP_WITH_CTYPE_ALPHA(C_TYPE) \ - FL_BIN_OP_WITH_CTYPE(+, C_TYPE, plus) \ - FL_BIN_OP_WITH_CTYPE(-, C_TYPE, minus) -#else -#define FL_BIN_OP_WITH_CTYPE_ALPHA(C_TYPE) -#endif - -#define FL_OPS_WITH_CTYPE(C_TYPE) \ - FL_BIN_OP_WITH_CTYPE_ALPHA(C_TYPE) \ - FL_BIN_OP_WITH_CTYPE(*, C_TYPE, mult) \ - FL_BIN_OP_WITH_CTYPE(/, C_TYPE, div) \ - \ - FL_REL_OP_WITH_CTYPE(==, C_TYPE) \ - FL_REL_OP_WITH_CTYPE(!=, C_TYPE) \ - FL_REL_OP_WITH_CTYPE(>, C_TYPE) \ - FL_REL_OP_WITH_CTYPE(>=, C_TYPE) \ - FL_REL_OP_WITH_CTYPE(<, C_TYPE) \ - FL_REL_OP_WITH_CTYPE(<=, C_TYPE) \ - \ - FL_ASSIGN_OP_WITH_CTYPE_2(+=, C_TYPE) \ - FL_ASSIGN_OP_WITH_CTYPE_2(-=, C_TYPE) \ - FL_ASSIGN_OP_WITH_CTYPE_2(*=, C_TYPE) \ - FL_ASSIGN_OP_WITH_CTYPE_2(/=, C_TYPE) - -#define 
FL_SHIFT_OP_WITH_INT_CTYPE(BIN_OP, C_TYPE, RTYPE) \ - template< AC_FL_T() > \ - inline typename AC_FL()::template rt_i< ac_private::c_type_params::W, ac_private::c_type_params::S >::RTYPE operator BIN_OP ( const AC_FL() &op, C_TYPE i_op) { \ - typedef typename ac::template ac_int_represent::type i_t; \ - return op.operator BIN_OP (i_t(i_op)); \ - } - -#define FL_SHIFT_ASSIGN_OP_WITH_INT_CTYPE(ASSIGN_OP, C_TYPE) \ - template< AC_FL_T() > \ - inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, C_TYPE i_op) { \ - typedef typename ac::template ac_int_represent::type i_t; \ - return op.operator ASSIGN_OP (i_t(i_op)); \ - } - -#define FL_SHIFT_OPS_WITH_INT_CTYPE(C_TYPE) \ - FL_SHIFT_OP_WITH_INT_CTYPE(>>, C_TYPE, rshift) \ - FL_SHIFT_OP_WITH_INT_CTYPE(<<, C_TYPE, lshift) \ - FL_SHIFT_ASSIGN_OP_WITH_INT_CTYPE(>>=, C_TYPE) \ - FL_SHIFT_ASSIGN_OP_WITH_INT_CTYPE(<<=, C_TYPE) - -#define FL_OPS_WITH_INT_CTYPE(C_TYPE) \ - FL_OPS_WITH_CTYPE(C_TYPE) \ - FL_SHIFT_OPS_WITH_INT_CTYPE(C_TYPE) - -// --------------------------------------- End of Macros for Binary Operators with C Floats - - // Binary Operators with C Floats -------------------------------------------- - FL_OPS_WITH_CTYPE(float) - FL_OPS_WITH_CTYPE(double) - FL_OPS_WITH_INT_CTYPE(bool) - FL_OPS_WITH_INT_CTYPE(char) - FL_OPS_WITH_INT_CTYPE(signed char) - FL_OPS_WITH_INT_CTYPE(unsigned char) - FL_OPS_WITH_INT_CTYPE(short) - FL_OPS_WITH_INT_CTYPE(unsigned short) - FL_OPS_WITH_INT_CTYPE(int) - FL_OPS_WITH_INT_CTYPE(unsigned int) - FL_OPS_WITH_INT_CTYPE(long) - FL_OPS_WITH_INT_CTYPE(unsigned long) - FL_OPS_WITH_INT_CTYPE(Slong) - FL_OPS_WITH_INT_CTYPE(Ulong) - // -------------------------------------- End of Binary Operators with C Floats - -// Macros for Binary Operators with ac_int -------------------------------------------- - -#define FL_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ - template< AC_FL_T(), int WI, bool SI> \ - inline typename AC_FL()::template rt_T2< ac_int >::RTYPE operator BIN_OP ( const ac_int &i_op, const AC_FL() &op) { \ - typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ - return fl2_t(i_op).operator BIN_OP (op); \ - } - -#define FL_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) \ - template< AC_FL_T(), int WI, bool SI> \ - inline typename AC_FL()::template rt_T2< ac_int >::RTYPE operator BIN_OP ( const AC_FL() &op, const ac_int &i_op) { \ - typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ - return op.operator BIN_OP (fl2_t(i_op)); \ - } - -#define FL_BIN_OP_WITH_AC_INT(BIN_OP, RTYPE) \ - FL_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ - FL_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) - -#define FL_REL_OP_WITH_AC_INT(REL_OP) \ - template< AC_FL_T(), int WI, bool SI> \ - inline bool operator REL_OP ( const AC_FL() &op, const ac_int &op2) { \ - typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ - return op.operator REL_OP (fl2_t(op2)); \ - } \ - template< AC_FL_T(), int WI, bool SI> \ - inline bool operator REL_OP ( ac_int &op2, const AC_FL() &op) { \ - typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ - return fl2_t(op2).operator REL_OP (op); \ - } - -#define FL_ASSIGN_OP_WITH_AC_INT(ASSIGN_OP) \ - template< AC_FL_T(), int WI, bool SI> \ - inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, const ac_int &op2) { \ - typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ - return op.operator ASSIGN_OP (fl2_t(op2)); \ - } - -// -------------------------------------------- End of Macros for Binary Operators with ac_int - - // Binary Operators with ac_int 
-------------------------------------------- -#ifdef __AC_FLOAT_ENABLE_ALPHA - FL_BIN_OP_WITH_AC_INT(+, plus) - FL_BIN_OP_WITH_AC_INT(-, minus) -#endif - FL_BIN_OP_WITH_AC_INT(*, mult) - FL_BIN_OP_WITH_AC_INT(/, div) - - FL_REL_OP_WITH_AC_INT(==) - FL_REL_OP_WITH_AC_INT(!=) - FL_REL_OP_WITH_AC_INT(>) - FL_REL_OP_WITH_AC_INT(>=) - FL_REL_OP_WITH_AC_INT(<) - FL_REL_OP_WITH_AC_INT(<=) - - FL_ASSIGN_OP_WITH_AC_INT(+=) - FL_ASSIGN_OP_WITH_AC_INT(-=) - FL_ASSIGN_OP_WITH_AC_INT(*=) - FL_ASSIGN_OP_WITH_AC_INT(/=) - FL_ASSIGN_OP_WITH_AC_INT(%=) - // -------------------------------------- End of Binary Operators with ac_int - -// Macros for Binary Operators with ac_fixed -------------------------------------------- - -#define FL_BIN_OP_WITH_AC_FIXED_1(BIN_OP, RTYPE) \ - template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ - inline typename AC_FL()::template rt_T2< ac_fixed >::RTYPE operator BIN_OP ( const ac_fixed &f_op, const AC_FL() &op) { \ - typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ - return fl2_t(f_op).operator BIN_OP (op); \ - } - -#define FL_BIN_OP_WITH_AC_FIXED_2(BIN_OP, RTYPE) \ - template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ - inline typename AC_FL()::template rt_T2< ac_fixed >::RTYPE operator BIN_OP ( const AC_FL() &op, const ac_fixed &f_op) { \ - typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ - return op.operator BIN_OP (fl2_t(f_op)); \ - } - -#define FL_BIN_OP_WITH_AC_FIXED(BIN_OP, RTYPE) \ - FL_BIN_OP_WITH_AC_FIXED_1(BIN_OP, RTYPE) \ - FL_BIN_OP_WITH_AC_FIXED_2(BIN_OP, RTYPE) - -#define FL_REL_OP_WITH_AC_FIXED(REL_OP) \ - template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ - inline bool operator REL_OP ( const AC_FL() &op, const ac_fixed &op2) { \ - typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ - return op.operator REL_OP (fl2_t(op2)); \ - } \ - template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ - inline bool operator REL_OP ( ac_fixed &op2, const AC_FL() &op) { \ - typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ - return fl2_t(op2).operator REL_OP (op); \ - } - -#define FL_ASSIGN_OP_WITH_AC_FIXED(ASSIGN_OP) \ - template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ - inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, const ac_fixed &op2) { \ - typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ - return op.operator ASSIGN_OP (fl2_t(op2)); \ - } - -// -------------------------------------------- End of Macros for Binary Operators with ac_fixed - - // Binary Operators with ac_fixed -------------------------------------------- -#ifdef __AC_FLOAT_ENABLE_ALPHA - FL_BIN_OP_WITH_AC_FIXED(+, plus) - FL_BIN_OP_WITH_AC_FIXED(-, minus) -#endif - FL_BIN_OP_WITH_AC_FIXED(*, mult) - FL_BIN_OP_WITH_AC_FIXED(/, div) - - FL_REL_OP_WITH_AC_FIXED(==) - FL_REL_OP_WITH_AC_FIXED(!=) - FL_REL_OP_WITH_AC_FIXED(>) - FL_REL_OP_WITH_AC_FIXED(>=) - FL_REL_OP_WITH_AC_FIXED(<) - FL_REL_OP_WITH_AC_FIXED(<=) - - FL_ASSIGN_OP_WITH_AC_FIXED(+=) - FL_ASSIGN_OP_WITH_AC_FIXED(-=) - FL_ASSIGN_OP_WITH_AC_FIXED(*=) - FL_ASSIGN_OP_WITH_AC_FIXED(/=) - // -------------------------------------- End of Binary Operators with ac_fixed - -// Global templatized functions for easy initialization to special values -template -inline AC_FL() value( AC_FL() ) { - AC_FL() r; - return r.template set_val(); -} - -namespace ac { -// function to initialize (or uninitialize) arrays 
- template - inline bool init_array( AC_FL() *a, int n) { - AC_FL0() t; - t.template set_val(); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } -} - -/////////////////////////////////////////////////////////////////////////////// - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( pop ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - -#ifdef __AC_NAMESPACE -} -#endif - -#endif // __AC_FLOAT_H +/************************************************************************** + * * + * Algorithmic C (tm) Datatypes * + * * + * Software Version: 4.0 * + * * + * Release Date : Sat Jun 13 12:35:18 PDT 2020 * + * Release Type : Production Release * + * Release Build : 4.0.0 * + * * + * Copyright 2013-2019, Mentor Graphics Corporation, * + * * + * All Rights Reserved. * + * * + ************************************************************************** + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * + * implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + ************************************************************************** + * * + * The most recent version of this package is available at github. * + * * + *************************************************************************/ + +// Source: ac_float.h +// Description: class for floating point operation handling in C++ +// Author: Andres Takach, Ph.D. + +#ifndef __AC_FLOAT_H +#define __AC_FLOAT_H + +#include + +#ifndef __SYNTHESIS__ +#include +#endif + +#if (defined(__GNUC__) && __GNUC__ < 3 && !defined(__EDG__)) +#error GCC version 3 or greater is required to include this header file +#endif + +#if (defined(_MSC_VER) && _MSC_VER < 1400 && !defined(__EDG__)) +#error Microsoft Visual Studio 8 or newer is required to include this header file +#endif + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( push ) +#pragma warning( disable: 4003 4127 4308 4365 4514 4800 ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wparentheses" +#endif +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wparentheses" +#pragma clang diagnostic ignored "-Wlogical-op-parentheses" +#pragma clang diagnostic ignored "-Wbitwise-op-parentheses" +#endif + +// for safety +#if (defined(E) || defined(WF) || defined(IF) || defined(SF)) +#error One or more of the following is defined: E, WF, IF, SF. Definition conflicts with their usage as template parameters. +#error DO NOT use defines before including third party header files. 
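+// Hypothetical example of the collision this guard prevents: an object-like
+// macro that shares a template parameter's name is textually substituted
+// into every declaration below before the compiler sees it:
+//   #define E 11
+//   #include <ac_float.h>   // template<int W, int I, int E, ...> reaches the
+//                           // compiler as template<int W, int I, int 11, ...>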
+#endif + +#define AC_FL(v) ac_float +#define AC_FL0(v) ac_float +#define AC_FL_T(v) int W##v, int I##v, int E##v, ac_q_mode Q##v +#define AC_FL_TV(v) W##v, I##v, E##v, Q##v +#define AC_FL_T0(v) int W##v, int I##v, int E##v +#define AC_FL_TV0(v) W##v, I##v, E##v + +#ifdef __AC_NAMESPACE +namespace __AC_NAMESPACE { +#endif + +template class ac_float; + +namespace ac_private { + + typedef ac_float<54,2,11> ac_float_cdouble_t; + typedef ac_float<25,2,8> ac_float_cfloat_t; + + template + struct rt_ac_float_T { + template< AC_FL_T0() > + struct op1 { + typedef AC_FL0() fl_t; + typedef typename T::template rt_T::mult mult; + typedef typename T::template rt_T::plus plus; + typedef typename T::template rt_T::minus2 minus; + typedef typename T::template rt_T::minus minus2; + typedef typename T::template rt_T::logic logic; + typedef typename T::template rt_T::div2 div; + typedef typename T::template rt_T::div div2; + }; + }; + // specializations after definition of ac_float + + inline ac_float_cdouble_t double_to_ac_float(double d); + inline ac_float_cfloat_t float_to_ac_float(float f); +} + +////////////////////////////////////////////////////////////////////////////// +// ac_float +////////////////////////////////////////////////////////////////////////////// + +template< AC_FL_T() > +class ac_float { + enum { NO_UN = true, S = true, S2 = true, SR = true }; +public: + typedef ac_fixed mant_t; + typedef ac_int exp_t; + mant_t m; + exp_t e; + + void set_mantissa(const ac_fixed &man) { m = man; } + void set_exp(const ac_int &exp) { if(E) e = exp; } + +private: + inline bool is_neg() const { return m < 0; } // is_neg would be more efficient + + enum {NZ_E = !!E, MIN_EXP = -(NZ_E << (E-NZ_E)), MAX_EXP = (1 << (E-NZ_E))-1}; + +public: + static const int width = W; + static const int i_width = I; + static const int e_width = E; + static const bool sign = S; + static const ac_q_mode q_mode = Q; + static const ac_o_mode o_mode = AC_SAT; + + template< AC_FL_T0(2) > + struct rt { + enum { + // need to validate + F=W-I, + F2=W2-I2, + mult_w = W+W2, + mult_i = I+I2, + mult_e = AC_MAX(E,E2)+1, + mult_s = S||S2, + plus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), + plus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, + plus_e = AC_MAX(E,E2), + plus_s = S||S2, + minus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), + minus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, + minus_e = AC_MAX(E,E2), + minus_s = true, + div_w = W+AC_MAX(W2-I2,0)+S2, + div_i = I+(W2-I2)+S2, + div_e = AC_MAX(E,E2)+1, + div_s = S||S2, + logic_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+AC_MAX(F,F2), + logic_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2)), + logic_s = S||S2, + logic_e = AC_MAX(E,E2) + }; + typedef ac_float mult; + typedef ac_float plus; + typedef ac_float minus; + typedef ac_float logic; + typedef ac_float div; + typedef ac_float arg1; + + }; + + template + struct rt_i { + enum { + lshift_w = W, + lshift_i = I, + lshift_s = S, + lshift_e_0 = exp_t::template rt::plus::width, + lshift_e = AC_MIN(lshift_e_0, 24), + rshift_w = W, + rshift_i = I, + rshift_s = S, + rshift_e_0 = exp_t::template rt::minus::width, + rshift_e = AC_MIN(rshift_e_0, 24) + }; + typedef ac_float lshift; + typedef ac_float rshift; + }; + + template + struct rt_T { + typedef typename ac_private::map::t map_T; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::mult mult; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::plus plus; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus minus; + typedef typename 
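+  // Sketch (illustrative widths, not taken from this header) of how the rt
+  // helper above is meant to be used: the compiler derives the product type
+  // of two differently sized ac_float values instead of the caller
+  // hard-coding it.
+  //   ac_float<25,2,8>  a = 1.5f;
+  //   ac_float<54,2,11> b = 2.0;
+  //   ac_float<25,2,8>::rt<54,2,11>::mult p = a * b;  // mult_w = 25+54, mult_e = 11+1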
ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus2 minus2; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::logic logic; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div div; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div2 div2; + typedef ac_float arg1; + }; + + template + struct rt_T2 { + typedef typename ac_private::map::t map_T; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::mult mult; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::plus plus; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus2 minus; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus minus2; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::logic logic; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div2 div; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div div2; + typedef ac_float arg1; + }; + + struct rt_unary { + enum { + neg_w = W+1, + neg_i = I+1, + neg_e = E, + neg_s = true, + mag_sqr_w = 2*W-S + NO_UN, + mag_sqr_i = 2*I-S + NO_UN, + mag_sqr_e = E, + mag_sqr_s = false | NO_UN, + mag_w = W+S + NO_UN, + mag_i = I+S + NO_UN, + mag_e = E, + mag_s = false | NO_UN, + to_fx_i = I + MAX_EXP, + to_fx_w = W + MAX_EXP - MIN_EXP, + to_fx_s = S, + to_i_w = AC_MAX(to_fx_i,1), + to_i_s = S + }; + typedef ac_float neg; + typedef ac_float mag_sqr; + typedef ac_float mag; + template + struct set { + enum { sum_w = W + ac::log2_ceil::val, sum_i = (sum_w-W) + I, sum_e = E, sum_s = S}; + typedef ac_float sum; + }; + typedef ac_fixed to_ac_fixed_t; + typedef ac_int to_ac_int_t; + }; + + template friend class ac_float; + + ac_float() { +#if defined(AC_DEFAULT_IN_RANGE) +#endif + } + ac_float(const ac_float &op) { + m = op.m; + e = op.e; + } + +private: + template + bool round(const ac_fixed &op2, bool assert_on_rounding=false) { + const bool rnd = Q!=AC_TRN && Q!=AC_TRN_ZERO && W2 > W; + bool rnd_ovfl = false; + m = 0; + if(rnd) { + ac_fixed m_1 = op2; + // overflow because of rounding would go from 001111 to 01000 (extra bit prevents it) + // change from 01000 to 00100 and store 0100 in m + rnd_ovfl = !m_1[W] & m_1[W-1]; + m_1[W-1] = m_1[W-1] & !rnd_ovfl; + m_1[W-2] = m_1[W-2] | rnd_ovfl; + m.set_slc(0, m_1.template slc(0)); + if(assert_on_rounding) + AC_ASSERT(m == op2, "Loss of precision due to Rounding"); + return rnd_ovfl; + } else { + ac_fixed m_0 = op2; + m.set_slc(0, m_0.template slc(0)); + return false; + } + } + + template + void assign_from(const ac_fixed &m2, int e2, bool sticky_bit, bool normalize, bool assert_on_rounding=false) { + const bool rnd = Q!=AC_TRN & Q!=AC_TRN_ZERO & W2 > W; + const bool need_rnd_bit = Q != AC_TRN; + const bool need_rem_bits = need_rnd_bit && Q != AC_RND; + + const int msb_min_power = I-1 + MIN_EXP; + const int msb_min_power2 = I2-1 + min_exp2; + const int msb_min_power_dif = msb_min_power - msb_min_power2; + // if > 0: target has additional negative exponent range + // subnormal may be further normalized (done even if normalize==false) + // if < 0: target has less negative exponent range + // mantissa may need to be shifted right + // in either case, if the source is unnormalized, + // normalization could take place + + const int msb_max_power = I-1 + MAX_EXP; + const int msb_max_power2 = I2-1 + max_exp2 + rnd; + const int msb_max_power_dif = msb_max_power - msb_max_power2; + + const bool
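+ // Worked illustration (sketch) of the carry-out case handled in round()
+ // above: rounding a mantissa pattern 0111 upward would produce 1000, which
+ // leaves the normalized range; round() instead stores the halved pattern
+ // 0100 and returns rnd_ovfl so that the caller increments the exponent.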
may_shift_right = msb_min_power_dif > 0; + const int max_right_shift = may_shift_right ? msb_min_power_dif : 0; + const int t_width = W2 + (W >= W2 ? AC_MIN(W-W2+may_shift_right, max_right_shift) : 0); + + int e_t = e2; + e_t += I2-I; + typedef ac_fixed op2_t; + op2_t op2 = m2; + int ls = 0; + bool r_zero; + if(normalize) { + bool all_sign; + ls = m2.leading_sign(all_sign); + r_zero = all_sign & !m2[0]; + } else if(msb_min_power_dif < 0 || msb_max_power_dif < 0 || W2 > W) { + // msb_min_power_dif < 0: src exponent less negative than trg exp represents + // opportunity to further normalize value in trg representation + // msb_max_power_dif < 0: max target exp is less than max src exp + // if un-normalized, exp may overflow, resulting in incorrect saturation + // normalization is needed for correctness + // W2 > W + // if un-normalized, extra bits may be incorrectly quantized away + const int msb_range_dif = AC_MAX(-msb_min_power_dif, -msb_max_power_dif); + const int msb_range_dif_norm_w = AC_MIN(msb_range_dif,W2-1); + const int extra_bits = AC_MAX(W2-W,0); + const int norm_w = AC_MAX(msb_range_dif_norm_w, extra_bits) + 1; + bool all_sign; + ls = m2.template slc(W2-norm_w).leading_sign(all_sign); + r_zero = all_sign & !m2[W2-1] & !(m2 << norm_w); + } else { + r_zero = !m2; + } + int actual_max_shift_left = (1 << (E-1)) + e_t; + if(may_shift_right && actual_max_shift_left < 0) { + const int shift_r_w = ac::nbits::val; + ac_int shift_r = -actual_max_shift_left; + if((1 << (E-1)) + min_exp2 + I2-I < 0 && need_rem_bits) { + op2_t shifted_out_bits = op2; + shifted_out_bits &= ~((~op2_t(0)) << shift_r); + sticky_bit |= !!shifted_out_bits; + } + op2 >>= shift_r; + e_t += shift_r; + } else { + bool shift_exponent_limited = ls >= actual_max_shift_left; + int shift_l = shift_exponent_limited ? actual_max_shift_left : (int) ls; + op2 <<= shift_l; + e_t = shift_exponent_limited ? MIN_EXP : e_t - ls; + } + ac_fixed r_pre_rnd = 0; + r_pre_rnd.set_slc(need_rem_bits, op2.template slc(0)); + if(need_rem_bits) + r_pre_rnd[0] = sticky_bit; + + bool shift_r1 = round(r_pre_rnd); + e_t = r_zero ? 0 : e_t + shift_r1; + if(!(e_t < 0) & !!(e_t >> E-1)) { + e = MAX_EXP; + m = m < 0 ?
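+ // Saturation on exponent overflow: at this point e_t no longer fits in the
+ // E-bit exponent, so the exponent pins at MAX_EXP and the mantissa is set
+ // to its most negative or most positive value, matching the sign of m.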
value(m) : value(m); + } else { + e = e_t; + } + } + +public: + template + ac_float(const AC_FL(2) &op, bool assert_on_overflow=false, bool assert_on_rounding=false) { + typedef AC_FL(2) fl2_t; + const int min_exp2 = fl2_t::MIN_EXP; + const int max_exp2 = fl2_t::MAX_EXP; + assign_from(op.m, op.e, false, false); + } + + ac_float(const ac_fixed &m2, const ac_int &e2, bool normalize=true) { + m = m2; + e = e2; + if(normalize) + this->normalize(); + else + e &= ac_int<1,true>(!!m); + } + + template + ac_float(const ac_fixed &m2, const ac_int &e2, bool normalize=true) { + enum { WF2 = WFX+!SFX, IF2 = IFX+!SFX }; + ac_float f(ac_fixed(m2), e2, normalize); + *this = f; + } + + template + ac_float(const ac_fixed &op) { + assign_from<0,0>(ac_fixed(op), 0, false, true); + } + + template + ac_float(const ac_int &op) { + *this = ac_fixed(op); + } + + inline ac_float( bool b ) { *this = (ac_int<1,false>) b; } + inline ac_float( char b ) { *this = (ac_int<8,true>) b; } + inline ac_float( signed char b ) { *this = (ac_int<8,true>) b; } + inline ac_float( unsigned char b ) { *this = (ac_int<8,false>) b; } + inline ac_float( signed short b ) { *this = (ac_int<16,true>) b; } + inline ac_float( unsigned short b ) { *this = (ac_int<16,false>) b; } + inline ac_float( signed int b ) { *this = (ac_int<32,true>) b; } + inline ac_float( unsigned int b ) { *this = (ac_int<32,false>) b; } + inline ac_float( signed long b ) { *this = (ac_int) b; } + inline ac_float( unsigned long b ) { *this = (ac_int) b; } + inline ac_float( Slong b ) { *this = (ac_int<64,true>) b; } + inline ac_float( Ulong b ) { *this = (ac_int<64,false>) b; } + + // Explicit conversion functions to ac_int and ac_fixed + inline typename rt_unary::to_ac_fixed_t to_ac_fixed() const { + typename rt_unary::to_ac_fixed_t r = m; + r <<= e; + return r; + } + inline typename rt_unary::to_ac_int_t to_ac_int() const { + return to_ac_fixed().to_ac_int(); + } + + // Explicit conversion functions to C built-in types ------------- + inline int to_int() const { return to_ac_int().to_int(); } + inline unsigned to_uint() const { return to_ac_int().to_uint(); } + inline long to_long() const { return (signed long) to_ac_int().to_int64(); } + inline unsigned long to_ulong() const { return (unsigned long) to_ac_int().to_uint64(); } + inline Slong to_int64() const { return to_ac_int().to_int64(); } + inline Ulong to_uint64() const { return to_ac_int().to_uint64(); } + inline float to_float() const { return ldexpf(m.to_double(), exp()); } + inline double to_double() const { return ldexp(m.to_double(), exp()); } + + const ac_fixed mantissa() const { return m; } + const ac_int exp() const { return e; } + bool normalize() { + bool all_sign; + int ls = m.leading_sign(all_sign); + bool m_zero = all_sign & !m[0]; + const int max_shift_left = (1 << (E-1)) + e; + bool normal = ls <= max_shift_left; + int shift_l = normal ? 
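+ // Shift out the redundant sign bits, but never so far that the exponent
+ // would fall below MIN_EXP; 'normal' reports whether the mantissa could be
+ // fully normalized. A zero mantissa forces the exponent to 0.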
ls : max_shift_left; + m <<= shift_l; + e = ac_int<1,true>(!m_zero) & (e - shift_l); + return normal; + } + + ac_float( double d, bool assert_on_overflow=false, bool assert_on_rounding=false ) { + enum { I_EXT = AC_MAX(I,1), W_EXT = ac_private::ac_float_cdouble_t::width + I_EXT - 1, }; + ac_private::ac_float_cdouble_t t = ac_private::double_to_ac_float(d); + ac_float r(t, assert_on_overflow, assert_on_rounding); + *this = r; + } + + ac_float( float f, bool assert_on_overflow=false, bool assert_on_rounding=false ) { + enum { I_EXT = AC_MAX(I,1), W_EXT = ac_private::ac_float_cfloat_t::width + I_EXT - 1, }; + ac_private::ac_float_cfloat_t t = ac_private::float_to_ac_float(f); + ac_float r(t, assert_on_overflow, assert_on_rounding); + *this = r; + } + + template + bool compare(const AC_FL(2) &op2, bool *gt) const { + typedef ac_fixed fx2_t; + typedef typename ac_fixed::template rt_T< fx2_t >::logic fx_t; + typedef ac_fixed fxu_t; + + fx2_t op2_m_0; + op2_m_0.set_slc(0, op2.m.template slc(0)); + + fx_t op1_m = m; + fx_t op2_m = op2_m_0; + int e_dif = exp() - op2.exp() + I - I2; + bool op2_m_neg = op2_m[fx_t::width-1]; + fx_t out_bits = op2_m ^ ((op2_m_neg & e_dif < 0) ? ~fx_t(0) : fx_t(0)); + out_bits &= ~(fxu_t(~fxu_t(0)) << e_dif); + op2_m >>= e_dif; + bool overflow = e_dif < 0 & !!out_bits | op2_m_neg ^ op2_m[fx_t::width-1]; + + *gt = overflow & op2_m_neg | !overflow & op1_m > op2_m; + bool eq = op1_m == op2_m & !overflow & !out_bits; + return eq; + } + + template + void plus_minus(const AC_FL(2) &op2, AC_FL(R) &r, bool sub=false) const { + typedef AC_FL(2) op2_t; + enum { IT = AC_MAX(I,I2) }; + typedef ac_fixed fx1_t; + typedef ac_fixed fx2_t; + // covers fx1_t and r mantissas (adds additional LSBs if WR > W) + typedef typename fx1_t::template rt_T< ac_fixed >::logic fx1r_t; + // covers fx2_t and r mantissas (adds additional LSBs if WR > W2) + typedef typename fx2_t::template rt_T< ac_fixed >::logic fx2r_t; + // mt_t adds one integer bit for the plus + // op1_m, op2_m, op_sl, sticky_bits + typedef typename fx1r_t::template rt_T::plus mt_t; + + const bool round_bit_needed = QR != AC_TRN; + const bool remaining_bits_needed = !(QR == AC_TRN || QR == AC_RND); + + const int w_r_with_round_bits = WR + round_bit_needed; + + // naming: sn = subnormal, n = normal, wc = worst case + // worst case (wc) normalize is when one operand has smallest subnormal + // and other operand is shifted right so that its MSB lines up with LSB of subnormal + const int power_smallest_sn1 = I - W - (1 << (E-1)); + const int power_smallest_sn2 = I2 - W2 - (1 << (E2-1)); + const int power_smallest_sn_dif1 = AC_MAX(0,power_smallest_sn2 - power_smallest_sn1); + const int power_smallest_sn_dif2 = AC_MAX(0,power_smallest_sn1 - power_smallest_sn2); + const int wc_norm_shift1 = W2-1 + AC_MIN(power_smallest_sn_dif1, W-1); + const int wc_norm_shift2 = W-1 + AC_MIN(power_smallest_sn_dif2, W2-1); + const int wc_sn_norm_shift = AC_MAX(wc_norm_shift1, wc_norm_shift2); + const int w_sn_overlap = wc_sn_norm_shift + 1; + + // cases when one operand is subnormal and other is shifted right and does not overlap bits + // subnormal op could be normalized by width-1 bits + const int w_sn_no_overlap1 = W + AC_MIN(w_r_with_round_bits, power_smallest_sn_dif2); + const int w_sn_no_overlap2 = W2 + AC_MIN(w_r_with_round_bits, power_smallest_sn_dif1); + const int w_sn_no_overlap = AC_MAX(w_sn_no_overlap1, w_sn_no_overlap2); + + const int w_sn = AC_MAX(w_sn_overlap, w_sn_no_overlap); + + // For example 0100 + (1000 0001 >> 1) = 0000 0000 1, 
wc_n_norm_shift = max(4,8) + const int msb0h1 = I-1 + (int) MAX_EXP; + const int msb1h1 = msb0h1-1; + const int msb0l1 = I-1 + (int) MIN_EXP; + const int msb1l1 = msb0h1-1; + const int msb0h2 = I2-1 + (int) op2_t::MAX_EXP; + const int msb1h2 = msb0h2-1; + const int msb0l2 = I2-1 + (int) op2_t::MIN_EXP; + const int msb1l2 = msb0h2-1; + // bit W-1 overlap with bit W2-2 + const bool msb_overlap1 = msb1h2 >= msb0h1 && msb0h1 <= msb1l2 + || msb1h2 >= msb0l1 && msb0l1 <= msb1l2 + || msb0h1 >= msb1h2 && msb1h2 >= msb0l1; + // bit W2-1 overlap with bit W1-2 + const bool msb_overlap2 = msb1h1 >= msb0h2 && msb0h2 <= msb1l1 + || msb1h1 >= msb0l2 && msb0l2 <= msb1l1 + || msb0h2 >= msb1h1 && msb1h1 >= msb0l2; + const bool msb_overlap = msb_overlap1 || msb_overlap2; + const int wc_n_norm_shift = AC_MAX(W,W2); + const int w_n_msb_overlap = msb_overlap ? wc_n_norm_shift + 1 : 0; + // addition of two numbers of different sign can result in a normalization by 1 (therefore + 1) + const int w_n_no_msb_overlap = w_r_with_round_bits + 1; + const int w_n = AC_MAX(w_n_msb_overlap, w_n_no_msb_overlap); + + // +1 is to prevent overflow during addition + const int tr_t_width = AC_MAX(w_n, w_sn) + 1; + typedef ac_fixed add_t; + + const int min_E = (int) MIN_EXP + I-IT; + const int min_E2 = (int) AC_FL(2)::MIN_EXP + I2-IT; + const int min_ET = AC_MIN(min_E, min_E2); + + const int max_E = (int) MAX_EXP + I-IT; + const int max_E2 = (int) AC_FL(2)::MAX_EXP + I2-IT; + const int max_ET = AC_MAX(max_E, max_E2); + + ac_fixed op1_m_0 = m; + mt_t op1_m = 0; + op1_m.set_slc(0, op1_m_0.template slc(0)); + int op1_e = exp() + I-IT; + + ac_fixed op2_m_0 = op2.m; + mt_t op2_m = 0; + op2_m.set_slc(0, op2_m_0.template slc(0)); + if(sub) + op2_m = -op2_m; + int op2_e = op2.exp() + I2-IT; + + bool op1_zero = operator !(); + bool op2_zero = !op2; + int e_dif = op1_e - op2_e; + bool e1_lt_e2 = e_dif < 0; + e_dif = (op1_zero | op2_zero) ? 0 : e1_lt_e2 ? -e_dif : e_dif; + + add_t op_lshift = e1_lt_e2 ? op1_m : op2_m; + mt_t op_no_shift = e1_lt_e2 ? op2_m : op1_m; + + bool sticky_bit = false; + if(remaining_bits_needed) { + mt_t shifted_out_bits = op_lshift; + // bits that are shifted out of a add_t (does not include potential 3 spare bits) + shifted_out_bits &= ~((~add_t(0)) << e_dif); + sticky_bit = !!shifted_out_bits; + } + op_lshift >>= e_dif; + + add_t add_r = op_lshift + op_no_shift; + int e_t = (e1_lt_e2 & !op2_zero | op1_zero ? op2_e : op1_e); + + r.template assign_from(add_r, e_t, sticky_bit, true); + } + + template + ac_float add(const AC_FL(1) &op1, const AC_FL(2) &op2) { + op1.plus_minus(op2, *this); + return *this; + } + + template + ac_float sub(const AC_FL(1) &op1, const AC_FL(2) &op2) { + op1.plus_minus(op2, *this, true); + return *this; + } + + typename rt_unary::neg abs() const { + typedef typename rt_unary::neg r_t; + r_t r; + r.m = is_neg() ? -m : r_t::mant_t(m); + r.e = e; + return r; + } + +#ifdef __AC_FLOAT_ENABLE_ALPHA + // These will be changed!!! 
For now only enable to explore integration with ac_complex + template + typename rt< AC_FL_TV0(2) >::plus operator +(const AC_FL(2) &op2) const { + typename rt< AC_FL_TV0(2) >::plus r; + plus_minus(op2, r); + return r; + } + template + typename rt< AC_FL_TV0(2) >::minus operator -(const AC_FL(2) &op2) const { + typename rt< AC_FL_TV0(2) >::minus r; + plus_minus(op2, r, true); + return r; + } +#endif + + template + typename rt< AC_FL_TV0(2) >::mult operator *(const AC_FL(2) &op2) const { + typedef typename rt< AC_FL_TV0(2) >::mult r_t; + r_t r(m*op2.m, exp()+op2.exp(), false); + return r; + } + + template + typename rt< AC_FL_TV0(2) >::div operator /(const AC_FL(2) &op2) const { + typename rt< AC_FL_TV0(2) >::div r(m/op2.m, exp()-op2.exp()); + return r; + } + template + ac_float &operator +=(const AC_FL(2) &op2) { + ac_float r; + plus_minus(op2, r); + *this = r; + return *this; + } + template + ac_float &operator -=(const AC_FL(2) &op2) { + ac_float r; + plus_minus(op2, r, true); + *this = r; + return *this; + } + template + ac_float &operator *=(const AC_FL(2) &op2) { + *this = *this * op2; + return *this; + } + template + ac_float &operator /=(const AC_FL(2) &op2) { + *this = *this / op2; + return *this; + } + ac_float operator + () const { + return *this; + } + typename rt_unary::neg operator - () const { + typename rt_unary::neg r; + r.m = -m; + r.e = e; + return r; + } + bool operator ! () const { + return !m; + } + + // Shift -------------------------------------------------------------------- + template + typename rt_i::lshift operator << ( const ac_int &op2 ) const { + typename rt_i::lshift r; + r.m = m; + r.e = e + op2; + return r; + } + template + typename rt_i::rshift operator >> ( const ac_int &op2 ) const { + typename rt_i::rshift r; + r.m = m; + r.e = e - op2; + return r; + } + // Shift assign ------------------------------------------------------------- + template + ac_float &operator <<= ( const ac_int &op2 ) { + *this = operator << (op2); + return *this; + } + template + ac_float &operator >>= ( const ac_int &op2 ) { + *this = operator >> (op2); + return *this; + } + + template + bool operator == (const AC_FL(2) &f) const { + bool gt; + return compare(f, >); + } + template + bool operator != (const AC_FL(2) &f) const { + return !operator == (f); + } + template + bool operator < (const AC_FL(2) &f) const { + bool gt; + bool eq = compare(f, >); + return !(eq | gt); + } + template + bool operator >= (const AC_FL(2) &f) const { + return !operator < (f); + } + template + bool operator > (const AC_FL(2) &f) const { + bool gt; + compare(f, >); + return gt; + } + template + bool operator <= (const AC_FL(2) &f) const { + return !operator > (f); + } + + inline std::string to_string(ac_base_mode base_rep, bool sign_mag = false, bool hw=true) const { + // TODO: printing decimal with exponent + if(!hw) { + ac_fixed mantissa; + mantissa.set_slc(0, m.template slc(0)); + std::string r = mantissa.to_string(base_rep, sign_mag); + r += "e2"; + r += (e + I).to_string(base_rep, sign_mag | base_rep == AC_DEC); + return r; + } else { + std::string r = m.to_string(base_rep, sign_mag); + if(base_rep != AC_DEC) + r += "_"; + r += "e2"; + if(base_rep != AC_DEC) + r += "_"; + if(E) + r += e.to_string(base_rep, sign_mag | base_rep == AC_DEC); + else + r += "0"; + return r; + } + } + + inline static std::string type_name() { + const char *tf[] = {"false", "true" }; + const char *q[] = {"AC_TRN", "AC_RND", "AC_TRN_ZERO", "AC_RND_ZERO", "AC_RND_INF", "AC_RND_MIN_INF", "AC_RND_CONV" }; + std::string r = 
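+ // Reading the output of to_string() above (sketch): "e2" separates mantissa
+ // and exponent, and the exponent scales by powers of two. A value holding
+ // m = 1.5, e = 3 prints along the lines of "1.5e23", meaning 1.5 * 2^3 = 12,
+ // not 1.5 * 10^23.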
"ac_float<"; + r += ac_int<32,true>(W).to_string(AC_DEC) + ','; + r += ac_int<32,true>(I).to_string(AC_DEC) + ','; + r += ac_int<32,true>(E).to_string(AC_DEC) + ','; + r += tf[S]; + r += ','; + r += q[Q]; + r += '>'; + return r; + } + + template + inline ac_float &set_val() { + m.template set_val(); + if(V == AC_VAL_MIN) + e.template set_val(); + else if(V == AC_VAL_QUANTUM) + e.template set_val(); + else + e.template set_val(); + return *this; + } +}; + +namespace ac_private { + template + bool ac_fpclassify(T x, bool &inf) { + bool nan = !(x==x); + if(!nan) { + T d = x - x; + inf = !(d==d); + } + return nan; + } + + inline ac_float_cdouble_t double_to_ac_float(double d) { + typedef ac_float_cdouble_t r_t; +#ifndef __SYNTHESIS__ + bool inf; + bool nan = ac_fpclassify(d, inf); + if(nan) + AC_ASSERT(0, "In conversion from double to ac_float: double is NaN"); + else if(inf) + AC_ASSERT(0, "In conversion from double to ac_float: double is Infinite"); +#endif + r_t::exp_t exp; + r_t::mant_t mant = ac::frexp_d(d, exp); + return r_t(mant, exp, false); + } + + inline ac_float_cfloat_t float_to_ac_float(float f) { + typedef ac_float_cfloat_t r_t; +#ifndef __SYNTHESIS__ + bool inf; + bool nan = ac_fpclassify(f, inf); + if(nan) + AC_ASSERT(0, "In conversion from float to ac_float: float is NaN"); + else if(inf) + AC_ASSERT(0, "In conversion from float to ac_float: float is Infinite"); +#endif + r_t::exp_t exp; + r_t::mant_t mant = ac::frexp_f(f, exp); + return r_t(mant, exp, false); + } +}; + +namespace ac { + template + struct ac_float_represent { + typedef typename ac_fixed_represent::type fx_t; + typedef ac_float type; + }; + template<> struct ac_float_represent { + typedef ac_private::ac_float_cfloat_t type; + }; + template<> struct ac_float_represent { + typedef ac_private::ac_float_cdouble_t type; + }; +} + +namespace ac_private { + // with T == ac_float + template< AC_FL_T0(2) > + struct rt_ac_float_T< AC_FL0(2) > { + typedef AC_FL0(2) fl2_t; + template< AC_FL_T0() > + struct op1 { + typedef AC_FL0() fl_t; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::plus plus; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; + typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; + typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; + }; + }; + // with T == ac_fixed + template + struct rt_ac_float_T< ac_fixed > { + // For now E2 > 0 + enum { E2 = 1, S2 = true, W2 = WFX + !SFX, I2 = IFX + !SFX }; + typedef AC_FL0(2) fl2_t; + template< AC_FL_T0() > + struct op1 { + typedef AC_FL0() fl_t; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::plus plus; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; + typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; + typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; + }; + }; + // with T == ac_int + template + struct rt_ac_float_T< ac_int > { + // For now E2 > 0 + enum { E2 = 1, S2 = true, I2 = WI + !SI, W2 = I2 }; + typedef AC_FL0(2) fl2_t; + template< AC_FL_T0() > + struct op1 { + typedef AC_FL0() fl_t; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; + typedef typename 
fl_t::template rt< AC_FL_TV0(2) >::plus plus; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; + typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; + typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; + }; + }; + + // Multiplication is optimizable, general operator +/- is not yet supported + template + struct rt_ac_float_T< c_type > { + // For now E2 > 0 + enum { SCT = c_type_params::S, S2 = true, W2 = c_type_params::W + !SCT, I2 = c_type_params::I + !SCT, E2 = AC_MAX(1, c_type_params::E) }; + typedef AC_FL0(2) fl2_t; + template< AC_FL_T0() > + struct op1 { + typedef AC_FL0() fl_t; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::plus plus; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; + typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; + typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; + }; + }; +} + +// Stream -------------------------------------------------------------------- + +#ifndef __SYNTHESIS__ +template +inline std::ostream& operator << (std::ostream &os, const AC_FL() &x) { + os << x.to_string(AC_DEC); + return os; +} +#endif + +#define FL_BIN_OP_WITH_CTYPE(BIN_OP, C_TYPE, RTYPE) \ + template< AC_FL_T() > \ + inline typename AC_FL()::template rt_T2::RTYPE operator BIN_OP ( C_TYPE c_op, const AC_FL() &op) { \ + typedef typename ac::template ac_float_represent::type fl2_t; \ + return fl2_t(c_op).operator BIN_OP (op); \ + } \ + template< AC_FL_T() > \ + inline typename AC_FL()::template rt_T::RTYPE operator BIN_OP ( const AC_FL() &op, C_TYPE c_op) { \ + typedef typename ac::template ac_float_represent::type fl2_t; \ + return op.operator BIN_OP (fl2_t(c_op)); \ + } + +#define FL_REL_OP_WITH_CTYPE(REL_OP, C_TYPE) \ + template< AC_FL_T() > \ + inline bool operator REL_OP ( const AC_FL() &op, C_TYPE op2) { \ + typedef typename ac::template ac_float_represent::type fl2_t; \ + return op.operator REL_OP (fl2_t(op2)); \ + } \ + template< AC_FL_T() > \ + inline bool operator REL_OP ( C_TYPE op2, const AC_FL() &op) { \ + typedef typename ac::template ac_float_represent::type fl2_t; \ + return fl2_t(op2).operator REL_OP (op); \ + } + +#define FL_ASSIGN_OP_WITH_CTYPE_2(ASSIGN_OP, C_TYPE) \ + template< AC_FL_T() > \ + inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, C_TYPE op2) { \ + typedef typename ac::template ac_float_represent::type fl2_t; \ + return op.operator ASSIGN_OP (fl2_t(op2)); \ + } + +#ifdef __AC_FLOAT_ENABLE_ALPHA +#define FL_BIN_OP_WITH_CTYPE_ALPHA(C_TYPE) \ + FL_BIN_OP_WITH_CTYPE(+, C_TYPE, plus) \ + FL_BIN_OP_WITH_CTYPE(-, C_TYPE, minus) +#else +#define FL_BIN_OP_WITH_CTYPE_ALPHA(C_TYPE) +#endif + +#define FL_OPS_WITH_CTYPE(C_TYPE) \ + FL_BIN_OP_WITH_CTYPE_ALPHA(C_TYPE) \ + FL_BIN_OP_WITH_CTYPE(*, C_TYPE, mult) \ + FL_BIN_OP_WITH_CTYPE(/, C_TYPE, div) \ + \ + FL_REL_OP_WITH_CTYPE(==, C_TYPE) \ + FL_REL_OP_WITH_CTYPE(!=, C_TYPE) \ + FL_REL_OP_WITH_CTYPE(>, C_TYPE) \ + FL_REL_OP_WITH_CTYPE(>=, C_TYPE) \ + FL_REL_OP_WITH_CTYPE(<, C_TYPE) \ + FL_REL_OP_WITH_CTYPE(<=, C_TYPE) \ + \ + FL_ASSIGN_OP_WITH_CTYPE_2(+=, C_TYPE) \ + FL_ASSIGN_OP_WITH_CTYPE_2(-=, C_TYPE) \ + FL_ASSIGN_OP_WITH_CTYPE_2(*=, C_TYPE) \ + FL_ASSIGN_OP_WITH_CTYPE_2(/=, C_TYPE) + +#define 
FL_SHIFT_OP_WITH_INT_CTYPE(BIN_OP, C_TYPE, RTYPE) \ + template< AC_FL_T() > \ + inline typename AC_FL()::template rt_i< ac_private::c_type_params::W, ac_private::c_type_params::S >::RTYPE operator BIN_OP ( const AC_FL() &op, C_TYPE i_op) { \ + typedef typename ac::template ac_int_represent::type i_t; \ + return op.operator BIN_OP (i_t(i_op)); \ + } + +#define FL_SHIFT_ASSIGN_OP_WITH_INT_CTYPE(ASSIGN_OP, C_TYPE) \ + template< AC_FL_T() > \ + inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, C_TYPE i_op) { \ + typedef typename ac::template ac_int_represent::type i_t; \ + return op.operator ASSIGN_OP (i_t(i_op)); \ + } + +#define FL_SHIFT_OPS_WITH_INT_CTYPE(C_TYPE) \ + FL_SHIFT_OP_WITH_INT_CTYPE(>>, C_TYPE, rshift) \ + FL_SHIFT_OP_WITH_INT_CTYPE(<<, C_TYPE, lshift) \ + FL_SHIFT_ASSIGN_OP_WITH_INT_CTYPE(>>=, C_TYPE) \ + FL_SHIFT_ASSIGN_OP_WITH_INT_CTYPE(<<=, C_TYPE) + +#define FL_OPS_WITH_INT_CTYPE(C_TYPE) \ + FL_OPS_WITH_CTYPE(C_TYPE) \ + FL_SHIFT_OPS_WITH_INT_CTYPE(C_TYPE) + +// --------------------------------------- End of Macros for Binary Operators with C Floats + + // Binary Operators with C Floats -------------------------------------------- + FL_OPS_WITH_CTYPE(float) + FL_OPS_WITH_CTYPE(double) + FL_OPS_WITH_INT_CTYPE(bool) + FL_OPS_WITH_INT_CTYPE(char) + FL_OPS_WITH_INT_CTYPE(signed char) + FL_OPS_WITH_INT_CTYPE(unsigned char) + FL_OPS_WITH_INT_CTYPE(short) + FL_OPS_WITH_INT_CTYPE(unsigned short) + FL_OPS_WITH_INT_CTYPE(int) + FL_OPS_WITH_INT_CTYPE(unsigned int) + FL_OPS_WITH_INT_CTYPE(long) + FL_OPS_WITH_INT_CTYPE(unsigned long) + FL_OPS_WITH_INT_CTYPE(Slong) + FL_OPS_WITH_INT_CTYPE(Ulong) + // -------------------------------------- End of Binary Operators with C Floats + +// Macros for Binary Operators with ac_int -------------------------------------------- + +#define FL_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ + template< AC_FL_T(), int WI, bool SI> \ + inline typename AC_FL()::template rt_T2< ac_int >::RTYPE operator BIN_OP ( const ac_int &i_op, const AC_FL() &op) { \ + typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ + return fl2_t(i_op).operator BIN_OP (op); \ + } + +#define FL_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) \ + template< AC_FL_T(), int WI, bool SI> \ + inline typename AC_FL()::template rt_T2< ac_int >::RTYPE operator BIN_OP ( const AC_FL() &op, const ac_int &i_op) { \ + typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ + return op.operator BIN_OP (fl2_t(i_op)); \ + } + +#define FL_BIN_OP_WITH_AC_INT(BIN_OP, RTYPE) \ + FL_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ + FL_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) + +#define FL_REL_OP_WITH_AC_INT(REL_OP) \ + template< AC_FL_T(), int WI, bool SI> \ + inline bool operator REL_OP ( const AC_FL() &op, const ac_int &op2) { \ + typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ + return op.operator REL_OP (fl2_t(op2)); \ + } \ + template< AC_FL_T(), int WI, bool SI> \ + inline bool operator REL_OP ( ac_int &op2, const AC_FL() &op) { \ + typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ + return fl2_t(op2).operator REL_OP (op); \ + } + +#define FL_ASSIGN_OP_WITH_AC_INT(ASSIGN_OP) \ + template< AC_FL_T(), int WI, bool SI> \ + inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, const ac_int &op2) { \ + typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ + return op.operator ASSIGN_OP (fl2_t(op2)); \ + } + +// -------------------------------------------- End of Macros for Binary Operators with ac_int + + // Binary Operators with ac_int 
-------------------------------------------- +#ifdef __AC_FLOAT_ENABLE_ALPHA + FL_BIN_OP_WITH_AC_INT(+, plus) + FL_BIN_OP_WITH_AC_INT(-, minus) +#endif + FL_BIN_OP_WITH_AC_INT(*, mult) + FL_BIN_OP_WITH_AC_INT(/, div) + + FL_REL_OP_WITH_AC_INT(==) + FL_REL_OP_WITH_AC_INT(!=) + FL_REL_OP_WITH_AC_INT(>) + FL_REL_OP_WITH_AC_INT(>=) + FL_REL_OP_WITH_AC_INT(<) + FL_REL_OP_WITH_AC_INT(<=) + + FL_ASSIGN_OP_WITH_AC_INT(+=) + FL_ASSIGN_OP_WITH_AC_INT(-=) + FL_ASSIGN_OP_WITH_AC_INT(*=) + FL_ASSIGN_OP_WITH_AC_INT(/=) + FL_ASSIGN_OP_WITH_AC_INT(%=) + // -------------------------------------- End of Binary Operators with ac_int + +// Macros for Binary Operators with ac_fixed -------------------------------------------- + +#define FL_BIN_OP_WITH_AC_FIXED_1(BIN_OP, RTYPE) \ + template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ + inline typename AC_FL()::template rt_T2< ac_fixed >::RTYPE operator BIN_OP ( const ac_fixed &f_op, const AC_FL() &op) { \ + typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ + return fl2_t(f_op).operator BIN_OP (op); \ + } + +#define FL_BIN_OP_WITH_AC_FIXED_2(BIN_OP, RTYPE) \ + template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ + inline typename AC_FL()::template rt_T2< ac_fixed >::RTYPE operator BIN_OP ( const AC_FL() &op, const ac_fixed &f_op) { \ + typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ + return op.operator BIN_OP (fl2_t(f_op)); \ + } + +#define FL_BIN_OP_WITH_AC_FIXED(BIN_OP, RTYPE) \ + FL_BIN_OP_WITH_AC_FIXED_1(BIN_OP, RTYPE) \ + FL_BIN_OP_WITH_AC_FIXED_2(BIN_OP, RTYPE) + +#define FL_REL_OP_WITH_AC_FIXED(REL_OP) \ + template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ + inline bool operator REL_OP ( const AC_FL() &op, const ac_fixed &op2) { \ + typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ + return op.operator REL_OP (fl2_t(op2)); \ + } \ + template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ + inline bool operator REL_OP ( ac_fixed &op2, const AC_FL() &op) { \ + typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ + return fl2_t(op2).operator REL_OP (op); \ + } + +#define FL_ASSIGN_OP_WITH_AC_FIXED(ASSIGN_OP) \ + template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ + inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, const ac_fixed &op2) { \ + typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ + return op.operator ASSIGN_OP (fl2_t(op2)); \ + } + +// -------------------------------------------- End of Macros for Binary Operators with ac_fixed + + // Binary Operators with ac_fixed -------------------------------------------- +#ifdef __AC_FLOAT_ENABLE_ALPHA + FL_BIN_OP_WITH_AC_FIXED(+, plus) + FL_BIN_OP_WITH_AC_FIXED(-, minus) +#endif + FL_BIN_OP_WITH_AC_FIXED(*, mult) + FL_BIN_OP_WITH_AC_FIXED(/, div) + + FL_REL_OP_WITH_AC_FIXED(==) + FL_REL_OP_WITH_AC_FIXED(!=) + FL_REL_OP_WITH_AC_FIXED(>) + FL_REL_OP_WITH_AC_FIXED(>=) + FL_REL_OP_WITH_AC_FIXED(<) + FL_REL_OP_WITH_AC_FIXED(<=) + + FL_ASSIGN_OP_WITH_AC_FIXED(+=) + FL_ASSIGN_OP_WITH_AC_FIXED(-=) + FL_ASSIGN_OP_WITH_AC_FIXED(*=) + FL_ASSIGN_OP_WITH_AC_FIXED(/=) + // -------------------------------------- End of Binary Operators with ac_fixed + +// Global templatized functions for easy initialization to special values +template +inline AC_FL() value( AC_FL() ) { + AC_FL() r; + return r.template set_val(); +} + +namespace ac { +// function to initialize (or uninitialize) arrays 
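+// Usage sketch (illustrative element type): fill a buffer with a special
+// value, or mark it don't-care for simulation, via the helper defined below.
+//   ac_float<25,2,8> buf[16];
+//   ac::init_array<AC_VAL_0>(buf, 16);   // every element becomes zero
+//   ac::init_array<AC_VAL_DC>(buf, 16);  // don't-care contents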
+  template<ac_special_val V, AC_FL_T()>
+  inline bool init_array( AC_FL() *a, int n) {
+    AC_FL0() t;
+    t.template set_val<V>();
+    for(int i=0; i < n; i++)
+      a[i] = t;
+    return true;
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+#if (defined(_MSC_VER) && !defined(__EDG__))
+#pragma warning( pop )
+#endif
+#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__))
+#pragma GCC diagnostic pop
+#endif
+#if defined(__clang__)
+#pragma clang diagnostic pop
+#endif
+
+#ifdef __AC_NAMESPACE
+}
+#endif
+
+#endif // __AC_FLOAT_H
diff --git a/hls4ml/templates/quartus/ac_types/ac_int.h b/hls4ml/templates/quartus/ac_types/ac_int.h
index 4651339169..bb9542642e 100644
--- a/hls4ml/templates/quartus/ac_types/ac_int.h
+++ b/hls4ml/templates/quartus/ac_types/ac_int.h
@@ -1,3099 +1,3099 @@
-/**************************************************************************
- *                                                                        *
- *  Algorithmic C (tm) Datatypes                                          *
- *                                                                        *
- *  Software Version: 4.0                                                 *
- *                                                                        *
- *  Release Date    : Sat Jun 13 12:35:18 PDT 2020                        *
- *  Release Type    : Production Release                                  *
- *  Release Build   : 4.0.0                                               *
- *                                                                        *
- *  Copyright 2004-2020, Mentor Graphics Corporation,                     *
- *                                                                        *
- *  All Rights Reserved.                                                  *
- *                                                                        *
- **************************************************************************
- *  Licensed under the Apache License, Version 2.0 (the "License");       *
- *  you may not use this file except in compliance with the License.      *
- *  You may obtain a copy of the License at                               *
- *                                                                        *
- *      http://www.apache.org/licenses/LICENSE-2.0                        *
- *                                                                        *
- *  Unless required by applicable law or agreed to in writing, software   *
- *  distributed under the License is distributed on an "AS IS" BASIS,     *
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *
- *  implied.                                                              *
- *  See the License for the specific language governing permissions and   *
- *  limitations under the License.                                        *
- **************************************************************************
- *                                                                        *
- *  The most recent version of this package is available at github.       *
- *                                                                        *
- *************************************************************************/
-
-/*
-//  Source:         ac_int.h
-//  Description:    fast arbitrary-length bit-accurate integer types:
-//                    - unsigned integer of length W: ac_int<W,false>
-//                    - signed integer of length W:   ac_int<W,true>
-//  Author:         Andres Takach, Ph.D.
-//  Notes:
-//   - C++ Runtime: important to use optimization flag (for example -O3)
-//
-//   - Compiler support: recent GNU compilers are required for correct
-//     template compilation
-//
-//   - Most frequent migration issues:
-//      - need to cast to common type when using question mark operator:
-//          (a < 0) ? -a : a;  // a is ac_int<W,true>
-//        change to:
-//          (a < 0) ? -a : (ac_int<W+1,true>) a;
-//        or
-//          (a < 0) ? (ac_int<W+1,true>) -a : (ac_int<W+1,true>) a;
-//
-//      - left shift is not arithmetic ("a<<n" has same bitwidth as "a"):
-//          ac_int<W+1,false> b = a << 1;  // a is ac_int<W,false>
-//        is not equivalent to b=2*a. In order to get 2*a behavior change to:
-//          ac_int<W+1,false> b = (ac_int<W+1,false>)a << 1;
-//
-//   - only static length read/write slices are supported:
-//      - read:  x.slc<4>(k) => returns ac_int for 4-bit slice x(4+k-1 DOWNTO k)
-//      - write: x.set_slc(k,y) = writes bits of y to x starting at index k
-*/
-
-#ifndef __AC_INT_H
-#define __AC_INT_H
-
-#define AC_VERSION 3
-#define AC_VERSION_MINOR 9
-
-#ifndef __cplusplus
-#error C++ is required to include this header file
-#endif
-
-#if (defined(__GNUC__) && __GNUC__ < 3 && !defined(__EDG__))
-#error GCC version 3 or greater is required to include this header file
-#endif
-
-#if (defined(_MSC_VER) && _MSC_VER < 1400 && !defined(__EDG__))
-#error Microsoft Visual Studio 8 or newer is required to include this header file
-#endif
-
-#if (defined(_MSC_VER) && !defined(__EDG__))
-#pragma warning( push )
-#pragma warning( disable: 4127 4100 4244 4307 4310 4365 4514 4554 4706 4800 )
-#endif
-
-// for safety
-#if (defined(N) || defined(N2))
-#error One or more of the following is defined: N, N2. Definition conflicts with their usage as template parameters.
-#error DO NOT use defines before including third party header files.
-#endif
-
-// for safety
-#if (defined(W) || defined(I) || defined(S) || defined(W2) || defined(I2) || defined(S2))
-#error One or more of the following is defined: W, I, S, W2, I2, S2. Definition conflicts with their usage as template parameters.
-#error DO NOT use defines before including third party header files.
-#endif
-
-#if defined(true)
-#warning The C++ keyword true is defined which may result in subtle compilation problems. Undefining it.
-#undef true
-#endif
-#if defined(false)
-#warning The C++ keyword false is defined which may result in subtle compilation problems. Undefining it.
-#undef false
-#endif
-
-#ifndef __ASSERT_H__
-#define __ASSERT_H__
-#include <assert.h>
-#endif
-#include <limits>
-#ifndef AC_USER_DEFINED_ASSERT
-#include <iostream>
-#else
-#include <ostream>
-#endif
-#include <math.h>
-#include <string>
-
-#ifndef __SYNTHESIS__
-#ifndef __AC_INT_UTILITY_BASE
-#define __AC_INT_UTILITY_BASE
-#endif
-
-#endif
-
-#ifdef __AC_NAMESPACE
-namespace __AC_NAMESPACE {
-#endif
-
-#define AC_MAX(a,b) ((a) > (b) ? (a) : (b))
-#define AC_MIN(a,b) ((a) < (b) ? (a) : (b))
-#define AC_ABS(a) ((a) < 0 ? -(a) : (a))
-
-#if defined(_MSC_VER)
-typedef unsigned __int64 Ulong;
-typedef signed __int64 Slong;
-#else
-typedef unsigned long long Ulong;
-typedef signed long long Slong;
-#endif
-
-enum ac_base_mode { AC_BIN=2, AC_OCT=8, AC_DEC=10, AC_HEX=16 };
-enum ac_special_val {AC_VAL_DC, AC_VAL_0, AC_VAL_MIN, AC_VAL_MAX, AC_VAL_QUANTUM};
-
-template<int W, bool S=true> class ac_int;
-
-namespace ac_private {
-#if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS)
-#pragma builtin
-#endif
-
-  enum {long_w = std::numeric_limits<unsigned long>::digits};
-  const unsigned int all_ones = (unsigned) ~0;
-
-  // PRIVATE FUNCTIONS in namespace: for implementing ac_int/ac_fixed
-
-#ifndef __SYNTHESIS__
-  inline double mgc_floor(double d) { return floor(d); }
-#else
-  inline double mgc_floor(double d) { return 0.0; }
-#endif
-
-  #define AC_ASSERT(cond, msg) ac_private::ac_assert(cond, __FILE__, __LINE__, msg)
-  inline void ac_assert(bool condition, const char *file=0, int line=0, const char *msg=0) {
-    #ifndef __SYNTHESIS__
-    #ifndef AC_USER_DEFINED_ASSERT
-    if(!condition) {
-      std::cerr << "Assert";
-      if(file)
-        std::cerr << " in file " << file << ":" << line;
-      if(msg)
-        std::cerr << " " << msg;
-      std::cerr << std::endl;
-      assert(0);
-    }
-    #else
-    AC_USER_DEFINED_ASSERT(condition, file, line, msg);
-    #endif
-    #endif
-  }
-
-  // helper structs for statically computing log2 like functions (nbits, log2_floor, log2_ceil)
-  // using recursive templates
-  template<unsigned char N>
-  struct s_N {
-    template<unsigned X>
-    struct s_X {
-      enum {
-        X2 = X >> N,
-        N_div_2 = N >> 1,
-        nbits = X ? (X2 ? N + (int) s_N<N_div_2>::template s_X<X2>::nbits : (int) s_N<N_div_2>::template s_X<X>::nbits) : 0
-      };
-    };
-  };
-  template<> struct s_N<0> {
-    template<unsigned X>
-    struct s_X {
-      enum {nbits = !!X };
-    };
-  };
-
-  template<int N>
-  inline double ldexpr32(double d) {
-    double d2 = d;
-    if(N < 0)
-      for(int i=0; i < -N; i++)
-        d2 /= (Ulong) 1 << 32;
-    else
-      for(int i=0; i < N; i++)
-        d2 *= (Ulong) 1 << 32;
-    return d2;
-  }
-  template<> inline double ldexpr32<0>(double d) { return d; }
-  template<> inline double ldexpr32<1>(double d) { return d * ((Ulong) 1 << 32); }
-  template<> inline double ldexpr32<-1>(double d) { return d / ((Ulong) 1 << 32); }
-  template<> inline double ldexpr32<2>(double d) { return (d * ((Ulong) 1 << 32)) * ((Ulong) 1 << 32); }
-  template<> inline double ldexpr32<-2>(double d) { return (d / ((Ulong) 1 << 32)) / ((Ulong) 1 << 32); }
-
-  template<int N>
-  inline double ldexpr(double d) {
-    return ldexpr32<N/32>( N < 0 ? d/( (unsigned) 1 << (-N & 31)) : d * ( (unsigned) 1 << (N & 31)));
-  }
-
-  template<int N>
-  inline void iv_copy(const int *op, int *r) {
-    for(int i=0; i < N; i++)
-      r[i] = op[i];
-  }
-  template<> inline void iv_copy<1>(const int *op, int *r) {
-    r[0] = op[0];
-  }
-  template<> inline void iv_copy<2>(const int *op, int *r) {
-    r[0] = op[0];
-    r[1] = op[1];
-  }
-
-  template<int N>
-  inline bool iv_equal_zero(const int *op){
-    for(int i=0; i < N; i++)
-      if(op[i])
-        return false;
-    return true;
-  }
-  template<> inline bool iv_equal_zero<0>(const int * /*op*/) { return true; }
-  template<> inline bool iv_equal_zero<1>(const int *op) {
-    return !op[0];
-  }
-  template<> inline bool iv_equal_zero<2>(const int *op) {
-    return !(op[0] || op[1]);
-  }
-
-  template<int N>
-  inline bool iv_equal_ones(const int *op){
-    for(int i=0; i < N; i++)
-      if(~op[i])
-        return false;
-    return true;
-  }
-  template<> inline bool iv_equal_ones<0>(const int * /*op*/) { return true; }
-  template<> inline bool iv_equal_ones<1>(const int *op) {
-    return !~op[0];
-  }
-  template<> inline bool iv_equal_ones<2>(const int *op) {
-    return !(~op[0] || ~op[1]);
-  }
-
-  template<int N1, int N2>
-  inline bool iv_equal(const int *op1, const int *op2){
-    const int M1 = AC_MAX(N1,N2);
-    const int M2 = AC_MIN(N1,N2);
-    const int *OP1 = N1 >= N2 ? op1 : op2;
-    const int *OP2 = N1 >= N2 ? op2 : op1;
-    for(int i=0; i < M2; i++)
-      if(OP1[i] != OP2[i])
-        return false;
-    int ext = OP2[M2-1] < 0 ? ~0 : 0;
-    for(int i=M2; i < M1; i++)
-      if(OP1[i] != ext)
-        return false;
-    return true;
-  }
-  template<> inline bool iv_equal<1,1>(const int *op1, const int *op2) {
-    return op1[0] == op2[0];
-  }
-
-  template<int B, int N>
-  inline bool iv_equal_ones_from(const int *op){
-    if((B >= 32*N && op[N-1] >= 0) || (B&31 && ~(op[B/32] >> (B&31))))
-      return false;
-    return iv_equal_ones<N-(B+31)/32>(&op[(B+31)/32]);
-  }
-  template<> inline bool iv_equal_ones_from<0,1>(const int *op){
-    return iv_equal_ones<1>(op);
-  }
-  template<> inline bool iv_equal_ones_from<0,2>(const int *op){
-    return iv_equal_ones<2>(op);
-  }
-
-  template<int B, int N>
-  inline bool iv_equal_zeros_from(const int *op){
-    if((B >= 32*N && op[N-1] < 0) || (B&31 && (op[B/32] >> (B&31))))
-      return false;
-    return iv_equal_zero<N-(B+31)/32>(&op[(B+31)/32]);
-  }
-  template<> inline bool iv_equal_zeros_from<0,1>(const int *op){
-    return iv_equal_zero<1>(op);
-  }
-  template<> inline bool iv_equal_zeros_from<0,2>(const int *op){
-    return iv_equal_zero<2>(op);
-  }
-
-  template<int B, int N>
-  inline bool iv_equal_ones_to(const int *op){
-    if((B >= 32*N && op[N-1] >= 0) || (B&31 && ~(op[B/32] | (all_ones << (B&31)))))
-      return false;
-    return iv_equal_ones<B/32>(op);
-  }
-  template<> inline bool iv_equal_ones_to<0,1>(const int *op){
-    return iv_equal_ones<1>(op);
-  }
-  template<> inline bool iv_equal_ones_to<0,2>(const int *op){
-    return iv_equal_ones<2>(op);
-  }
-
-  template<int B, int N>
-  inline bool iv_equal_zeros_to(const int *op){
-    if((B >= 32*N && op[N-1] < 0) || (B&31 && (op[B/32] & ~(all_ones << (B&31)))))
-      return false;
-    return iv_equal_zero<B/32>(op);
-  }
-  template<> inline bool iv_equal_zeros_to<0,1>(const int *op){
-    return iv_equal_zero<1>(op);
-  }
-  template<> inline bool iv_equal_zeros_to<0,2>(const int *op){
-    return iv_equal_zero<2>(op);
-  }
-
-  template<int N1, int N2, bool greater>
-  inline bool iv_compare(const int *op1, const int *op2){
-    const int M1 = AC_MAX(N1,N2);
-    const int M2 = AC_MIN(N1,N2);
-    const int *OP1 = N1 >= N2 ? op1 : op2;
-    const int *OP2 = N1 >= N2 ? op2 : op1;
-    const bool b = (N1 >= N2) == greater;
-    int ext = OP2[M2-1] < 0 ? ~0 : 0;
-    int i2 = M1 > M2 ?
ext : OP2[M1-1]; - if(OP1[M1-1] != i2) - return b ^ (OP1[M1-1] < i2); - for(int i=M1-2; i >= M2; i--) { - if((unsigned) OP1[i] != (unsigned) ext) - return b ^ ((unsigned) OP1[i] < (unsigned) ext); - } - for(int i=M2-1; i >= 0; i--) { - if((unsigned) OP1[i] != (unsigned) OP2[i]) - return b ^ ((unsigned) OP1[i] < (unsigned) OP2[i]); - } - return false; - } - template<> inline bool iv_compare<1,1,true>(const int *op1, const int *op2) { - return op1[0] > op2[0]; - } - template<> inline bool iv_compare<1,1,false>(const int *op1, const int *op2) { - return op1[0] < op2[0]; - } - - template - inline void iv_extend(int *r, int ext) { - for(int i=0; i < N; i++) - r[i] = ext; - } - template<> inline void iv_extend<-2>(int * /*r*/, int /*ext*/) { } - template<> inline void iv_extend<-1>(int * /*r*/, int /*ext*/) { } - template<> inline void iv_extend<0>(int * /*r*/, int /*ext*/) { } - template<> inline void iv_extend<1>(int *r, int ext) { - r[0] = ext; - } - template<> inline void iv_extend<2>(int *r, int ext) { - r[0] = ext; - r[1] = ext; - } - - template - inline void iv_assign_int64(int *r, Slong l) { - r[0] = (int) l; - if(Nr > 1) { - r[1] = (int) (l >> 32); - iv_extend(r+2, (r[1] < 0) ? ~0 : 0); - } - } - template<> inline void iv_assign_int64<1>(int *r, Slong l) { - r[0] = (int) l; - } - template<> inline void iv_assign_int64<2>(int *r, Slong l) { - r[0] = (int) l; - r[1] = (int) (l >> 32); - } - - template - inline void iv_assign_uint64(int *r, Ulong l) { - r[0] = (int) l; - if(Nr > 1) { - r[1] = (int) (l >> 32); - iv_extend(r+2, 0); - } - } - template<> inline void iv_assign_uint64<1>(int *r, Ulong l) { - r[0] = (int) l; - } - template<> inline void iv_assign_uint64<2>(int *r, Ulong l) { - r[0] = (int) l; - r[1] = (int) (l >> 32); - } - - inline Ulong mult_u_u(int a, int b) { - return (Ulong) (unsigned) a * (Ulong) (unsigned) b; - } - inline Slong mult_u_s(int a, int b) { - return (Ulong) (unsigned) a * (Slong) (signed) b; - } - inline Slong mult_s_u(int a, int b) { - return (Slong) (signed) a * (Ulong) (unsigned) b; - } - inline Slong mult_s_s(int a, int b) { - return (Slong) (signed) a * (Slong) (signed) b; - } - inline void accumulate(Ulong a, Ulong &l1, Slong &l2) { - l1 += (Ulong) (unsigned) a; - l2 += a >> 32; - } - inline void accumulate(Slong a, Ulong &l1, Slong &l2) { - l1 += (Ulong) (unsigned) a; - l2 += a >> 32; - } - - template - inline void iv_mult(const int *op1, const int *op2, int *r) { - if(Nr==1) - r[0] = op1[0] * op2[0]; - else if(N1==1 && N2==1) - iv_assign_int64(r, ((Slong) op1[0]) * ((Slong) op2[0])); - else { - const int M1 = AC_MAX(N1,N2); - const int M2 = AC_MIN(N1,N2); - const int *OP1 = N1 >= N2 ? op1 : op2; - const int *OP2 = N1 >= N2 ? 
op2 : op1; - const int T1 = AC_MIN(M2-1,Nr); - const int T2 = AC_MIN(M1-1,Nr); - const int T3 = AC_MIN(M1+M2-2,Nr); - - Ulong l1 = 0; - Slong l2 = 0; - for(int k=0; k < T1; k++) { - for(int i=0; i < k+1; i++) - accumulate(mult_u_u(OP1[k-i], OP2[i]), l1, l2); - l2 += (Ulong) (unsigned) (l1 >> 32); - r[k] = (int) l1; - l1 = (unsigned) l2; - l2 >>= 32; - } - for(int k=T1; k < T2; k++) { - accumulate(mult_u_s(OP1[k-M2+1], OP2[M2-1]), l1, l2); - for(int i=0; i < M2-1; i++) - accumulate(mult_u_u(OP1[k-i], OP2[i]), l1, l2); - l2 += (Ulong) (unsigned) (l1 >> 32); - r[k] = (int) l1; - l1 = (unsigned) l2; - l2 >>= 32; - } - for(int k=T2; k < T3; k++) { - accumulate(mult_u_s(OP1[k-M2+1], OP2[M2-1]), l1, l2); - for(int i=k-T2+1; i < M2-1; i++) - accumulate(mult_u_u(OP1[k-i], OP2[i]), l1, l2); - accumulate(mult_s_u(OP1[M1-1], OP2[k-M1+1]), l1, l2); - l2 += (Ulong) (unsigned) (l1 >> 32); - r[k] = (int) l1; - l1 = (unsigned) l2; - l2 >>= 32; - } - if(Nr >= M1+M2-1) { - accumulate(mult_s_s(OP1[M1-1], OP2[M2-1]), l1, l2); - r[M1+M2-2] = (int) l1; - if(Nr >= M1+M2) { - l2 += (Ulong) (unsigned) (l1 >> 32); - r[M1+M2-1] = (int) l2; - iv_extend(r+M1+M2, (r[M1+M2-1] < 0) ? ~0 : 0); - } - } - } - } - template<> inline void iv_mult<1,1,1>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] * op2[0]; - } - template<> inline void iv_mult<1,1,2>(const int *op1, const int *op2, int *r) { - iv_assign_int64<2>(r, ((Slong) op1[0]) * ((Slong) op2[0])); - } - - template - inline bool iv_uadd_carry(const int *op1, bool carry, int *r) { - Slong l = carry; - for(int i=0; i < N; i++) { - l += (Ulong) (unsigned) op1[i]; - r[i] = (int) l; - l >>= 32; - } - return l != 0; - } - template<> inline bool iv_uadd_carry<0>(const int * /*op1*/, bool carry, int * /*r*/) { return carry; } - template<> inline bool iv_uadd_carry<1>(const int *op1, bool carry, int *r) { - Ulong l = carry + (Ulong) (unsigned) op1[0]; - r[0] = (int) l; - return (l >> 32) & 1; - } - - template - inline bool iv_add_int_carry(const int *op1, int op2, bool carry, int *r) { - if(N==0) - return carry; - if(N==1) { - Ulong l = carry + (Slong) op1[0] + (Slong) op2; - r[0] = (int) l; - return (l >> 32) & 1; - } - Slong l = carry + (Ulong) (unsigned) op1[0] + (Slong) op2; - r[0] = (int) l; - l >>= 32; - for(int i=1; i < N-1; i++) { - l += (Ulong) (unsigned) op1[i]; - r[i] = (int) l; - l >>= 32; - } - l += (Slong) op1[N-1]; - r[N-1] = (int) l; - return (l >> 32) & 1; - } - template<> inline bool iv_add_int_carry<0>(const int * /*op1*/, int /*op2*/, bool carry, int * /*r*/) { return carry; } - template<> inline bool iv_add_int_carry<1>(const int *op1, int op2, bool carry, int *r) { - Ulong l = carry + (Slong) op1[0] + (Slong) op2; - r[0] = (int) l; - return (l >> 32) & 1; - } - - template - inline bool iv_uadd_n(const int *op1, const int *op2, int *r) { - Ulong l = 0; - for(int i=0; i < N; i++) { - l += (Ulong)(unsigned) op1[i] + (Ulong)(unsigned) op2[i]; - r[i] = (int) l; - l >>= 32; - } - return l & 1; - } - template<> inline bool iv_uadd_n<0>(const int * /*op1*/, const int * /*op2*/, int * /*r*/) { return false; } - template<> inline bool iv_uadd_n<1>(const int *op1, const int *op2, int *r) { - Ulong l = (Ulong) (unsigned) op1[0] + (Ulong) (unsigned) op2[0]; - r[0] = (int) l; - return (l >> 32) & 1; - } - template<> inline bool iv_uadd_n<2>(const int *op1, const int *op2, int *r) { - Ulong l = (Ulong) (unsigned) op1[0] + (Ulong) (unsigned) op2[0]; - r[0] = (int) l; - l >>= 32; - l += (Ulong) (unsigned) op1[1] + (Ulong) (unsigned) op2[1]; - r[1] = (int) l; - 
return (l >> 32) & 1; - } - - template - inline void iv_add(const int *op1, const int *op2, int *r) { - if(Nr==1) - r[0] = op1[0] + op2[0]; - else { - const int M1 = AC_MAX(N1,N2); - const int M2 = AC_MIN(N1,N2); - const int *OP1 = N1 >= N2 ? op1 : op2; - const int *OP2 = N1 >= N2 ? op2 : op1; - const int T1 = AC_MIN(M2-1,Nr); - const int T2 = AC_MIN(M1,Nr); - - bool carry = iv_uadd_n(OP1, OP2, r); - carry = iv_add_int_carry(OP1+T1, OP2[T1], carry, r+T1); - iv_extend(r+T2, carry ? ~0 : 0); - } - } - template<> inline void iv_add<1,1,1>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] + op2[0]; - } - template<> inline void iv_add<1,1,2>(const int *op1, const int *op2, int *r) { - iv_assign_int64<2>(r, (Slong) op1[0] + (Slong) op2[0]); - } - - template - inline bool iv_sub_int_borrow(const int *op1, int op2, bool borrow, int *r) { - if(N==1) { - Ulong l = (Slong) op1[0] - (Slong) op2 - borrow; - r[0] = (int) l; - return (l >> 32) & 1; - } - Slong l = (Ulong) (unsigned) op1[0] - (Slong) op2 - borrow; - r[0] = (int) l; - l >>= 32; - for(int i=1; i < N-1; i++) { - l += (Ulong) (unsigned) op1[i]; - r[i] = (int) l; - l >>= 32; - } - l += (Slong) op1[N-1]; - r[N-1] = (int) l; - return (l >> 32) & 1; - } - template<> inline bool iv_sub_int_borrow<0>(const int * /*op1*/, int /*op2*/, bool borrow, int * /*r*/) { return borrow; } - template<> inline bool iv_sub_int_borrow<1>(const int *op1, int op2, bool borrow, int *r) { - Ulong l = (Slong) op1[0] - (Slong) op2 - borrow; - r[0] = (int) l; - return (l >> 32) & 1; - } - - template - inline bool iv_sub_int_borrow(int op1, const int *op2, bool borrow, int *r) { - if(N==1) { - Ulong l = (Slong) op1 - (Slong) op2[0] - borrow; - r[0] = (int) l; - return (l >> 32) & 1; - } - Slong l = (Slong) op1 - (Ulong) (unsigned) op2[0] - borrow; - r[0] = (int) l; - l >>= 32; - for(int i=1; i < N-1; i++) { - l -= (Ulong) (unsigned) op2[i]; - r[i] = (int) l; - l >>= 32; - } - l -= (Slong) op2[N-1]; - r[N-1] = (int) l; - return (l >> 32) & 1; - } - template<> inline bool iv_sub_int_borrow<0>(int /*op1*/, const int * /*op2*/, bool borrow, int * /*r*/) { return borrow; } - template<> inline bool iv_sub_int_borrow<1>(int op1, const int *op2, bool borrow, int *r) { - Ulong l = (Slong) op1 - (Slong) op2[0] - borrow; - r[0] = (int) l; - return (l >> 32) & 1; - } - - template - inline bool iv_usub_n(const int *op1, const int *op2, int *r) { - Slong l = 0; - for(int i=0; i < N; i++) { - l += (Ulong)(unsigned) op1[i] - (Ulong)(unsigned) op2[i]; - r[i] = (int) l; - l >>= 32; - } - return l & 1; - } - template<> inline bool iv_usub_n<1>(const int *op1, const int *op2, int *r) { - Ulong l = (Ulong) (unsigned) op1[0] - (Ulong) (unsigned) op2[0]; - r[0] = (int) l; - return (l >> 32) & 1; - } - template<> inline bool iv_usub_n<2>(const int *op1, const int *op2, int *r) { - Slong l = (Ulong) (unsigned) op1[0] - (Ulong) (unsigned) op2[0]; - r[0] = (int) l; - l >>= 32; - l += (Ulong) (unsigned) op1[1] - (Ulong) (unsigned) op2[1]; - r[1] = (int) l; - return (l >> 32) & 1; - } - - template - inline void iv_sub(const int *op1, const int *op2, int *r) { - if(Nr==1) - r[0] = op1[0] - op2[0]; - else { - const int M1 = AC_MAX(N1,N2); - const int M2 = AC_MIN(N1,N2); - const int T1 = AC_MIN(M2-1,Nr); - const int T2 = AC_MIN(M1,Nr); - bool borrow = iv_usub_n(op1, op2, r); - if(N1 > N2) - borrow = iv_sub_int_borrow(op1+T1, op2[T1], borrow, r+T1); - else - borrow = iv_sub_int_borrow(op1[T1], op2+T1, borrow, r+T1); - iv_extend(r+T2, borrow ? 
~0 : 0); - } - } - template<> inline void iv_sub<1,1,1>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] - op2[0]; - } - template<> inline void iv_sub<1,1,2>(const int *op1, const int *op2, int *r) { - iv_assign_int64<2>(r, (Slong) op1[0] - (Slong) op2[0]); - } - - template - inline bool iv_all_bits_same(const int *op, bool bit) { - int t = bit ? ~0 : 0; - for(int i=0; i < N; i++) - if(op[i] != t) - return false; - return true; - } - template<> inline bool iv_all_bits_same<0>(const int * /*op*/, bool /*bit*/) { return true; } - template<> inline bool iv_all_bits_same<1>(const int *op, bool bit) { - return op[0] == (bit ? ~0 : 0); - } - - template - void iv_neg(const int *op1, int *r) { - Slong l = 0; - for(int k = 0; k < AC_MIN(N,Nr); k++) { - l -= (Ulong) (unsigned) op1[k]; - r[k] = (unsigned) l; - l >>= 32; - } - if(Nr > N) { - r[N] = (unsigned) (l - (op1[N-1] < 0 ? ~0 : 0)); - iv_extend(r+N+1, r[N] < 0 ? ~0 : 0); - } - } - - template - void iv_abs(const int *op1, int *r) { - if( S && op1[N-1] < 0) { - iv_neg(op1, r); - } else { - iv_copy(op1, r); - iv_extend(r+N, 0); - } - } - - template - void iv_udiv(const sw2 *n, const sw2 *d, sw2 *q, sw2 *r) { - const int w2_length = 2*w1_length; - int d_msi; // most significant int for d - for(d_msi = D-1; d_msi > 0 && !d[d_msi]; d_msi--) {} - uw4 d1 = 0; - if(!d_msi && !d[0]) { - d1 = n[0]/d[0]; // d is zero => divide by zero - return; - } - int n_msi; // most significant int for n - for(n_msi = N-1; n_msi > 0 && !n[n_msi]; n_msi--) {} - for(int i=0; i < Q; i++) - q[i] = 0; - for(int i=0; i < R; i++) - r[i] = n[i]; - // write most significant "words" into d1 - bool d_mss_odd = (bool) (d[d_msi] >> w1_length); - int d_mss= 2*d_msi + d_mss_odd; // index to most significant short (16-bit) - d1 = (uw4) (uw2) d[d_msi] << (w1_length << (int) !d_mss_odd); - if(d_msi) - d1 |= (uw2) d[d_msi-1] >> (d_mss_odd ? w1_length : 0); - bool n_mss_odd = (bool) (n[n_msi] >> w1_length); - int n_mss = 2*n_msi + n_mss_odd; - if(n_mss < d_mss) { - // q already initialized to 0 - if(R) { - int r_msi = AC_MIN(R-1, n_msi); - for(int j = 0; j <= r_msi; j++) - r[j] = n[j]; - for(int j = r_msi+1; j < R; j++) - r[j] = 0; - } - } else { - uw2 r1[N+1]; - r1[n_msi+1] = 0; - for(int k = n_msi; k >= 0; k--) - r1[k] = n[k]; - for(int k = n_mss; k >=d_mss; k--) { - int k_msi = k >> 1; - bool odd = k & 1; - uw2 r1m1 = k_msi > 0 ? r1[k_msi-1] : (uw2) 0; - uw4 n1 = odd ? - (uw4) ((r1[k_msi+1] << w1_length) | (r1[k_msi] >> w1_length)) << w2_length | ((r1[k_msi] << w1_length) | (r1m1 >> w1_length)) : - (uw4) r1[k_msi] << w2_length | r1m1; - uw2 q1 = n1/d1; - if(q1 >> w1_length) - q1--; - AC_ASSERT(!(q1 >> w1_length), "Problem detected in long division algorithm, Please report"); - unsigned k2 = k - d_mss; - unsigned k2_i = k2 >> 1; - bool odd_2 = k2 & 1; - uw2 q2 = q1 << (odd_2 ? w1_length : 0); - sw4 l = 0; - for(int j = 0; j <= d_msi; j++) { - l += r1[k2_i + j]; - bool l_sign = l < 0; - sw4 prod = (uw4) (uw2) d[j] * (uw4) q2; - l -= prod; - bool ov1 = (l >= 0) & ((prod < 0) | l_sign); - bool ov2 = (l < 0) & (prod < 0) & l_sign; - r1[k2_i + j] = (uw2) l; - l >>= w2_length; - if(ov1) - l |= ((uw4) -1 << w2_length); - if(ov2) - l ^= ((sw4) 1 << w2_length); - } - if(odd_2 | d_mss_odd) { - l += r1[k2_i + d_msi + 1]; - r1[k2_i + d_msi + 1] = (uw2) l; - } - if(l < 0) { - l = 0; - for(int j = 0; j <= d_msi; j++) { - l += (sw4) (uw2) d[j] << (odd_2 ? 
w1_length : 0); - l += r1[k2_i + j]; - r1[k2_i + j] = (uw2) l; - l >>= w2_length; - } - if(odd_2 | d_mss_odd) - r1[k2_i + d_msi + 1] += (uw2) l; - q1--; - } - if(Q && k2_i < Q) { - if(odd_2) - q[k2_i] = q1 << w1_length; - else - q[k2_i] |= q1; - } - } - if(R) { - int r_msi = AC_MIN(R-1, n_msi); - for(int j = 0; j <= r_msi; j++) - r[j] = r1[j]; - for(int j = r_msi+1; j < R; j++) - r[j] = 0; - } - } - } - - template - inline void iv_div(const int *op1, const int *op2, int *r) { - enum { N1_over = N1+(Den_s && (Num_s==2)) }; - if(N1_over==1 && N2==1) { - r[0] = op1[0] / op2[0]; - iv_extend(r+1, ((Num_s || Den_s) && (r[0] < 0)) ? ~0 : 0); - } - else if(N1_over==1 && N2==2) - iv_assign_int64(r, ( (Slong) op1[0]) / (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); - else if(N1_over==2 && N2==1) - if(N1 == 1) - iv_assign_int64(r, ( (Slong) op1[0]) / ( (Slong) op2[0]) ); - else - iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) / ( (Slong) op2[0]) ); - else if(N1_over==2 && N2==2) - if(N1 == 1) - iv_assign_int64(r, ( (Slong) op1[0]) / (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); - else - iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) / (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); - else if(!Num_s && !Den_s) { - iv_udiv(op1, op2, r, 0); - } - else { - enum { N1_neg = N1+(Num_s==2), N2_neg = N2+(Den_s==2)}; - int numerator[N1_neg]; - int denominator[N2_neg]; - int quotient[N1_neg]; - iv_abs(op1, numerator); - iv_abs(op2, denominator); - iv_udiv(numerator, denominator, quotient, 0); - if( (Num_s && op1[N1-1] < 0) ^ (Den_s && op2[N2-1] < 0) ) - iv_neg(quotient, r); - else { - iv_copy(quotient, r); - iv_extend(r+N1_neg, (Num_s || Den_s) && r[N1_neg-1] < 0 ? ~0 : 0); - } - } - } - - template - inline void iv_rem(const int *op1, const int *op2, int *r) { - enum { N1_over = N1+(Den_s && (Num_s==2)) }; // N1_over corresponds to the division - if(N1_over==1 && N2==1) { - r[0] = op1[0] % op2[0]; - iv_extend(r+1, Num_s && r[0] < 0 ? ~0 : 0); - } - else if(N1_over==1 && N2==2) - iv_assign_int64(r, ( (Slong) op1[0]) % (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); - else if(N1_over==2 && N2==1) - if(N1 == 1) - iv_assign_int64(r, ( (Slong) op1[0]) % ( (Slong) op2[0]) ); - else - iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) % ( (Slong) op2[0]) ); - else if(N1_over==2 && N2==2) - if(N1 == 1) - iv_assign_int64(r, ( (Slong) op1[0]) % (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); - else - iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) % (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); - else if(!Num_s && !Den_s) { - iv_udiv(op1, op2, 0, r); - } - else { - enum { N1_neg = N1+(Num_s==2), N2_neg = N2+(Den_s==2)}; - int numerator[N1_neg]; - int denominator[N2_neg]; - int remainder[N2]; - iv_abs(op1, numerator); - iv_abs(op2, denominator); - iv_udiv(numerator, denominator, 0, remainder); - if( (Num_s && op1[N1-1] < 0) ) - iv_neg(remainder, r); - else { - iv_copy(remainder, r); - iv_extend(r+N2, Num_s && r[N2-1] < 0 ? ~0 : 0); - } - } - } - - template - inline void iv_bitwise_complement_n(const int *op, int *r) { - for(int i=0; i < N; i++) - r[i] = ~op[i]; - } - template<> inline void iv_bitwise_complement_n<1>(const int *op, int *r) { - r[0] = ~op[0]; - } - template<> inline void iv_bitwise_complement_n<2>(const int *op, int *r) { - r[0] = ~op[0]; - r[1] = ~op[1]; - } - - template - inline void iv_bitwise_complement(const int *op, int *r) { - const int M = AC_MIN(N,Nr); - iv_bitwise_complement_n(op, r); - iv_extend(r+M, (r[M-1] < 0) ? 
~0 : 0); - } - - template - inline void iv_bitwise_and_n(const int *op1, const int *op2, int *r) { - for(int i=0; i < N; i++) - r[i] = op1[i] & op2[i]; - } - template<> inline void iv_bitwise_and_n<1>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] & op2[0]; - } - template<> inline void iv_bitwise_and_n<2>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] & op2[0]; - r[1] = op1[1] & op2[1]; - } - - template - inline void iv_bitwise_and(const int *op1, const int *op2, int *r) { - const int M1 = AC_MIN(AC_MAX(N1,N2), Nr); - const int M2 = AC_MIN(AC_MIN(N1,N2), Nr); - const int *OP1 = N1 > N2 ? op1 : op2; - const int *OP2 = N1 > N2 ? op2 : op1; - - iv_bitwise_and_n(op1, op2, r); - if(OP2[M2-1] < 0) - iv_copy(OP1+M2, r+M2); - else - iv_extend(r+M2, 0); - iv_extend(r+M1, (r[M1-1] < 0) ? ~0 : 0); - } - - template - inline void iv_bitwise_or_n(const int *op1, const int *op2, int *r) { - for(int i=0; i < N; i++) - r[i] = op1[i] | op2[i]; - } - template<> inline void iv_bitwise_or_n<1>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] | op2[0]; - } - template<> inline void iv_bitwise_or_n<2>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] | op2[0]; - r[1] = op1[1] | op2[1]; - } - - template - inline void iv_bitwise_or(const int *op1, const int *op2, int *r) { - const int M1 = AC_MIN(AC_MAX(N1,N2), Nr); - const int M2 = AC_MIN(AC_MIN(N1,N2), Nr); - const int *OP1 = N1 >= N2 ? op1 : op2; - const int *OP2 = N1 >= N2 ? op2 : op1; - - iv_bitwise_or_n(op1, op2, r); - if(OP2[M2-1] < 0) - iv_extend(r+M2, ~0); - else - iv_copy(OP1+M2, r+M2); - iv_extend(r+M1, (r[M1-1] < 0) ? ~0 : 0); - } - - template - inline void iv_bitwise_xor_n(const int *op1, const int *op2, int *r) { - for(int i=0; i < N; i++) - r[i] = op1[i] ^ op2[i]; - } - template<> inline void iv_bitwise_xor_n<1>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] ^ op2[0]; - } - template<> inline void iv_bitwise_xor_n<2>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] ^ op2[0]; - r[1] = op1[1] ^ op2[1]; - } - - template - inline void iv_bitwise_xor(const int *op1, const int *op2, int *r) { - const int M1 = AC_MIN(AC_MAX(N1,N2), Nr); - const int M2 = AC_MIN(AC_MIN(N1,N2), Nr); - const int *OP1 = N1 >= N2 ? op1 : op2; - const int *OP2 = N1 >= N2 ? op2 : op1; - - iv_bitwise_xor_n(op1, op2, r); - if(OP2[M2-1] < 0) - iv_bitwise_complement_n(OP1+M2, r+M2); - else - iv_copy(OP1+M2, r+M2); - iv_extend(r+M1, (r[M1-1] < 0) ? ~0 : 0); - } - - template - inline void iv_shift_l(const int *op1, unsigned op2, int *r) { - AC_ASSERT(Nr <= N, "iv_shift_l, incorrect usage Nr > N"); - unsigned s31 = op2 & 31; - unsigned ishift = (op2 >> 5) > Nr ? Nr : (op2 >> 5); - if(s31 && ishift!=Nr) { - unsigned lw = 0; - for(unsigned i=0; i < Nr; i++) { - unsigned hw = (i >= ishift) ? op1[i-ishift] : 0; - r[i] = (hw << s31) | (lw >> (32-s31)); - lw = hw; - } - } else { - for(unsigned i=0; i < Nr ; i++) - r[i] = (i >= ishift) ? op1[i-ishift] : 0; - } - } - - template - inline void iv_shift_r(const int *op1, unsigned op2, int *r) { - unsigned s31 = op2 & 31; - unsigned ishift = (op2 >> 5) > N ? N : (op2 >> 5); - int ext = op1[N-1] < 0 ? ~0 : 0; - if(s31 && ishift!=N) { - unsigned lw = (ishift < N) ? op1[ishift] : ext; - for(unsigned i=0; i < Nr; i++) { - unsigned hw = (i+ishift+1 < N) ? op1[i+ishift+1] : ext; - r[i] = (lw >> s31) | (hw << (32-s31)); - lw = hw; - } - } else { - for(unsigned i=0; i < Nr ; i++) - r[i] = (i+ishift < N) ? 
op1[i+ishift] : ext; - } - } - - template - inline void iv_shift_l2(const int *op1, signed op2, int *r) { - if(S && op2 < 0) - iv_shift_r(op1, -op2, r); - else - iv_shift_l(op1, op2, r); - } - - template<> inline void iv_shift_l2<1,1,false>(const int *op1, signed op2, int *r) { - r[0] = (op2 < 32) ? ( (unsigned) op1[0] << op2) : 0; - } - template<> inline void iv_shift_l2<1,1,true>(const int *op1, signed op2, int *r) { - r[0] = (op2 >= 0) ? - (op2 < 32) ? ( (unsigned) op1[0] << op2) : 0 : - (op2 > -32) ? (op1[0] >> -op2) : (op1[0] >> 31); - } - - template - inline void iv_shift_r2(const int *op1, signed op2, int *r) { - if(S && op2 < 0) - iv_shift_l(op1, -op2, r); - else - iv_shift_r(op1, op2, r); - } - - template<> inline void iv_shift_r2<1,1,false>(const int *op1, signed op2, int *r) { - r[0] = (op2 < 32) ? (op1[0] >> op2) : (op1[0] >> 31); - } - template<> inline void iv_shift_r2<1,1,true>(const int *op1, signed op2, int *r) { - r[0] = (op2 >= 0) ? - (op2 < 32) ? (op1[0] >> op2) : (op1[0] >> 31) : - (op2 > -32) ? ( (unsigned) op1[0] << -op2) : 0; - } - - template - inline void iv_const_shift_l(const int *op1, int *r) { - // B >= 0 - if(!B) { - const int M1 = AC_MIN(N,Nr); - iv_copy(op1, r); - iv_extend(r+M1, r[M1-1] < 0 ? -1 : 0); - } - else { - const unsigned s31 = B & 31; - const int ishift = (((B >> 5) > Nr) ? Nr : (B >> 5)); - iv_extend(r, 0); - const int M1 = AC_MIN(N+ishift,Nr); - if(s31) { - unsigned lw = 0; - for(int i=ishift; i < M1; i++) { - unsigned hw = op1[i-ishift]; - r[i] = (hw << s31) | (lw >> ((32-s31)&31)); // &31 is to quiet compilers - lw = hw; - } - if(Nr > M1) { - r[M1] = (signed) lw >> ((32-s31)&31); // &31 is to quiet compilers - iv_extend(r+M1+1, r[M1] < 0 ? ~0 : 0); - } - } else { - for(int i=ishift; i < M1 ; i++) - r[i] = op1[i-ishift]; - iv_extend(r+M1, r[M1-1] < 0 ? -1 : 0); - } - } - } - template<> inline void iv_const_shift_l<1,1,0>(const int *op1, int *r) { - r[0] = op1[0]; - } - template<> inline void iv_const_shift_l<2,1,0>(const int *op1, int *r) { - r[0] = op1[0]; - } - - template - inline void iv_const_shift_r(const int *op1, int *r) { - if(!B) { - const int M1 = AC_MIN(N,Nr); - iv_copy(op1, r); - iv_extend(r+M1, r[M1-1] < 0 ? ~0 : 0); - } - else { - const unsigned s31 = B & 31; - const int ishift = (((B >> 5) > N) ? N : (B >> 5)); - int ext = op1[N-1] < 0 ? ~0 : 0; - if(s31 && ishift!=N) { - unsigned lw = (ishift < N) ? op1[ishift] : ext; - for(int i=0; i < Nr; i++) { - unsigned hw = (i+ishift+1 < N) ? op1[i+ishift+1] : ext; - r[i] = (lw >> s31) | (hw << ((32-s31)&31)); // &31 is to quiet compilers - lw = hw; - } - } else { - for(int i=0; i < Nr ; i++) - r[i] = (i+ishift < N) ? op1[i+ishift] : ext; - } - } - } - template<> inline void iv_const_shift_r<1,1,0>(const int *op1, int *r) { - r[0] = op1[0]; - } - template<> inline void iv_const_shift_r<2,1,0>(const int *op1, int *r) { - r[0] = op1[0]; - } - - template - inline void iv_conv_from_fraction(double d, int *r, bool *qb, bool *rbits, bool *o) { - bool b = d < 0; - double d2 = b ? -d : d; - double dfloor = mgc_floor(d2); - *o = dfloor != 0.0; - d2 = d2 - dfloor; - for(int i=N-1; i >=0; i--) { - d2 *= (Ulong) 1 << 32; - unsigned k = (unsigned int) d2; - r[i] = b ? 
~k : k; - d2 -= k; - } - d2 *= 2; - bool k = ((int) d2) != 0; // is 0 or 1 - d2 -= k; - *rbits = d2 != 0.0; - *qb = (b && *rbits) ^ k; - if(b && !*rbits && !*qb) - iv_uadd_carry(r, true, r); - *o |= b ^ (r[N-1] < 0); - } - - template - inline int to_str(int *v, int w, bool left_just, char *r) { - const char digits[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; - const unsigned char B = b==AC_BIN ? 1 : (b==AC_OCT ? 3 : (b==AC_HEX ? 4 : 0)); - int k = (w+B-1)/B; - int n = (w+31) >> 5; - int bits = 0; - if(b != AC_BIN && left_just) { - if( (bits = -(w % B)) ) - r[--k] = 0; - } - for(int i = 0; i < n; i++) { - if (b != AC_BIN && bits < 0) - r[k] += (unsigned char) (( (unsigned) v[i] << (B+bits)) & (b-1)); - unsigned int m = (unsigned) v[i] >> -bits; - for(bits += 32; bits > 0 && k; bits -= B) { - r[--k] = (char) (m & (b-1)); - m >>= B; - } - } - for(int i=0; i < (w+B-1)/B; i++) - r[i] = digits[(int)r[i]]; - return (w+B-1)/B; - } - template<> inline int to_str(int *v, int w, bool left_just, char *r) { - int k = 0; - int msw = (w-1) >> 5; - if(left_just) { - unsigned bits_msw = w & 31; - if(bits_msw) { - unsigned left_shift = 32 - bits_msw; - for(int i=msw; i > 0; i--) - v[i] = (unsigned) v[i] << left_shift | (unsigned) v[i-1] >> bits_msw; - v[0] = (unsigned) v[0] << left_shift; - } - int lsw = 0; - while(lsw < msw || v[msw] ) { - Ulong l = 0; - for(int i=lsw; i <= msw; i++) { - l += (Ulong) (unsigned) v[i] * 10; - v[i] = l; - l >>= 32; - if(i==lsw && !v[i]) - lsw++; - } - r[k++] = (char) ('0' + (int) l); - } - } else { - const unsigned d = 1000000000; // 10E9 - for(; msw > 0 && !v[msw]; msw--) {} - while(msw >= 0) { - Ulong nl = 0; - for(int i = msw; i >= 0; i--) { - nl <<= 32; - nl |= (unsigned) v[i]; - unsigned q = nl/d; - nl -= (Ulong) q * d; - v[i] = q; - } - if(!v[msw]) - msw--; - bool last = msw == -1; - unsigned rem = (unsigned) nl; - for(int i=0; (i < 9 && !last) || rem; i++) { - r[k++] = (char) ('0' + (int) (rem % 10)); - rem /= 10; - } - } - for(int i=0; i < k/2; i++) { - char c = r[i]; - r[i] = r[k-1-i]; - r[k-1-i] = c; - } - } - r[k] = 0; - return k; - } - - inline int to_string(int *v, int w, bool sign_mag, ac_base_mode base, bool left_just, char *r) { - int n = (w+31) >> 5; - bool neg = !sign_mag && v[n-1] < 0; - if(!left_just) { - while(n-- && v[n] == (neg ? ~0 : 0)) {} - int w2 = 32*(n+1); - if(w2) { - int m = v[n]; - for(int i = 16; i > 0; i >>= 1) { - if((m >> i) == (neg ? ~0 : 0)) - w2 -= i; - else - m >>= i; - } - } - if(w2 < w) - w = w2; - w += !sign_mag; - } - if(base == AC_DEC) - return to_str(v, w, left_just, r); - else if (base == AC_HEX) - return to_str(v, w, left_just, r); - else if (base == AC_OCT) - return to_str(v, w, left_just, r); - else if (base == AC_BIN) - return to_str(v, w, left_just, r); - return 0; - } - - template - inline unsigned iv_leading_bits(const int *op, bool bit); - - template<> inline unsigned iv_leading_bits<1>(const int *op, bool bit) { - const unsigned char tab[] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}; - unsigned t = bit ? ~*op : *op; - unsigned cnt = 0; - if(t >> 16) - t >>= 16; - else - cnt += 16; - if(t >> 8) - t >>= 8; - else - cnt += 8; - if(t >> 4) - t >>= 4; - else - cnt += 4; - cnt += tab[t]; - return cnt; - } - - template - inline unsigned iv_leading_bits(const int *op, bool bit) { - int ext_sign = bit ? -1 : 0; - int k; - for(k = N-1; k >= 0 && op[k] == ext_sign; k--) {} - return 32*(N-1-k) + (k < 0 ? 
0 : iv_leading_bits<1>(op+k, bit)); - } - - ////////////////////////////////////////////////////////////////////////////// - // Integer Vector class: iv - ////////////////////////////////////////////////////////////////////////////// - template - class iv { - protected: - int v[N]; - public: - template friend class iv; - iv() {} - template - iv ( const iv &b ) { - const int M = AC_MIN(N,N2); - iv_copy(b.v, v); - iv_extend(v+M, (v[M-1] < 0) ? ~0 : 0); - } - iv ( Slong t) { - iv_assign_int64(v, t); - } - iv ( Ulong t) { - iv_assign_uint64(v, t); - } - iv ( int t) { - v[0] = t; - iv_extend(v+1, (t < 0) ? ~0 : 0); - } - iv ( unsigned int t) { - v[0] = t; - iv_extend(v+1, 0); - } - iv ( long t) { - if(long_w == 32) { - v[0] = t; - iv_extend(v+1, (t < 0) ? ~0 : 0); - } else - iv_assign_int64(v, t); - } - iv ( unsigned long t) { - if(long_w == 32) { - v[0] = t; - iv_extend(v+1, 0); - } else - iv_assign_uint64(v, t); - } - iv ( double d ) { - double d2 = ldexpr32<-N>(d); - bool qb, rbits, o; - iv_conv_from_fraction(d2, v, &qb, &rbits, &o); - } - - // Explicit conversion functions to C built-in types ------------- - inline Slong to_int64() const { return N==1 ? v[0] : ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; } - inline Ulong to_uint64() const { return N==1 ? (Ulong) v[0] : ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; } - inline double to_double() const { - double a = v[N-1]; - for(int i=N-2; i >= 0; i--) { - a *= (Ulong) 1 << 32; - a += (unsigned) v[i]; - } - return a; - } - inline void conv_from_fraction(double d, bool *qb, bool *rbits, bool *o) { - iv_conv_from_fraction(d, v, qb, rbits, o); - } - - template - inline void mult(const iv &op2, iv &r) const { - iv_mult(v, op2.v, r.v); - } - template - void add(const iv &op2, iv &r) const { - iv_add(v, op2.v, r.v); - } - template - void sub(const iv &op2, iv &r) const { - iv_sub(v, op2.v, r.v); - } - template - void div(const iv &op2, iv &r) const { - iv_div(v, op2.v, r.v); - } - template - void rem(const iv &op2, iv &r) const { - iv_rem(v, op2.v, r.v); - } - void increment() { - iv_uadd_carry(v, true, v); - } - void decrement() { - iv_sub_int_borrow(v, 0, true, v); - } - template - void neg(iv &r) const { - iv_neg(v, r.v); - } - template - void shift_l(unsigned op2, iv &r) const { - iv_shift_l(v, op2, r.v); - } - template - void shift_l2(signed op2, iv &r) const { - iv_shift_l2(v, op2, r.v); - } - template - void shift_r(unsigned op2, iv &r) const { - iv_shift_r(v, op2, r.v); - } - template - void shift_r2(signed op2, iv &r) const { - iv_shift_r2(v, op2, r.v); - } - template - void const_shift_l(iv &r) const { - iv_const_shift_l(v, r.v); - } - template - void const_shift_r(iv &r) const { - iv_const_shift_r(v, r.v); - } - template - void bitwise_complement(iv &r) const { - iv_bitwise_complement(v, r.v); - } - template - void bitwise_and(const iv &op2, iv &r) const { - iv_bitwise_and(v, op2.v, r.v); - } - template - void bitwise_or(const iv &op2, iv &r) const { - iv_bitwise_or(v, op2.v, r.v); - } - template - void bitwise_xor(const iv &op2, iv &r) const { - iv_bitwise_xor(v, op2.v, r.v); - } - template - bool equal(const iv &op2) const { - return iv_equal(v, op2.v); - } - template - bool greater_than(const iv &op2) const { - return iv_compare(v, op2.v); - } - template - bool less_than(const iv &op2) const { - return iv_compare(v, op2.v); - } - bool equal_zero() const { - return iv_equal_zero(v); - } - template - void set_slc(unsigned lsb, int WS, const iv &op2) { - AC_ASSERT((31+WS)/32 == N2, "Bad usage: WS greater than length of slice"); - 
unsigned msb = lsb+WS-1; - unsigned lsb_v = lsb >> 5; - unsigned lsb_b = lsb & 31; - unsigned msb_v = msb >> 5; - unsigned msb_b = msb & 31; - if(N2==1) { - if(msb_v == lsb_v) - v[lsb_v] ^= (v[lsb_v] ^ ((unsigned) op2.v[0] << lsb_b)) & (~(WS==32 ? 0 : all_ones<> 1) >> (31-lsb_b)); - v[msb_v] ^= (v[msb_v] ^ m) & ~((all_ones<<1)<> 1) >> (31-lsb_b)); - unsigned t = ((unsigned) op2.v[N2-1] << lsb_b) | (((unsigned) op2.v[N2-2] >> 1) >> (31-lsb_b)); - unsigned m; - if(msb_v-lsb_v == N2) { - v[msb_v-1] = t; - m = (((unsigned) op2.v[N2-1] >> 1) >> (31-lsb_b)); - } - else - m = t; - v[msb_v] ^= (v[msb_v] ^ m) & ~((all_ones<<1)<(v, bit); - } - }; - - template<> inline Slong iv<1>::to_int64() const { return v[0]; } - template<> inline Ulong iv<1>::to_uint64() const { return v[0]; } - - template<> inline Slong iv<2>::to_int64() const { - return ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; - } - template<> inline Ulong iv<2>::to_uint64() const { - return ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; - } - - template<> template<> inline void iv<1>::set_slc(unsigned lsb, int WS, const iv<1> &op2) { - v[0] ^= (v[0] ^ ((unsigned) op2.v[0] << lsb)) & (~(WS==32 ? 0 : all_ones< template<> inline void iv<2>::set_slc(unsigned lsb, int WS, const iv<1> &op2) { - Ulong l = to_uint64(); - Ulong l2 = op2.to_uint64(); - l ^= (l ^ (l2 << lsb)) & (~((~(Ulong)0)< template<> inline void iv<2>::set_slc(unsigned lsb, int WS, const iv<2> &op2) { - Ulong l = to_uint64(); - Ulong l2 = op2.to_uint64(); - l ^= (l ^ (l2 << lsb)) & (~(WS==64 ? (Ulong) 0 : ~(Ulong)0< - class iv_conv : public iv { - protected: - iv_conv() {} - template iv_conv(const T& t) : iv(t) {} - }; - - template - class iv_conv : public iv { - public: - operator Ulong () const { return iv::to_uint64(); } - protected: - iv_conv() {} - template iv_conv(const T& t) : iv(t) {} - }; - - template - class iv_conv : public iv { - public: - operator Slong () const { return iv::to_int64(); } - protected: - iv_conv() {} - template iv_conv(const T& t) : iv(t) {} - }; - - // Set default to promote to int as this is the case for almost all types - // create exceptions using specializations - template - struct c_prom { - typedef int promoted_type; - }; - template<> struct c_prom { - typedef unsigned promoted_type; - }; - template<> struct c_prom { - typedef long promoted_type; - }; - template<> struct c_prom { - typedef unsigned long promoted_type; - }; - template<> struct c_prom { - typedef Slong promoted_type; - }; - template<> struct c_prom { - typedef Ulong promoted_type; - }; - template<> struct c_prom { - typedef float promoted_type; - }; - template<> struct c_prom { - typedef double promoted_type; - }; - - template - struct c_arith { - // will error out for pairs of T and T2 that are not defined through specialization - }; - template struct c_arith { - typedef T arith_conv; - }; - - #define C_ARITH(C_TYPE1, C_TYPE2) \ - template<> struct c_arith { \ - typedef C_TYPE1 arith_conv; \ - }; \ - template<> struct c_arith { \ - typedef C_TYPE1 arith_conv; \ - }; - - C_ARITH(double, float) - C_ARITH(double, int) - C_ARITH(double, unsigned) - C_ARITH(double, long) - C_ARITH(double, unsigned long) - C_ARITH(double, Slong) - C_ARITH(double, Ulong) - C_ARITH(float, int) - C_ARITH(float, unsigned) - C_ARITH(float, long) - C_ARITH(float, unsigned long) - C_ARITH(float, Slong) - C_ARITH(float, Ulong) - - C_ARITH(Slong, int) - C_ARITH(Slong, unsigned) - C_ARITH(Ulong, int) - C_ARITH(Ulong, unsigned) - - template - struct map { - typedef T t; - }; - template - struct c_type_params 
{ - // will error out for T for which this template struct is not specialized - }; - - template inline const char *c_type_name() { return "unknown"; } - template<> inline const char *c_type_name() { return "bool";} - template<> inline const char *c_type_name() { return "char";} - template<> inline const char *c_type_name() { return "signed char";} - template<> inline const char *c_type_name() { return "unsigned char";} - template<> inline const char *c_type_name() { return "signed short";} - template<> inline const char *c_type_name() { return "unsigned short";} - template<> inline const char *c_type_name() { return "int";} - template<> inline const char *c_type_name() { return "unsigned";} - template<> inline const char *c_type_name() { return "signed long";} - template<> inline const char *c_type_name() { return "unsigned long";} - template<> inline const char *c_type_name() { return "signed long long";} - template<> inline const char *c_type_name() { return "unsigned long long";} - template<> inline const char *c_type_name() { return "float";} - template<> inline const char *c_type_name() { return "double";} - - template struct c_type; - - template - struct rt_c_type_T { - template - struct op1 { - typedef typename T::template rt_T< c_type >::mult mult; - typedef typename T::template rt_T< c_type >::plus plus; - typedef typename T::template rt_T< c_type >::minus2 minus; - typedef typename T::template rt_T< c_type >::minus minus2; - typedef typename T::template rt_T< c_type >::logic logic; - typedef typename T::template rt_T< c_type >::div2 div; - typedef typename T::template rt_T< c_type >::div div2; - }; - }; - template - struct c_type { - typedef typename c_prom::promoted_type c_prom_T; - struct rt_unary { - typedef c_prom_T neg; - typedef c_prom_T mag_sqr; - typedef c_prom_T mag; - template - struct set { - typedef c_prom_T sum; - }; - }; - template - struct rt_T { - typedef typename rt_c_type_T::template op1::mult mult; - typedef typename rt_c_type_T::template op1::plus plus; - typedef typename rt_c_type_T::template op1::minus minus; - typedef typename rt_c_type_T::template op1::minus2 minus2; - typedef typename rt_c_type_T::template op1::logic logic; - typedef typename rt_c_type_T::template op1::div div; - typedef typename rt_c_type_T::template op1::div2 div2; - }; - inline static std::string type_name() { - std::string r = c_type_name(); - return r; - } - - }; - // with T == c_type - template - struct rt_c_type_T< c_type > { - typedef typename c_prom::promoted_type c_prom_T; - template - struct op1 { - typedef typename c_prom::promoted_type c_prom_T2; - typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv mult; - typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv plus; - typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv minus; - typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv minus2; - typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv logic; - typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv div; - typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv div2; - }; - }; - - #define C_TYPE_MAP(C_TYPE) \ - template<> struct map { \ - typedef c_type t; \ - }; - - #define C_TYPE_PARAMS(C_TYPE, WI, SI) \ - template<> struct c_type_params { \ - enum { W = WI, I = WI, E = 0, S = SI, floating_point = 0 }; \ - }; - - #define C_TYPE_MAP_INT(C_TYPE, WI, SI) \ - C_TYPE_MAP(C_TYPE) \ - C_TYPE_PARAMS(C_TYPE, WI, SI) - - #define C_TYPE_MAP_FLOAT(C_TYPE, FP, WFP, IFP, EFP) \ - C_TYPE_MAP(C_TYPE) \ - template<> struct c_type_params { \ 
- enum { W = WFP, I = IFP, E = EFP, S = true, floating_point = FP }; \ - }; - - C_TYPE_MAP_INT(bool, 1, false) - C_TYPE_MAP_INT(char, 8, true) - C_TYPE_MAP_INT(signed char, 8, true) - C_TYPE_MAP_INT(unsigned char, 8, false) - C_TYPE_MAP_INT(signed short, 16, true) - C_TYPE_MAP_INT(unsigned short, 16, false) - C_TYPE_MAP_INT(signed int, 32, true) - C_TYPE_MAP_INT(unsigned int, 32, false) - C_TYPE_MAP_INT(signed long, ac_private::long_w, true) - C_TYPE_MAP_INT(unsigned long, ac_private::long_w, false) - C_TYPE_MAP_INT(signed long long, 64, true) - C_TYPE_MAP_INT(unsigned long long, 64, false) - C_TYPE_MAP_FLOAT(float, 1, 25, 1, 8) - C_TYPE_MAP_FLOAT(double, 2, 54, 1, 11) - - #undef C_TYPE_INT - #undef C_TYPE_PARAMS - #undef C_TYPE_FLOAT - #undef C_TYPE_MAP - - // specializations for following struct declared/defined after definition of ac_int - template - struct rt_ac_int_T { - template - struct op1 { - typedef typename T::template rt_T< ac_int >::mult mult; - typedef typename T::template rt_T< ac_int >::plus plus; - typedef typename T::template rt_T< ac_int >::minus2 minus; - typedef typename T::template rt_T< ac_int >::minus minus2; - typedef typename T::template rt_T< ac_int >::logic logic; - typedef typename T::template rt_T< ac_int >::div2 div; - typedef typename T::template rt_T< ac_int >::div div2; - }; - }; -} - -namespace ac { - // compiler time constant for log2 like functions - template - struct nbits { - enum { val = X ? ac_private::s_N<16>::s_X::nbits : 1 }; - }; - - template - struct log2_floor { - enum { val = nbits::val - 1 }; - }; - - // log2 of 0 is not defined: generate compiler error - template<> struct log2_floor<0> {}; - - template - struct log2_ceil { - enum { lf = log2_floor::val, val = (X == (1 << lf) ? lf : lf+1) }; - }; - - // log2 of 0 is not defined: generate compiler error - template<> struct log2_ceil<0> {}; - - template - struct int_range { - enum { l_s = (LowerBound < 0), u_s = (UpperBound < 0), - signedness = l_s || u_s, - l_nbits = nbits::val, - u_nbits = nbits::val, - nbits = AC_MAX(l_nbits, u_nbits + (!u_s && signedness)) - }; - typedef ac_int type; - }; - - template - class sliceref { -# if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) -# pragma builtin -# endif - int *d_iv; - template friend class sliceref; - public: - sliceref( int *iv ) : d_iv(iv) {} - - inline const sliceref operator = ( const sliceref &val ) { - return operator=(val); - } - - template - inline const sliceref operator = ( const sliceref &val ) { - const int src_lsi = P2/32; - const int src_msi = (P2+W-1)/32; - const int trg_lsi = P/32; - const int trg_msi = (P+W-1)/32; - const int trg_lsb = P&31; - const int trg_msb = (P+W-1)&31; - const int N = src_msi-src_lsi+1; - const int Nr = trg_msi-trg_lsi+1; - const int rshift = (P2&31) - (P&31); - int shifted_src[Nr]; - int *aligned_src = val.d_iv+src_lsi; - if(rshift) { - if(rshift < 0) - ac_private::iv_shift_l(aligned_src, -rshift, shifted_src); - else - ac_private::iv_shift_r(aligned_src, rshift, shifted_src); - aligned_src = shifted_src; - } - unsigned mask_lsi = ac_private::all_ones << trg_lsb; - unsigned mask_msi = ac_private::all_ones >> (31-trg_msb); - if(Nr==1) - mask_lsi &= mask_msi; - int *v = d_iv+trg_lsi; - v[0] ^= (v[0] ^ ((unsigned) aligned_src[0])) & mask_lsi; - for(int k=1; k < Nr-1; k++) - v[k] = aligned_src[k]; - if(Nr > 1) - v[Nr-1] ^= (v[Nr-1] ^ ((unsigned) aligned_src[Nr-1])) & mask_msi; - if(Is_MSB) { - const unsigned rem = 31-trg_msb; - if(rem) { - v[Nr-1] = S ? 
((signed) ((unsigned) v[Nr-1] << rem) >> rem) - : ((unsigned) v[Nr-1] << rem) >> rem; - } else if(!S) { - v[Nr] = 0; - } - } - return *this; - } - }; -} - -enum ac_q_mode { AC_TRN, AC_RND, AC_TRN_ZERO, AC_RND_ZERO, AC_RND_INF, AC_RND_MIN_INF, AC_RND_CONV, AC_RND_CONV_ODD }; -enum ac_o_mode { AC_WRAP, AC_SAT, AC_SAT_ZERO, AC_SAT_SYM }; -template class ac_fixed; - -////////////////////////////////////////////////////////////////////////////// -// Arbitrary-Length Integer: ac_int -////////////////////////////////////////////////////////////////////////////// - -template -class ac_int : public ac_private::iv_conv<(W+31+!S)/32, S, W<=64> -#ifndef __SYNTHESIS__ -__AC_INT_UTILITY_BASE -#endif -{ -#if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) -#pragma builtin -#endif - - enum {N=(W+31+!S)/32}; - typedef ac_private::iv_conv ConvBase; - typedef ac_private::iv Base; - - inline void bit_adjust() { - const unsigned rem = (32-W)&31; - Base::v[N-1] = S ? ((signed) ((unsigned) Base::v[N-1] << rem) >> rem) : (rem ? - ((unsigned) Base::v[N-1] << rem) >> rem : 0); - } - - inline bool is_neg() const { return S && Base::v[N-1] < 0; } - - // returns false if number is denormal - template - bool normalize_private(ac_int &exp, bool reserved_min_exp=false) { - int expt = exp; - int lshift = leading_sign(); - bool fully_normalized = true; - ac_int min_exp; - min_exp.template set_val(); - int max_shift = exp - min_exp - reserved_min_exp; - if(lshift > max_shift) { - lshift = ac_int(max_shift); - expt = min_exp + reserved_min_exp; - fully_normalized = false; - } else { - expt -= lshift; - } - if(Base::equal_zero()) { - expt = 0; - fully_normalized = true; - } - exp = expt; - Base r; - Base::shift_l(lshift, r); - Base::operator=(r); - bit_adjust(); - return fully_normalized; - } - -public: - static const int width = W; - static const int i_width = W; - static const bool sign = S; - static const ac_q_mode q_mode = AC_TRN; - static const ac_o_mode o_mode = AC_WRAP; - static const int e_width = 0; - - template - struct rt { - enum { - mult_w = W+W2, - mult_s = S||S2, - plus_w = AC_MAX(W+(S2&&!S),W2+(S&&!S2))+1, - plus_s = S||S2, - minus_w = AC_MAX(W+(S2&&!S),W2+(S&&!S2))+1, - minus_s = true, - div_w = W+S2, - div_s = S||S2, - mod_w = AC_MIN(W,W2+(!S2&&S)), - mod_s = S, - logic_w = AC_MAX(W+(S2&&!S),W2+(S&&!S2)), - logic_s = S||S2 - }; - typedef ac_int mult; - typedef ac_int plus; - typedef ac_int minus; - typedef ac_int logic; - typedef ac_int div; - typedef ac_int mod; - typedef ac_int arg1; - }; - - template - struct rt_T { - typedef typename ac_private::map::t map_T; - typedef typename ac_private::rt_ac_int_T::template op1::mult mult; - typedef typename ac_private::rt_ac_int_T::template op1::plus plus; - typedef typename ac_private::rt_ac_int_T::template op1::minus minus; - typedef typename ac_private::rt_ac_int_T::template op1::minus2 minus2; - typedef typename ac_private::rt_ac_int_T::template op1::logic logic; - typedef typename ac_private::rt_ac_int_T::template op1::div div; - typedef typename ac_private::rt_ac_int_T::template op1::div2 div2; - typedef ac_int arg1; - }; - - struct rt_unary { - enum { - neg_w = W+1, - neg_s = true, - mag_sqr_w = 2*W-S, - mag_sqr_s = false, - mag_w = W+S, - mag_s = false, - leading_sign_w = ac::log2_ceil::val, - leading_sign_s = false - }; - typedef ac_int neg; - typedef ac_int mag_sqr; - typedef ac_int mag; - typedef ac_int leading_sign; - template - struct set { - enum { sum_w = W + ac::log2_ceil::val, sum_s = S}; - typedef ac_int sum; - }; - }; - - template friend 
class ac_int; - template friend class ac_fixed; - ac_int() { -#if !defined(__SYNTHESIS__) && defined(AC_DEFAULT_IN_RANGE) - bit_adjust(); -#endif - } - template - inline ac_int (const ac_int &op) { - Base::operator =(op); - bit_adjust(); - } - - inline ac_int( bool b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( char b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( signed char b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( unsigned char b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( signed short b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( unsigned short b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( signed int b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( unsigned int b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( signed long b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( unsigned long b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( Slong b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( Ulong b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( double d ) : ConvBase(d) { bit_adjust(); } - - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( push ) -#pragma warning( disable: 4700 ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#endif -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wuninitialized" -#endif - template - inline ac_int &set_val() { - const unsigned int all_ones = (unsigned) ~0; - if(V == AC_VAL_DC) { - ac_int r; - Base::operator =(r); - bit_adjust(); - } - else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - Base::operator =(0); - if(S && V == AC_VAL_MIN) { - const unsigned int rem = (W-1)&31; - Base::v[N-1] = (all_ones << rem); - } else if(V == AC_VAL_QUANTUM) - Base::v[0] = 1; - } - else { // AC_VAL_MAX - Base::operator =(-1); - const unsigned int rem = (32-W - !S )&31; - Base::v[N-1] = (all_ones >> 1) >> rem; - } - return *this; - } -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( pop ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - - // Explicit conversion functions to C built-in types ------------- - inline int to_int() const { return Base::v[0]; } - inline unsigned to_uint() const { return Base::v[0]; } - inline long to_long() const { - return ac_private::long_w == 32 ? (long) Base::v[0] : (long) Base::to_int64(); - } - inline unsigned long to_ulong() const { - return ac_private::long_w == 32 ? (unsigned long) Base::v[0] : (unsigned long) Base::to_uint64(); - } - inline Slong to_int64() const { return Base::to_int64(); } - inline Ulong to_uint64() const { return Base::to_uint64(); } - inline double to_double() const { return Base::to_double(); } - - inline int length() const { return W; } - - inline std::string to_string(ac_base_mode base_rep, bool sign_mag = false) const { - // base_rep == AC_DEC => sign_mag == don't care (always print decimal in sign magnitude) - char r[N*32+4] = {0}; - int i = 0; - if(sign_mag) - r[i++] = is_neg() ? '-' : '+'; - else if (base_rep == AC_DEC && is_neg()) - r[i++] = '-'; - if(base_rep != AC_DEC) { - r[i++] = '0'; - r[i++] = base_rep == AC_BIN ? 'b' : (base_rep == AC_OCT ? 
'o' : 'x'); - } - int str_w; - if( (base_rep == AC_DEC || sign_mag) && is_neg() ) { - ac_int mag = operator -(); - str_w = ac_private::to_string(mag.v, W+1, sign_mag, base_rep, false, r+i); - } else { - ac_int tmp = *this; - str_w = ac_private::to_string(tmp.v, W+!S, sign_mag, base_rep, false, r+i); - } - if(!str_w) { - r[i] = '0'; - r[i+1] = 0; - } - return std::string(r); - } - inline static std::string type_name() { - const char *tf[] = {",false>", ",true>"}; - std::string r = "ac_int<"; - r += ac_int<32,true>(W).to_string(AC_DEC); - r += tf[S]; - return r; - } - - // Arithmetic : Binary ---------------------------------------------------- - template - typename rt::mult operator *( const ac_int &op2) const { - typename rt::mult r; - Base::mult(op2, r); - return r; - } - template - typename rt::plus operator +( const ac_int &op2) const { - typename rt::plus r; - Base::add(op2, r); - return r; - } - template - typename rt::minus operator -( const ac_int &op2) const { - typename rt::minus r; - Base::sub(op2, r); - return r; - } -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wenum-compare" -#endif - template - typename rt::div operator /( const ac_int &op2) const { - typename rt::div r; - enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, - num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = rt::div::N }; - Base::template div(op2, r); - return r; - } - template - typename rt::mod operator %( const ac_int &op2) const { - typename rt::mod r; - enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, - num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = rt::mod::N }; - Base::template rem(op2, r); - return r; - } -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif - // Arithmetic assign ------------------------------------------------------ - template - ac_int &operator *=( const ac_int &op2) { - Base r; - Base::mult(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator +=( const ac_int &op2) { - Base r; - Base::add(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator -=( const ac_int &op2) { - Base r; - Base::sub(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wenum-compare" -#endif - template - ac_int &operator /=( const ac_int &op2) { - enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, - num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = N }; - Base r; - Base::template div(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator %=( const ac_int &op2) { - enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, - num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = N }; - Base r; - Base::template rem(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif - // Arithmetic prefix increment, decrement ---------------------------------- - ac_int &operator ++() { - Base::increment(); - bit_adjust(); - return *this; - } - ac_int &operator --() { - 
Base::decrement(); - bit_adjust(); - return *this; - } - // Arithmetic postfix increment, decrement --------------------------------- - const ac_int operator ++(int) { - ac_int t = *this; - Base::increment(); - bit_adjust(); - return t; - } - const ac_int operator --(int) { - ac_int t = *this; - Base::decrement(); - bit_adjust(); - return t; - } - // Arithmetic Unary -------------------------------------------------------- - ac_int operator +() { - return *this; - } - typename rt_unary::neg operator -() const { - typename rt_unary::neg r; - Base::neg(r); - r.bit_adjust(); - return r; - } - // ! ------------------------------------------------------------------------ - bool operator ! () const { - return Base::equal_zero(); - } - - // Bitwise (arithmetic) unary: complement ----------------------------- - ac_int operator ~() const { - ac_int r; - Base::bitwise_complement(r); - return r; - } - // Bitwise (non-arithmetic) bit_complement ----------------------------- - ac_int bit_complement() const { - ac_int r; - Base::bitwise_complement(r); - r.bit_adjust(); - return r; - } - // Bitwise (arithmetic): and, or, xor ---------------------------------- - template - typename rt::logic operator & ( const ac_int &op2) const { - typename rt::logic r; - Base::bitwise_and(op2, r); - return r; - } - template - typename rt::logic operator | ( const ac_int &op2) const { - typename rt::logic r; - Base::bitwise_or(op2, r); - return r; - } - template - typename rt::logic operator ^ ( const ac_int &op2) const { - typename rt::logic r; - Base::bitwise_xor(op2, r); - return r; - } - // Bitwise assign (not arithmetic): and, or, xor ---------------------------- - template - ac_int &operator &= ( const ac_int &op2 ) { - Base r; - Base::bitwise_and(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator |= ( const ac_int &op2 ) { - Base r; - Base::bitwise_or(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator ^= ( const ac_int &op2 ) { - Base r; - Base::bitwise_xor(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } - // Shift (result constrained by left operand) ------------------------------- - template - ac_int operator << ( const ac_int &op2 ) const { - ac_int r; - Base::shift_l2(op2.to_int(), r); - r.bit_adjust(); - return r; - } - template - ac_int operator << ( const ac_int &op2 ) const { - ac_int r; - Base::shift_l(op2.to_uint(), r); - r.bit_adjust(); - return r; - } - template - ac_int operator >> ( const ac_int &op2 ) const { - ac_int r; - Base::shift_r2(op2.to_int(), r); - r.bit_adjust(); - return r; - } - template - ac_int operator >> ( const ac_int &op2 ) const { - ac_int r; - Base::shift_r(op2.to_uint(), r); - r.bit_adjust(); - return r; - } - // Shift assign ------------------------------------------------------------ - template - ac_int &operator <<= ( const ac_int &op2 ) { - Base r; - Base::shift_l2(op2.to_int(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator <<= ( const ac_int &op2 ) { - Base r; - Base::shift_l(op2.to_uint(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator >>= ( const ac_int &op2 ) { - Base r; - Base::shift_r2(op2.to_int(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator >>= ( const ac_int &op2 ) { - Base r; - Base::shift_r(op2.to_uint(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - // Relational 
--------------------------------------------------------------- - template - bool operator == ( const ac_int &op2) const { - return Base::equal(op2); - } - template - bool operator != ( const ac_int &op2) const { - return !Base::equal(op2); - } - template - bool operator < ( const ac_int &op2) const { - return Base::less_than(op2); - } - template - bool operator >= ( const ac_int &op2) const { - return !Base::less_than(op2); - } - template - bool operator > ( const ac_int &op2) const { - return Base::greater_than(op2); - } - template - bool operator <= ( const ac_int &op2) const { - return !Base::greater_than(op2); - } - - // Bit and Slice Select ----------------------------------------------------- - template - inline const ac_int slc(const ac_int &index) const { - ac_int r; - AC_ASSERT(index.to_int() >= 0, "Attempting to read slc with negative indeces"); - unsigned uindex = ac_int(index).to_uint(); - Base::shift_r(uindex, r); - r.bit_adjust(); - return r; - } - - template - inline const ac_int slc(signed index) const { - ac_int r; - AC_ASSERT(index >= 0, "Attempting to read slc with negative indeces"); - unsigned uindex = index & ((unsigned)~0 >> 1); - Base::shift_r(uindex, r); - r.bit_adjust(); - return r; - } - template - inline const ac_int slc(unsigned uindex) const { - ac_int r; - Base::shift_r(uindex, r); - r.bit_adjust(); - return r; - } - - template - inline ac_int &set_slc(const ac_int lsb, const ac_int &slc) { - AC_ASSERT(lsb.to_int() + W2 <= W && lsb.to_int() >= 0, "Out of bounds set_slc"); - if(W == W2) - Base::operator =(slc); - else { - unsigned ulsb = ac_int(lsb).to_uint(); - Base::set_slc(ulsb, W2, (ac_int) slc); - } - bit_adjust(); // in case sign bit was assigned - return *this; - } - template - inline ac_int &set_slc(signed lsb, const ac_int &slc) { - AC_ASSERT(lsb + W2 <= W && lsb >= 0, "Out of bounds set_slc"); - if(W == W2) - Base::operator =(slc); - else { - unsigned ulsb = lsb & ((unsigned)~0 >> 1); - Base::set_slc(ulsb, W2, (ac_int) slc); - } - bit_adjust(); // in case sign bit was assigned - return *this; - } - template - inline ac_int &set_slc(unsigned ulsb, const ac_int &slc) { - AC_ASSERT(ulsb + W2 <= W, "Out of bounds set_slc"); - if(W == W2) - Base::operator =(slc); - else - Base::set_slc(ulsb, W2, (ac_int) slc); - bit_adjust(); // in case sign bit was assigned - return *this; - } - - template - inline ac::sliceref range() { - #if __cplusplus > 199711L - static_assert(Msb-Lsb+1 > 0, "Range length not positive: MSB < LSB"); - static_assert(Lsb >= 0, "LSB is negative"); - static_assert(Msb < W, "MSB >= W"); - #endif - return ac::sliceref(Base::v); - } - - class ac_bitref { -# if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) -# pragma builtin -# endif - ac_int &d_bv; - unsigned d_index; - public: - ac_bitref( ac_int *bv, unsigned index=0 ) : d_bv(*bv), d_index(index) {} - operator bool () const { return (d_index < W) ? 
(d_bv.v[d_index>>5]>>(d_index&31) & 1) : 0; } - - template - operator ac_int () const { return operator bool (); } - - inline ac_bitref operator = ( int val ) { - // lsb of int (val&1) is written to bit - if(d_index < W) { - int *pval = &d_bv.v[d_index>>5]; - *pval ^= (*pval ^ ( (unsigned) val << (d_index&31) )) & 1 << (d_index&31); - d_bv.bit_adjust(); // in case sign bit was assigned - } - return *this; - } - template - inline ac_bitref operator = ( const ac_int &val ) { - return operator =(val.to_int()); - } - inline ac_bitref operator = ( const ac_bitref &val ) { - return operator =((int) (bool) val); - } - }; - - ac_bitref operator [] ( unsigned int uindex) { - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - ac_bitref bvh( this, uindex ); - return bvh; - } - ac_bitref operator [] ( int index) { - AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); - unsigned uindex = index & ((unsigned)~0 >> 1); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - ac_bitref bvh( this, uindex ); - return bvh; - } - template - ac_bitref operator [] ( const ac_int &index) { - AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); - unsigned uindex = ac_int(index).to_uint(); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - ac_bitref bvh( this, uindex ); - return bvh; - } - bool operator [] ( unsigned int uindex) const { - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; - } - bool operator [] ( int index) const { - AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); - unsigned uindex = index & ((unsigned)~0 >> 1); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; - } - template - bool operator [] ( const ac_int &index) const { - AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); - unsigned uindex = ac_int(index).to_uint(); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - return (uindex < W) ? 
(Base::v[uindex>>5]>>(uindex&31) & 1) : 0; - } - - typename rt_unary::leading_sign leading_sign() const { - unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; - return ls; - } - typename rt_unary::leading_sign leading_sign(bool &all_sign) const { - unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; - all_sign = (ls == W-S); - return ls; - } - // returns false if number is denormal - template - bool normalize(ac_int &exp) { - return normalize_private(exp); - } - // returns false if number is denormal, minimum exponent is reserved (usually for encoding special values/errors) - template - bool normalize_RME(ac_int &exp) { - return normalize_private(exp, true); - } - bool and_reduce() const { - return ac_private::iv_equal_ones_to(Base::v); - } - bool or_reduce() const { - return !Base::equal_zero(); - } - bool xor_reduce() const { - unsigned r = Base::v[N-1]; - if(S) { - const unsigned rem = (32-W)&31; - r = (r << rem) >> rem; - } - if(N > 1) - r ^= Base::v[N-2]; - if(N > 2) { - for(int i=0; i 16) - r ^= r >> 16; - if(W > 8) - r ^= r >> 8; - if(W > 4) - r ^= r >> 4; - if(W > 2) - r ^= r >> 2; - if(W > 1) - r ^= r >> 1; - return r&1; - } - - inline void bit_fill_hex(const char *str) { - // Zero Pads if str is too short, throws ms bits away if str is too long - // Asserts if anything other than 0-9a-fA-F is encountered - ac_int res = 0; - while(*str) { - char c = *str; - int h = 0; - if(c >= '0' && c <= '9') - h = c - '0'; - else if(c >= 'A' && c <= 'F') - h = c - 'A' + 10; - else if(c >= 'a' && c <= 'f') - h = c - 'a' + 10; - else { - AC_ASSERT(!c, "Invalid hex digit"); - break; - } - res <<= ac_int<3,false>(4); - res |= ac_int<4,false>(h); - str++; - } - *this = res; - } - - template - inline void bit_fill(const int (&ivec)[Na], bool bigendian=true) { - // bit_fill from integer vector - // if W > N*32, missing most significant bits are zeroed - // if W < N*32, additional bits in ivec are ignored (no overflow checking) - // Example: - // ac_int<80,false> x; int vec[] = { 0xffffa987, 0x6543210f, 0xedcba987 }; - // x.bit_fill(vec); // vec[0] fill bits 79-64 - enum { N0 = (W+31)/32, M = AC_MIN(N0,Na) }; - ac_int res = 0; - for(int i=0; i < M; i++) - res.set_slc(i*32, ac_int<32>(ivec[bigendian ? 
M-1-i : i])); - *this = res; - } -}; - -namespace ac { - template - struct rt_2T { - typedef typename ac_private::map::t map_T; - typedef typename ac_private::map::t map_T2; - typedef typename map_T::template rt_T< map_T2 >::mult mult; - typedef typename map_T::template rt_T< map_T2 >::plus plus; - typedef typename map_T::template rt_T< map_T2 >::minus minus; - typedef typename map_T::template rt_T< map_T2 >::minus2 minus2; - typedef typename map_T::template rt_T< map_T2 >::logic logic; - typedef typename map_T::template rt_T< map_T2 >::div div; - typedef typename map_T::template rt_T< map_T2 >::div2 div2; - }; -} - -namespace ac { - template - struct ac_int_represent { - enum { t_w = ac_private::c_type_params::W, t_s = ac_private::c_type_params::S }; - typedef ac_int type; - }; - template<> struct ac_int_represent {}; - template<> struct ac_int_represent {}; - template - struct ac_int_represent< ac_int > { - typedef ac_int type; - }; -} - -namespace ac_private { - template - struct rt_ac_int_T< ac_int > { - typedef ac_int i2_t; - template - struct op1 { - typedef ac_int i_t; - typedef typename i_t::template rt::mult mult; - typedef typename i_t::template rt::plus plus; - typedef typename i_t::template rt::minus minus; - typedef typename i2_t::template rt::minus minus2; - typedef typename i_t::template rt::logic logic; - typedef typename i_t::template rt::div div; - typedef typename i2_t::template rt::div div2; - typedef typename i_t::template rt::mod mod; - typedef typename i2_t::template rt::mod mod2; - }; - }; - - template - struct rt_ac_int_T< c_type > { - typedef typename ac::ac_int_represent::type i2_t; - enum { W2 = i2_t::width, S2 = i2_t::sign }; - template - struct op1 { - typedef ac_int i_t; - typedef typename i_t::template rt::mult mult; - typedef typename i_t::template rt::plus plus; - typedef typename i_t::template rt::minus minus; - typedef typename i2_t::template rt::minus minus2; - typedef typename i_t::template rt::logic logic; - typedef typename i_t::template rt::div div; - typedef typename i2_t::template rt::div div2; - typedef typename i_t::template rt::mod mod; - typedef typename i2_t::template rt::mod mod2; - }; - }; -} - - -// Specializations for constructors on integers that bypass bit adjusting -// and are therefore more efficient -template<> inline ac_int<1,true>::ac_int( bool b ) { v[0] = b ? 
-1 : 0; } - -template<> inline ac_int<1,false>::ac_int( bool b ) { v[0] = b; } -template<> inline ac_int<1,false>::ac_int( signed char b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( unsigned char b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( signed short b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( unsigned short b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( signed int b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( unsigned int b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( signed long b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( unsigned long b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( Ulong b ) { v[0] = (int) b&1; } -template<> inline ac_int<1,false>::ac_int( Slong b ) { v[0] = (int) b&1; } - -template<> inline ac_int<8,true>::ac_int( bool b ) { v[0] = b; } -template<> inline ac_int<8,false>::ac_int( bool b ) { v[0] = b; } -template<> inline ac_int<8,true>::ac_int( signed char b ) { v[0] = b; } -template<> inline ac_int<8,false>::ac_int( unsigned char b ) { v[0] = b; } -template<> inline ac_int<8,true>::ac_int( unsigned char b ) { v[0] = (signed char) b; } -template<> inline ac_int<8,false>::ac_int( signed char b ) { v[0] = (unsigned char) b; } - -template<> inline ac_int<16,true>::ac_int( bool b ) { v[0] = b; } -template<> inline ac_int<16,false>::ac_int( bool b ) { v[0] = b; } -template<> inline ac_int<16,true>::ac_int( signed char b ) { v[0] = b; } -template<> inline ac_int<16,false>::ac_int( unsigned char b ) { v[0] = b; } -template<> inline ac_int<16,true>::ac_int( unsigned char b ) { v[0] = b; } -template<> inline ac_int<16,false>::ac_int( signed char b ) { v[0] = (unsigned short) b; } -template<> inline ac_int<16,true>::ac_int( signed short b ) { v[0] = b; } -template<> inline ac_int<16,false>::ac_int( unsigned short b ) { v[0] = b; } -template<> inline ac_int<16,true>::ac_int( unsigned short b ) { v[0] = (signed short) b; } -template<> inline ac_int<16,false>::ac_int( signed short b ) { v[0] = (unsigned short) b; } - -template<> inline ac_int<32,true>::ac_int( signed int b ) { v[0] = b; } -template<> inline ac_int<32,true>::ac_int( unsigned int b ) { v[0] = b; } -template<> inline ac_int<32,false>::ac_int( signed int b ) { v[0] = b; v[1] = 0;} -template<> inline ac_int<32,false>::ac_int( unsigned int b ) { v[0] = b; v[1] = 0;} - -template<> inline ac_int<32,true>::ac_int( Slong b ) { v[0] = (int) b; } -template<> inline ac_int<32,true>::ac_int( Ulong b ) { v[0] = (int) b; } -template<> inline ac_int<32,false>::ac_int( Slong b ) { v[0] = (int) b; v[1] = 0;} -template<> inline ac_int<32,false>::ac_int( Ulong b ) { v[0] = (int) b; v[1] = 0;} - -template<> inline ac_int<64,true>::ac_int( Slong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); } -template<> inline ac_int<64,true>::ac_int( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32);} -template<> inline ac_int<64,false>::ac_int( Slong b ) { v[0] = (int) b; v[1] = (int) ((Ulong) b >> 32); v[2] = 0; } -template<> inline ac_int<64,false>::ac_int( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); v[2] = 0; } - -// Stream -------------------------------------------------------------------- - -template -inline std::ostream& operator << (std::ostream &os, const ac_int &x) { -#ifndef __SYNTHESIS__ - if ((os.flags() & std::ios::hex) != 0) { - os << x.to_string(AC_HEX); - } else if ((os.flags() & std::ios::oct) != 0) { - os << x.to_string(AC_OCT); - } else { - os << x.to_string(AC_DEC); - } -#endif 
- return os; -} - -// Macros for Binary Operators with Integers -------------------------------------------- - -#define BIN_OP_WITH_INT(BIN_OP, C_TYPE, WI, SI, RTYPE) \ - template \ - inline typename ac_int::template rt::RTYPE operator BIN_OP ( C_TYPE i_op, const ac_int &op) { \ - return ac_int(i_op).operator BIN_OP (op); \ - } \ - template \ - inline typename ac_int::template rt::RTYPE operator BIN_OP ( const ac_int &op, C_TYPE i_op) { \ - return op.operator BIN_OP (ac_int(i_op)); \ - } - -#define REL_OP_WITH_INT(REL_OP, C_TYPE, W2, S2) \ - template \ - inline bool operator REL_OP ( const ac_int &op, C_TYPE op2) { \ - return op.operator REL_OP (ac_int(op2)); \ - } \ - template \ - inline bool operator REL_OP ( C_TYPE op2, const ac_int &op) { \ - return ac_int(op2).operator REL_OP (op); \ - } - -#define ASSIGN_OP_WITH_INT(ASSIGN_OP, C_TYPE, W2, S2) \ - template \ - inline ac_int &operator ASSIGN_OP ( ac_int &op, C_TYPE op2) { \ - return op.operator ASSIGN_OP (ac_int(op2)); \ - } - -#define OPS_WITH_INT(C_TYPE, WI, SI) \ - BIN_OP_WITH_INT(*, C_TYPE, WI, SI, mult) \ - BIN_OP_WITH_INT(+, C_TYPE, WI, SI, plus) \ - BIN_OP_WITH_INT(-, C_TYPE, WI, SI, minus) \ - BIN_OP_WITH_INT(/, C_TYPE, WI, SI, div) \ - BIN_OP_WITH_INT(%, C_TYPE, WI, SI, mod) \ - BIN_OP_WITH_INT(>>, C_TYPE, WI, SI, arg1) \ - BIN_OP_WITH_INT(<<, C_TYPE, WI, SI, arg1) \ - BIN_OP_WITH_INT(&, C_TYPE, WI, SI, logic) \ - BIN_OP_WITH_INT(|, C_TYPE, WI, SI, logic) \ - BIN_OP_WITH_INT(^, C_TYPE, WI, SI, logic) \ - \ - REL_OP_WITH_INT(==, C_TYPE, WI, SI) \ - REL_OP_WITH_INT(!=, C_TYPE, WI, SI) \ - REL_OP_WITH_INT(>, C_TYPE, WI, SI) \ - REL_OP_WITH_INT(>=, C_TYPE, WI, SI) \ - REL_OP_WITH_INT(<, C_TYPE, WI, SI) \ - REL_OP_WITH_INT(<=, C_TYPE, WI, SI) \ - \ - ASSIGN_OP_WITH_INT(+=, C_TYPE, WI, SI) \ - ASSIGN_OP_WITH_INT(-=, C_TYPE, WI, SI) \ - ASSIGN_OP_WITH_INT(*=, C_TYPE, WI, SI) \ - ASSIGN_OP_WITH_INT(/=, C_TYPE, WI, SI) \ - ASSIGN_OP_WITH_INT(%=, C_TYPE, WI, SI) \ - ASSIGN_OP_WITH_INT(>>=, C_TYPE, WI, SI) \ - ASSIGN_OP_WITH_INT(<<=, C_TYPE, WI, SI) \ - ASSIGN_OP_WITH_INT(&=, C_TYPE, WI, SI) \ - ASSIGN_OP_WITH_INT(|=, C_TYPE, WI, SI) \ - ASSIGN_OP_WITH_INT(^=, C_TYPE, WI, SI) - -// ------------------------------------- End of Macros for Binary Operators with Integers - -// for backward compatability with v3.9.0 and earlier define following macro -#ifdef AC_INT_NS_FOR_MIXED_OPERATORS -namespace ac { - namespace ops_with_other_types { -#endif -// Mixed Operators with Integers ----------------------------------------------- -OPS_WITH_INT(bool, 1, false) -OPS_WITH_INT(char, 8, true) -OPS_WITH_INT(signed char, 8, true) -OPS_WITH_INT(unsigned char, 8, false) -OPS_WITH_INT(short, 16, true) -OPS_WITH_INT(unsigned short, 16, false) -OPS_WITH_INT(int, 32, true) -OPS_WITH_INT(unsigned int, 32, false) -OPS_WITH_INT(long, ac_private::long_w, true) -OPS_WITH_INT(unsigned long, ac_private::long_w, false) -OPS_WITH_INT(Slong, 64, true) -OPS_WITH_INT(Ulong, 64, false) -// ----------------------------------------- End of Mixed Operators with Integers -#ifdef AC_INT_NS_FOR_MIXED_OPERATORS - } // ops_with_other_types namespace -} -using namespace ac::ops_with_other_types; -#endif - -namespace ac { - // Functions to fill bits - - template - inline T bit_fill_hex(const char *str) { - T res; - res.bit_fill_hex(str); - return res; - } - - // returns bit_fill for type - // example: - // ac_int<80,false> x = ac::bit_fill< ac_int<80,false> > ((int [3]) {0xffffa987, 0x6543210f, 0xedcba987 }); - template - inline T bit_fill(const int (&ivec)[N], bool bigendian=true) 
{ - T res; - res.bit_fill(ivec, bigendian); - return res; - } - -} // ac namespace - -// Mixed Operators with Pointers ----------------------------------------------- - -// Addition of ac_int and pointer -template -T *operator +(T *ptr, const ac_int &op2) { - return ptr + op2.to_int64(); -} -template -T *operator +(const ac_int &op2, T *ptr) { - return ptr + op2.to_int64(); -} -// Subtraction of ac_int from pointer -template -T *operator -(T *ptr, const ac_int &op2) { - return ptr - op2.to_int64(); -} -// ----------------------------------------- End of Mixed Operators with Pointers - -namespace ac_intN { - /////////////////////////////////////////////////////////////////////////////// - // Predefined for ease of use - /////////////////////////////////////////////////////////////////////////////// - typedef ac_int<1, true> int1; - typedef ac_int<1, false> uint1; - typedef ac_int<2, true> int2; - typedef ac_int<2, false> uint2; - typedef ac_int<3, true> int3; - typedef ac_int<3, false> uint3; - typedef ac_int<4, true> int4; - typedef ac_int<4, false> uint4; - typedef ac_int<5, true> int5; - typedef ac_int<5, false> uint5; - typedef ac_int<6, true> int6; - typedef ac_int<6, false> uint6; - typedef ac_int<7, true> int7; - typedef ac_int<7, false> uint7; - typedef ac_int<8, true> int8; - typedef ac_int<8, false> uint8; - typedef ac_int<9, true> int9; - typedef ac_int<9, false> uint9; - typedef ac_int<10, true> int10; - typedef ac_int<10, false> uint10; - typedef ac_int<11, true> int11; - typedef ac_int<11, false> uint11; - typedef ac_int<12, true> int12; - typedef ac_int<12, false> uint12; - typedef ac_int<13, true> int13; - typedef ac_int<13, false> uint13; - typedef ac_int<14, true> int14; - typedef ac_int<14, false> uint14; - typedef ac_int<15, true> int15; - typedef ac_int<15, false> uint15; - typedef ac_int<16, true> int16; - typedef ac_int<16, false> uint16; - typedef ac_int<17, true> int17; - typedef ac_int<17, false> uint17; - typedef ac_int<18, true> int18; - typedef ac_int<18, false> uint18; - typedef ac_int<19, true> int19; - typedef ac_int<19, false> uint19; - typedef ac_int<20, true> int20; - typedef ac_int<20, false> uint20; - typedef ac_int<21, true> int21; - typedef ac_int<21, false> uint21; - typedef ac_int<22, true> int22; - typedef ac_int<22, false> uint22; - typedef ac_int<23, true> int23; - typedef ac_int<23, false> uint23; - typedef ac_int<24, true> int24; - typedef ac_int<24, false> uint24; - typedef ac_int<25, true> int25; - typedef ac_int<25, false> uint25; - typedef ac_int<26, true> int26; - typedef ac_int<26, false> uint26; - typedef ac_int<27, true> int27; - typedef ac_int<27, false> uint27; - typedef ac_int<28, true> int28; - typedef ac_int<28, false> uint28; - typedef ac_int<29, true> int29; - typedef ac_int<29, false> uint29; - typedef ac_int<30, true> int30; - typedef ac_int<30, false> uint30; - typedef ac_int<31, true> int31; - typedef ac_int<31, false> uint31; - typedef ac_int<32, true> int32; - typedef ac_int<32, false> uint32; - typedef ac_int<33, true> int33; - typedef ac_int<33, false> uint33; - typedef ac_int<34, true> int34; - typedef ac_int<34, false> uint34; - typedef ac_int<35, true> int35; - typedef ac_int<35, false> uint35; - typedef ac_int<36, true> int36; - typedef ac_int<36, false> uint36; - typedef ac_int<37, true> int37; - typedef ac_int<37, false> uint37; - typedef ac_int<38, true> int38; - typedef ac_int<38, false> uint38; - typedef ac_int<39, true> int39; - typedef ac_int<39, false> uint39; - typedef ac_int<40, true> int40; - typedef 
ac_int<40, false> uint40;
-  typedef ac_int<41, true> int41;
-  typedef ac_int<41, false> uint41;
-  typedef ac_int<42, true> int42;
-  typedef ac_int<42, false> uint42;
-  typedef ac_int<43, true> int43;
-  typedef ac_int<43, false> uint43;
-  typedef ac_int<44, true> int44;
-  typedef ac_int<44, false> uint44;
-  typedef ac_int<45, true> int45;
-  typedef ac_int<45, false> uint45;
-  typedef ac_int<46, true> int46;
-  typedef ac_int<46, false> uint46;
-  typedef ac_int<47, true> int47;
-  typedef ac_int<47, false> uint47;
-  typedef ac_int<48, true> int48;
-  typedef ac_int<48, false> uint48;
-  typedef ac_int<49, true> int49;
-  typedef ac_int<49, false> uint49;
-  typedef ac_int<50, true> int50;
-  typedef ac_int<50, false> uint50;
-  typedef ac_int<51, true> int51;
-  typedef ac_int<51, false> uint51;
-  typedef ac_int<52, true> int52;
-  typedef ac_int<52, false> uint52;
-  typedef ac_int<53, true> int53;
-  typedef ac_int<53, false> uint53;
-  typedef ac_int<54, true> int54;
-  typedef ac_int<54, false> uint54;
-  typedef ac_int<55, true> int55;
-  typedef ac_int<55, false> uint55;
-  typedef ac_int<56, true> int56;
-  typedef ac_int<56, false> uint56;
-  typedef ac_int<57, true> int57;
-  typedef ac_int<57, false> uint57;
-  typedef ac_int<58, true> int58;
-  typedef ac_int<58, false> uint58;
-  typedef ac_int<59, true> int59;
-  typedef ac_int<59, false> uint59;
-  typedef ac_int<60, true> int60;
-  typedef ac_int<60, false> uint60;
-  typedef ac_int<61, true> int61;
-  typedef ac_int<61, false> uint61;
-  typedef ac_int<62, true> int62;
-  typedef ac_int<62, false> uint62;
-  typedef ac_int<63, true> int63;
-  typedef ac_int<63, false> uint63;
-} // namespace ac_intN
-
-#ifndef AC_NOT_USING_INTN
-using namespace ac_intN;
-#endif
-
-///////////////////////////////////////////////////////////////////////////////
-
-#if (defined(_MSC_VER) && !defined(__EDG__))
-#pragma warning( disable: 4700 )
-#endif
-#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__))
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wuninitialized"
-#endif
-#if defined(__clang__)
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wuninitialized"
-#endif
-
-// Global templatized functions for easy initialization to special values
-template<ac_special_val V, int W, bool S>
-inline ac_int<W,S> value(ac_int<W,S>) {
-  ac_int<W,S> r;
-  return r.template set_val<V>();
-}
-// forward declaration, otherwise GCC errors when calling init_array
-template<ac_special_val V, int W, int I, bool S, ac_q_mode Q, ac_o_mode O>
-inline ac_fixed<W,I,S,Q,O> value(ac_fixed<W,I,S,Q,O>);
-
-#define SPECIAL_VAL_FOR_INTS_DC(C_TYPE, WI, SI) \
-template<> inline C_TYPE value<AC_VAL_DC>(C_TYPE) { C_TYPE x; return x; }
-
-// -- C int types -----------------------------------------------------------------
-#define SPECIAL_VAL_FOR_INTS(C_TYPE, WI, SI) \
-template<ac_special_val val> inline C_TYPE value(C_TYPE); \
-template<> inline C_TYPE value<AC_VAL_0>(C_TYPE) { return (C_TYPE)0; } \
-SPECIAL_VAL_FOR_INTS_DC(C_TYPE, WI, SI) \
-template<> inline C_TYPE value<AC_VAL_QUANTUM>(C_TYPE) { return (C_TYPE)1; } \
-template<> inline C_TYPE value<AC_VAL_MAX>(C_TYPE) { return (C_TYPE)(SI ? ~(((C_TYPE) 1) << (WI-1)) : (C_TYPE) -1); } \
-template<> inline C_TYPE value<AC_VAL_MIN>(C_TYPE) { return (C_TYPE)(SI ?
((C_TYPE) 1) << (WI-1) : (C_TYPE) 0); }
-
-SPECIAL_VAL_FOR_INTS(bool, 1, false)
-SPECIAL_VAL_FOR_INTS(char, 8, true)
-SPECIAL_VAL_FOR_INTS(signed char, 8, true)
-SPECIAL_VAL_FOR_INTS(unsigned char, 8, false)
-SPECIAL_VAL_FOR_INTS(short, 16, true)
-SPECIAL_VAL_FOR_INTS(unsigned short, 16, false)
-SPECIAL_VAL_FOR_INTS(int, 32, true)
-SPECIAL_VAL_FOR_INTS(unsigned int, 32, false)
-SPECIAL_VAL_FOR_INTS(long, ac_private::long_w, true)
-SPECIAL_VAL_FOR_INTS(unsigned long, ac_private::long_w, false)
-SPECIAL_VAL_FOR_INTS(Slong, 64, true)
-SPECIAL_VAL_FOR_INTS(Ulong, 64, false)
-
-#define INIT_ARRAY_SPECIAL_VAL_FOR_INTS(C_TYPE) \
-  template<ac_special_val V> \
-  inline bool init_array(C_TYPE *a, int n) { \
-    C_TYPE t = value<V>((C_TYPE) 0); \
-    for(int i=0; i < n; i++) \
-      a[i] = t; \
-    return true; \
-  }
-
-namespace ac {
-// PUBLIC FUNCTIONS
-// function to initialize (or uninitialize) arrays
-  template<ac_special_val V, int W, bool S>
-  inline bool init_array(ac_int<W,S> *a, int n) {
-    ac_int<W,S> t;
-    t.template set_val<V>();
-    for(int i=0; i < n; i++)
-      a[i] = t;
-    return true;
-  }
-
-  INIT_ARRAY_SPECIAL_VAL_FOR_INTS(bool)
-  INIT_ARRAY_SPECIAL_VAL_FOR_INTS(char)
-  INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed char)
-  INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned char)
-  INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed short)
-  INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned short)
-  INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed int)
-  INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned int)
-  INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed long)
-  INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned long)
-  INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed long long)
-  INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned long long)
-}
-
-#if (defined(_MSC_VER) && !defined(__EDG__))
-#pragma warning( pop )
-#endif
-#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__))
-#pragma GCC diagnostic pop
-#endif
-#if defined(__clang__)
-#pragma clang diagnostic pop
-#endif
-
-#ifdef __AC_NAMESPACE
-}
-#endif
-
-#endif // __AC_INT_H
+/**************************************************************************
+ *                                                                        *
+ *  Algorithmic C (tm) Datatypes                                          *
+ *                                                                        *
+ *  Software Version: 4.0                                                 *
+ *                                                                        *
+ *  Release Date    : Sat Jun 13 12:35:18 PDT 2020                        *
+ *  Release Type    : Production Release                                  *
+ *  Release Build   : 4.0.0                                               *
+ *                                                                        *
+ *  Copyright 2004-2020, Mentor Graphics Corporation,                     *
+ *                                                                        *
+ *  All Rights Reserved.                                                  *
+ *                                                                        *
+ **************************************************************************
+ *  Licensed under the Apache License, Version 2.0 (the "License");       *
+ *  you may not use this file except in compliance with the License.      *
+ *  You may obtain a copy of the License at                               *
+ *                                                                        *
+ *      http://www.apache.org/licenses/LICENSE-2.0                        *
+ *                                                                        *
+ *  Unless required by applicable law or agreed to in writing, software   *
+ *  distributed under the License is distributed on an "AS IS" BASIS,     *
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *
+ *  implied.                                                              *
+ *  See the License for the specific language governing permissions and   *
+ *  limitations under the License.                                        *
+ **************************************************************************
+ *                                                                        *
+ *  The most recent version of this package is available at github.      *
+ *                                                                        *
+ *************************************************************************/
+
+/*
+// Source: ac_int.h
+// Description: fast arbitrary-length bit-accurate integer types:
+//   - unsigned integer of length W: ac_int<W,false>
+//   - signed integer of length W: ac_int<W,true>
+// Author: Andres Takach, Ph.D.
+// Notes:
+//  - C++ Runtime: important to use optimization flag (for example -O3)
+//
+//  - Compiler support: recent GNU compilers are required for correct
+//    template compilation
+//
+//  - Most frequent migration issues:
+//     - need to cast to common type when using question mark operator:
+//         (a < 0) ? -a : a;  // a is ac_int<W,true>
+//       change to:
+//         (a < 0) ? -a : (ac_int<W+1,true>) a;
+//       or
+//         (a < 0) ? (ac_int<W+1,false>) -a : (ac_int<W+1,false>) a;
+//
+//     - left shift is not arithmetic ("a<<n" has same bitwidth as "a")
+//         ac_int<W+1,false> b = a << 1;  // a is ac_int<W,false>
+//       is not equivalent to b=2*a. In order to get 2*a behavior change to:
+//         ac_int<W+1,false> b = (ac_int<W+1,false>)a << 1;
+//
+//     - only static length read/write slices are supported:
+//        - read:  x.slc<4>(k) => returns ac_int<4,false> for 4-bit slice x(4+k-1 DOWNTO k)
+//        - write: x.set_slc(k,y) = writes bits of y to x starting at index k
+*/
+
+#ifndef __AC_INT_H
+#define __AC_INT_H
+
+#define AC_VERSION 3
+#define AC_VERSION_MINOR 9
+
+#ifndef __cplusplus
+#error C++ is required to include this header file
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ < 3 && !defined(__EDG__))
+#error GCC version 3 or greater is required to include this header file
+#endif
+
+#if (defined(_MSC_VER) && _MSC_VER < 1400 && !defined(__EDG__))
+#error Microsoft Visual Studio 8 or newer is required to include this header file
+#endif
+
+#if (defined(_MSC_VER) && !defined(__EDG__))
+#pragma warning( push )
+#pragma warning( disable: 4127 4100 4244 4307 4310 4365 4514 4554 4706 4800 )
+#endif
+
+// for safety
+#if (defined(N) || defined(N2))
+#error One or more of the following is defined: N, N2. Definition conflicts with their usage as template parameters.
+#error DO NOT use defines before including third party header files.
+#endif
+
+// for safety
+#if (defined(W) || defined(I) || defined(S) || defined(W2) || defined(I2) || defined(S2))
+#error One or more of the following is defined: W, I, S, W2, I2, S2. Definition conflicts with their usage as template parameters.
+#error DO NOT use defines before including third party header files.
+#endif
+
+#if defined(true)
+#warning The C++ keyword true is defined which may result in subtle compilation problems. Undefining it.
+#undef true
+#endif
+#if defined(false)
+#warning The C++ keyword false is defined which may result in subtle compilation problems. Undefining it.
+#undef false
+#endif
+
+#ifndef __ASSERT_H__
+#define __ASSERT_H__
+#include <assert.h>
+#endif
+#include <limits>
+#ifndef AC_USER_DEFINED_ASSERT
+#include <iostream>
+#else
+#include <ostream>
+#endif
+#include <math.h>
+#include <string>
+
+#ifndef __SYNTHESIS__
+#ifndef __AC_INT_UTILITY_BASE
+#define __AC_INT_UTILITY_BASE
+#endif
+
+#endif
+
+#ifdef __AC_NAMESPACE
+namespace __AC_NAMESPACE {
+#endif
+
+#define AC_MAX(a,b) ((a) > (b) ? (a) : (b))
+#define AC_MIN(a,b) ((a) < (b) ? (a) : (b))
+#define AC_ABS(a) ((a) < 0 ?
-(a) : (a)) + +#if defined(_MSC_VER) +typedef unsigned __int64 Ulong; +typedef signed __int64 Slong; +#else +typedef unsigned long long Ulong; +typedef signed long long Slong; +#endif + +enum ac_base_mode { AC_BIN=2, AC_OCT=8, AC_DEC=10, AC_HEX=16 }; +enum ac_special_val {AC_VAL_DC, AC_VAL_0, AC_VAL_MIN, AC_VAL_MAX, AC_VAL_QUANTUM}; + +template class ac_int; + +namespace ac_private { +#if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) +#pragma builtin +#endif + + enum {long_w = std::numeric_limits::digits}; + const unsigned int all_ones = (unsigned) ~0; + + // PRIVATE FUNCTIONS in namespace: for implementing ac_int/ac_fixed + +#ifndef __SYNTHESIS__ + inline double mgc_floor(double d) { return floor(d); } +#else + inline double mgc_floor(double d) { return 0.0; } +#endif + + #define AC_ASSERT(cond, msg) ac_private::ac_assert(cond, __FILE__, __LINE__, msg) + inline void ac_assert(bool condition, const char *file=0, int line=0, const char *msg=0) { + #ifndef __SYNTHESIS__ + #ifndef AC_USER_DEFINED_ASSERT + if(!condition) { + std::cerr << "Assert"; + if(file) + std::cerr << " in file " << file << ":" << line; + if(msg) + std::cerr << " " << msg; + std::cerr << std::endl; + assert(0); + } + #else + AC_USER_DEFINED_ASSERT(condition, file, line, msg); + #endif + #endif + } + + // helper structs for statically computing log2 like functions (nbits, log2_floor, log2_ceil) + // using recursive templates + template + struct s_N { + template + struct s_X { + enum { + X2 = X >> N, + N_div_2 = N >> 1, + nbits = X ? (X2 ? N + (int) s_N::template s_X::nbits : (int) s_N::template s_X::nbits) : 0 + }; + }; + }; + template<> struct s_N<0> { + template + struct s_X { + enum {nbits = !!X }; + }; + }; + + template + inline double ldexpr32(double d) { + double d2 = d; + if(N < 0) + for(int i=0; i < -N; i++) + d2 /= (Ulong) 1 << 32; + else + for(int i=0; i < N; i++) + d2 *= (Ulong) 1 << 32; + return d2; + } + template<> inline double ldexpr32<0>(double d) { return d; } + template<> inline double ldexpr32<1>(double d) { return d * ((Ulong) 1 << 32); } + template<> inline double ldexpr32<-1>(double d) { return d / ((Ulong) 1 << 32); } + template<> inline double ldexpr32<2>(double d) { return (d * ((Ulong) 1 << 32)) * ((Ulong) 1 << 32); } + template<> inline double ldexpr32<-2>(double d) { return (d / ((Ulong) 1 << 32)) / ((Ulong) 1 << 32); } + + template + inline double ldexpr(double d) { + return ldexpr32( N < 0 ? 
d/( (unsigned) 1 << (-N & 31)) : d * ( (unsigned) 1 << (N & 31))); + } + + template + inline void iv_copy(const int *op, int *r) { + for(int i=0; i < N; i++) + r[i] = op[i]; + } + template<> inline void iv_copy<1>(const int *op, int *r) { + r[0] = op[0]; + } + template<> inline void iv_copy<2>(const int *op, int *r) { + r[0] = op[0]; + r[1] = op[1]; + } + + template + inline bool iv_equal_zero(const int *op){ + for(int i=0; i < N; i++) + if(op[i]) + return false; + return true; + } + template<> inline bool iv_equal_zero<0>(const int * /*op*/) { return true; } + template<> inline bool iv_equal_zero<1>(const int *op) { + return !op[0]; + } + template<> inline bool iv_equal_zero<2>(const int *op) { + return !(op[0] || op[1]); + } + + template + inline bool iv_equal_ones(const int *op){ + for(int i=0; i < N; i++) + if(~op[i]) + return false; + return true; + } + template<> inline bool iv_equal_ones<0>(const int * /*op*/) { return true; } + template<> inline bool iv_equal_ones<1>(const int *op) { + return !~op[0]; + } + template<> inline bool iv_equal_ones<2>(const int *op) { + return !(~op[0] || ~op[1]); + } + + template + inline bool iv_equal(const int *op1, const int *op2){ + const int M1 = AC_MAX(N1,N2); + const int M2 = AC_MIN(N1,N2); + const int *OP1 = N1 >= N2 ? op1 : op2; + const int *OP2 = N1 >= N2 ? op2 : op1; + for(int i=0; i < M2; i++) + if(OP1[i] != OP2[i]) + return false; + int ext = OP2[M2-1] < 0 ? ~0 : 0; + for(int i=M2; i < M1; i++) + if(OP1[i] != ext) + return false; + return true; + } + template<> inline bool iv_equal<1,1>(const int *op1, const int *op2) { + return op1[0] == op2[0]; + } + + template + inline bool iv_equal_ones_from(const int *op){ + if((B >= 32*N && op[N-1] >= 0) || (B&31 && ~(op[B/32] >> (B&31)))) + return false; + return iv_equal_ones(&op[(B+31)/32]); + } + template<> inline bool iv_equal_ones_from<0,1>(const int *op){ + return iv_equal_ones<1>(op); + } + template<> inline bool iv_equal_ones_from<0,2>(const int *op){ + return iv_equal_ones<2>(op); + } + + template + inline bool iv_equal_zeros_from(const int *op){ + if((B >= 32*N && op[N-1] < 0) || (B&31 && (op[B/32] >> (B&31)))) + return false; + return iv_equal_zero(&op[(B+31)/32]); + } + template<> inline bool iv_equal_zeros_from<0,1>(const int *op){ + return iv_equal_zero<1>(op); + } + template<> inline bool iv_equal_zeros_from<0,2>(const int *op){ + return iv_equal_zero<2>(op); + } + + template + inline bool iv_equal_ones_to(const int *op){ + if((B >= 32*N && op[N-1] >= 0) || (B&31 && ~(op[B/32] | (all_ones << (B&31))))) + return false; + return iv_equal_ones(op); + } + template<> inline bool iv_equal_ones_to<0,1>(const int *op){ + return iv_equal_ones<1>(op); + } + template<> inline bool iv_equal_ones_to<0,2>(const int *op){ + return iv_equal_ones<2>(op); + } + + template + inline bool iv_equal_zeros_to(const int *op){ + if((B >= 32*N && op[N-1] < 0) || (B&31 && (op[B/32] & ~(all_ones << (B&31))))) + return false; + return iv_equal_zero(op); + } + template<> inline bool iv_equal_zeros_to<0,1>(const int *op){ + return iv_equal_zero<1>(op); + } + template<> inline bool iv_equal_zeros_to<0,2>(const int *op){ + return iv_equal_zero<2>(op); + } + + template + inline bool iv_compare(const int *op1, const int *op2){ + const int M1 = AC_MAX(N1,N2); + const int M2 = AC_MIN(N1,N2); + const int *OP1 = N1 >= N2 ? op1 : op2; + const int *OP2 = N1 >= N2 ? op2 : op1; + const bool b = (N1 >= N2) == greater; + int ext = OP2[M2-1] < 0 ? ~0 : 0; + int i2 = M1 > M2 ? 
ext : OP2[M1-1]; + if(OP1[M1-1] != i2) + return b ^ (OP1[M1-1] < i2); + for(int i=M1-2; i >= M2; i--) { + if((unsigned) OP1[i] != (unsigned) ext) + return b ^ ((unsigned) OP1[i] < (unsigned) ext); + } + for(int i=M2-1; i >= 0; i--) { + if((unsigned) OP1[i] != (unsigned) OP2[i]) + return b ^ ((unsigned) OP1[i] < (unsigned) OP2[i]); + } + return false; + } + template<> inline bool iv_compare<1,1,true>(const int *op1, const int *op2) { + return op1[0] > op2[0]; + } + template<> inline bool iv_compare<1,1,false>(const int *op1, const int *op2) { + return op1[0] < op2[0]; + } + + template + inline void iv_extend(int *r, int ext) { + for(int i=0; i < N; i++) + r[i] = ext; + } + template<> inline void iv_extend<-2>(int * /*r*/, int /*ext*/) { } + template<> inline void iv_extend<-1>(int * /*r*/, int /*ext*/) { } + template<> inline void iv_extend<0>(int * /*r*/, int /*ext*/) { } + template<> inline void iv_extend<1>(int *r, int ext) { + r[0] = ext; + } + template<> inline void iv_extend<2>(int *r, int ext) { + r[0] = ext; + r[1] = ext; + } + + template + inline void iv_assign_int64(int *r, Slong l) { + r[0] = (int) l; + if(Nr > 1) { + r[1] = (int) (l >> 32); + iv_extend(r+2, (r[1] < 0) ? ~0 : 0); + } + } + template<> inline void iv_assign_int64<1>(int *r, Slong l) { + r[0] = (int) l; + } + template<> inline void iv_assign_int64<2>(int *r, Slong l) { + r[0] = (int) l; + r[1] = (int) (l >> 32); + } + + template + inline void iv_assign_uint64(int *r, Ulong l) { + r[0] = (int) l; + if(Nr > 1) { + r[1] = (int) (l >> 32); + iv_extend(r+2, 0); + } + } + template<> inline void iv_assign_uint64<1>(int *r, Ulong l) { + r[0] = (int) l; + } + template<> inline void iv_assign_uint64<2>(int *r, Ulong l) { + r[0] = (int) l; + r[1] = (int) (l >> 32); + } + + inline Ulong mult_u_u(int a, int b) { + return (Ulong) (unsigned) a * (Ulong) (unsigned) b; + } + inline Slong mult_u_s(int a, int b) { + return (Ulong) (unsigned) a * (Slong) (signed) b; + } + inline Slong mult_s_u(int a, int b) { + return (Slong) (signed) a * (Ulong) (unsigned) b; + } + inline Slong mult_s_s(int a, int b) { + return (Slong) (signed) a * (Slong) (signed) b; + } + inline void accumulate(Ulong a, Ulong &l1, Slong &l2) { + l1 += (Ulong) (unsigned) a; + l2 += a >> 32; + } + inline void accumulate(Slong a, Ulong &l1, Slong &l2) { + l1 += (Ulong) (unsigned) a; + l2 += a >> 32; + } + + template + inline void iv_mult(const int *op1, const int *op2, int *r) { + if(Nr==1) + r[0] = op1[0] * op2[0]; + else if(N1==1 && N2==1) + iv_assign_int64(r, ((Slong) op1[0]) * ((Slong) op2[0])); + else { + const int M1 = AC_MAX(N1,N2); + const int M2 = AC_MIN(N1,N2); + const int *OP1 = N1 >= N2 ? op1 : op2; + const int *OP2 = N1 >= N2 ? 
op2 : op1; + const int T1 = AC_MIN(M2-1,Nr); + const int T2 = AC_MIN(M1-1,Nr); + const int T3 = AC_MIN(M1+M2-2,Nr); + + Ulong l1 = 0; + Slong l2 = 0; + for(int k=0; k < T1; k++) { + for(int i=0; i < k+1; i++) + accumulate(mult_u_u(OP1[k-i], OP2[i]), l1, l2); + l2 += (Ulong) (unsigned) (l1 >> 32); + r[k] = (int) l1; + l1 = (unsigned) l2; + l2 >>= 32; + } + for(int k=T1; k < T2; k++) { + accumulate(mult_u_s(OP1[k-M2+1], OP2[M2-1]), l1, l2); + for(int i=0; i < M2-1; i++) + accumulate(mult_u_u(OP1[k-i], OP2[i]), l1, l2); + l2 += (Ulong) (unsigned) (l1 >> 32); + r[k] = (int) l1; + l1 = (unsigned) l2; + l2 >>= 32; + } + for(int k=T2; k < T3; k++) { + accumulate(mult_u_s(OP1[k-M2+1], OP2[M2-1]), l1, l2); + for(int i=k-T2+1; i < M2-1; i++) + accumulate(mult_u_u(OP1[k-i], OP2[i]), l1, l2); + accumulate(mult_s_u(OP1[M1-1], OP2[k-M1+1]), l1, l2); + l2 += (Ulong) (unsigned) (l1 >> 32); + r[k] = (int) l1; + l1 = (unsigned) l2; + l2 >>= 32; + } + if(Nr >= M1+M2-1) { + accumulate(mult_s_s(OP1[M1-1], OP2[M2-1]), l1, l2); + r[M1+M2-2] = (int) l1; + if(Nr >= M1+M2) { + l2 += (Ulong) (unsigned) (l1 >> 32); + r[M1+M2-1] = (int) l2; + iv_extend(r+M1+M2, (r[M1+M2-1] < 0) ? ~0 : 0); + } + } + } + } + template<> inline void iv_mult<1,1,1>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] * op2[0]; + } + template<> inline void iv_mult<1,1,2>(const int *op1, const int *op2, int *r) { + iv_assign_int64<2>(r, ((Slong) op1[0]) * ((Slong) op2[0])); + } + + template + inline bool iv_uadd_carry(const int *op1, bool carry, int *r) { + Slong l = carry; + for(int i=0; i < N; i++) { + l += (Ulong) (unsigned) op1[i]; + r[i] = (int) l; + l >>= 32; + } + return l != 0; + } + template<> inline bool iv_uadd_carry<0>(const int * /*op1*/, bool carry, int * /*r*/) { return carry; } + template<> inline bool iv_uadd_carry<1>(const int *op1, bool carry, int *r) { + Ulong l = carry + (Ulong) (unsigned) op1[0]; + r[0] = (int) l; + return (l >> 32) & 1; + } + + template + inline bool iv_add_int_carry(const int *op1, int op2, bool carry, int *r) { + if(N==0) + return carry; + if(N==1) { + Ulong l = carry + (Slong) op1[0] + (Slong) op2; + r[0] = (int) l; + return (l >> 32) & 1; + } + Slong l = carry + (Ulong) (unsigned) op1[0] + (Slong) op2; + r[0] = (int) l; + l >>= 32; + for(int i=1; i < N-1; i++) { + l += (Ulong) (unsigned) op1[i]; + r[i] = (int) l; + l >>= 32; + } + l += (Slong) op1[N-1]; + r[N-1] = (int) l; + return (l >> 32) & 1; + } + template<> inline bool iv_add_int_carry<0>(const int * /*op1*/, int /*op2*/, bool carry, int * /*r*/) { return carry; } + template<> inline bool iv_add_int_carry<1>(const int *op1, int op2, bool carry, int *r) { + Ulong l = carry + (Slong) op1[0] + (Slong) op2; + r[0] = (int) l; + return (l >> 32) & 1; + } + + template + inline bool iv_uadd_n(const int *op1, const int *op2, int *r) { + Ulong l = 0; + for(int i=0; i < N; i++) { + l += (Ulong)(unsigned) op1[i] + (Ulong)(unsigned) op2[i]; + r[i] = (int) l; + l >>= 32; + } + return l & 1; + } + template<> inline bool iv_uadd_n<0>(const int * /*op1*/, const int * /*op2*/, int * /*r*/) { return false; } + template<> inline bool iv_uadd_n<1>(const int *op1, const int *op2, int *r) { + Ulong l = (Ulong) (unsigned) op1[0] + (Ulong) (unsigned) op2[0]; + r[0] = (int) l; + return (l >> 32) & 1; + } + template<> inline bool iv_uadd_n<2>(const int *op1, const int *op2, int *r) { + Ulong l = (Ulong) (unsigned) op1[0] + (Ulong) (unsigned) op2[0]; + r[0] = (int) l; + l >>= 32; + l += (Ulong) (unsigned) op1[1] + (Ulong) (unsigned) op2[1]; + r[1] = (int) l; + 
return (l >> 32) & 1; + } + + template + inline void iv_add(const int *op1, const int *op2, int *r) { + if(Nr==1) + r[0] = op1[0] + op2[0]; + else { + const int M1 = AC_MAX(N1,N2); + const int M2 = AC_MIN(N1,N2); + const int *OP1 = N1 >= N2 ? op1 : op2; + const int *OP2 = N1 >= N2 ? op2 : op1; + const int T1 = AC_MIN(M2-1,Nr); + const int T2 = AC_MIN(M1,Nr); + + bool carry = iv_uadd_n(OP1, OP2, r); + carry = iv_add_int_carry(OP1+T1, OP2[T1], carry, r+T1); + iv_extend(r+T2, carry ? ~0 : 0); + } + } + template<> inline void iv_add<1,1,1>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] + op2[0]; + } + template<> inline void iv_add<1,1,2>(const int *op1, const int *op2, int *r) { + iv_assign_int64<2>(r, (Slong) op1[0] + (Slong) op2[0]); + } + + template + inline bool iv_sub_int_borrow(const int *op1, int op2, bool borrow, int *r) { + if(N==1) { + Ulong l = (Slong) op1[0] - (Slong) op2 - borrow; + r[0] = (int) l; + return (l >> 32) & 1; + } + Slong l = (Ulong) (unsigned) op1[0] - (Slong) op2 - borrow; + r[0] = (int) l; + l >>= 32; + for(int i=1; i < N-1; i++) { + l += (Ulong) (unsigned) op1[i]; + r[i] = (int) l; + l >>= 32; + } + l += (Slong) op1[N-1]; + r[N-1] = (int) l; + return (l >> 32) & 1; + } + template<> inline bool iv_sub_int_borrow<0>(const int * /*op1*/, int /*op2*/, bool borrow, int * /*r*/) { return borrow; } + template<> inline bool iv_sub_int_borrow<1>(const int *op1, int op2, bool borrow, int *r) { + Ulong l = (Slong) op1[0] - (Slong) op2 - borrow; + r[0] = (int) l; + return (l >> 32) & 1; + } + + template + inline bool iv_sub_int_borrow(int op1, const int *op2, bool borrow, int *r) { + if(N==1) { + Ulong l = (Slong) op1 - (Slong) op2[0] - borrow; + r[0] = (int) l; + return (l >> 32) & 1; + } + Slong l = (Slong) op1 - (Ulong) (unsigned) op2[0] - borrow; + r[0] = (int) l; + l >>= 32; + for(int i=1; i < N-1; i++) { + l -= (Ulong) (unsigned) op2[i]; + r[i] = (int) l; + l >>= 32; + } + l -= (Slong) op2[N-1]; + r[N-1] = (int) l; + return (l >> 32) & 1; + } + template<> inline bool iv_sub_int_borrow<0>(int /*op1*/, const int * /*op2*/, bool borrow, int * /*r*/) { return borrow; } + template<> inline bool iv_sub_int_borrow<1>(int op1, const int *op2, bool borrow, int *r) { + Ulong l = (Slong) op1 - (Slong) op2[0] - borrow; + r[0] = (int) l; + return (l >> 32) & 1; + } + + template + inline bool iv_usub_n(const int *op1, const int *op2, int *r) { + Slong l = 0; + for(int i=0; i < N; i++) { + l += (Ulong)(unsigned) op1[i] - (Ulong)(unsigned) op2[i]; + r[i] = (int) l; + l >>= 32; + } + return l & 1; + } + template<> inline bool iv_usub_n<1>(const int *op1, const int *op2, int *r) { + Ulong l = (Ulong) (unsigned) op1[0] - (Ulong) (unsigned) op2[0]; + r[0] = (int) l; + return (l >> 32) & 1; + } + template<> inline bool iv_usub_n<2>(const int *op1, const int *op2, int *r) { + Slong l = (Ulong) (unsigned) op1[0] - (Ulong) (unsigned) op2[0]; + r[0] = (int) l; + l >>= 32; + l += (Ulong) (unsigned) op1[1] - (Ulong) (unsigned) op2[1]; + r[1] = (int) l; + return (l >> 32) & 1; + } + + template + inline void iv_sub(const int *op1, const int *op2, int *r) { + if(Nr==1) + r[0] = op1[0] - op2[0]; + else { + const int M1 = AC_MAX(N1,N2); + const int M2 = AC_MIN(N1,N2); + const int T1 = AC_MIN(M2-1,Nr); + const int T2 = AC_MIN(M1,Nr); + bool borrow = iv_usub_n(op1, op2, r); + if(N1 > N2) + borrow = iv_sub_int_borrow(op1+T1, op2[T1], borrow, r+T1); + else + borrow = iv_sub_int_borrow(op1[T1], op2+T1, borrow, r+T1); + iv_extend(r+T2, borrow ? 
~0 : 0); + } + } + template<> inline void iv_sub<1,1,1>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] - op2[0]; + } + template<> inline void iv_sub<1,1,2>(const int *op1, const int *op2, int *r) { + iv_assign_int64<2>(r, (Slong) op1[0] - (Slong) op2[0]); + } + + template + inline bool iv_all_bits_same(const int *op, bool bit) { + int t = bit ? ~0 : 0; + for(int i=0; i < N; i++) + if(op[i] != t) + return false; + return true; + } + template<> inline bool iv_all_bits_same<0>(const int * /*op*/, bool /*bit*/) { return true; } + template<> inline bool iv_all_bits_same<1>(const int *op, bool bit) { + return op[0] == (bit ? ~0 : 0); + } + + template + void iv_neg(const int *op1, int *r) { + Slong l = 0; + for(int k = 0; k < AC_MIN(N,Nr); k++) { + l -= (Ulong) (unsigned) op1[k]; + r[k] = (unsigned) l; + l >>= 32; + } + if(Nr > N) { + r[N] = (unsigned) (l - (op1[N-1] < 0 ? ~0 : 0)); + iv_extend(r+N+1, r[N] < 0 ? ~0 : 0); + } + } + + template + void iv_abs(const int *op1, int *r) { + if( S && op1[N-1] < 0) { + iv_neg(op1, r); + } else { + iv_copy(op1, r); + iv_extend(r+N, 0); + } + } + + template + void iv_udiv(const sw2 *n, const sw2 *d, sw2 *q, sw2 *r) { + const int w2_length = 2*w1_length; + int d_msi; // most significant int for d + for(d_msi = D-1; d_msi > 0 && !d[d_msi]; d_msi--) {} + uw4 d1 = 0; + if(!d_msi && !d[0]) { + d1 = n[0]/d[0]; // d is zero => divide by zero + return; + } + int n_msi; // most significant int for n + for(n_msi = N-1; n_msi > 0 && !n[n_msi]; n_msi--) {} + for(int i=0; i < Q; i++) + q[i] = 0; + for(int i=0; i < R; i++) + r[i] = n[i]; + // write most significant "words" into d1 + bool d_mss_odd = (bool) (d[d_msi] >> w1_length); + int d_mss= 2*d_msi + d_mss_odd; // index to most significant short (16-bit) + d1 = (uw4) (uw2) d[d_msi] << (w1_length << (int) !d_mss_odd); + if(d_msi) + d1 |= (uw2) d[d_msi-1] >> (d_mss_odd ? w1_length : 0); + bool n_mss_odd = (bool) (n[n_msi] >> w1_length); + int n_mss = 2*n_msi + n_mss_odd; + if(n_mss < d_mss) { + // q already initialized to 0 + if(R) { + int r_msi = AC_MIN(R-1, n_msi); + for(int j = 0; j <= r_msi; j++) + r[j] = n[j]; + for(int j = r_msi+1; j < R; j++) + r[j] = 0; + } + } else { + uw2 r1[N+1]; + r1[n_msi+1] = 0; + for(int k = n_msi; k >= 0; k--) + r1[k] = n[k]; + for(int k = n_mss; k >=d_mss; k--) { + int k_msi = k >> 1; + bool odd = k & 1; + uw2 r1m1 = k_msi > 0 ? r1[k_msi-1] : (uw2) 0; + uw4 n1 = odd ? + (uw4) ((r1[k_msi+1] << w1_length) | (r1[k_msi] >> w1_length)) << w2_length | ((r1[k_msi] << w1_length) | (r1m1 >> w1_length)) : + (uw4) r1[k_msi] << w2_length | r1m1; + uw2 q1 = n1/d1; + if(q1 >> w1_length) + q1--; + AC_ASSERT(!(q1 >> w1_length), "Problem detected in long division algorithm, Please report"); + unsigned k2 = k - d_mss; + unsigned k2_i = k2 >> 1; + bool odd_2 = k2 & 1; + uw2 q2 = q1 << (odd_2 ? w1_length : 0); + sw4 l = 0; + for(int j = 0; j <= d_msi; j++) { + l += r1[k2_i + j]; + bool l_sign = l < 0; + sw4 prod = (uw4) (uw2) d[j] * (uw4) q2; + l -= prod; + bool ov1 = (l >= 0) & ((prod < 0) | l_sign); + bool ov2 = (l < 0) & (prod < 0) & l_sign; + r1[k2_i + j] = (uw2) l; + l >>= w2_length; + if(ov1) + l |= ((uw4) -1 << w2_length); + if(ov2) + l ^= ((sw4) 1 << w2_length); + } + if(odd_2 | d_mss_odd) { + l += r1[k2_i + d_msi + 1]; + r1[k2_i + d_msi + 1] = (uw2) l; + } + if(l < 0) { + l = 0; + for(int j = 0; j <= d_msi; j++) { + l += (sw4) (uw2) d[j] << (odd_2 ? 
w1_length : 0); + l += r1[k2_i + j]; + r1[k2_i + j] = (uw2) l; + l >>= w2_length; + } + if(odd_2 | d_mss_odd) + r1[k2_i + d_msi + 1] += (uw2) l; + q1--; + } + if(Q && k2_i < Q) { + if(odd_2) + q[k2_i] = q1 << w1_length; + else + q[k2_i] |= q1; + } + } + if(R) { + int r_msi = AC_MIN(R-1, n_msi); + for(int j = 0; j <= r_msi; j++) + r[j] = r1[j]; + for(int j = r_msi+1; j < R; j++) + r[j] = 0; + } + } + } + + template + inline void iv_div(const int *op1, const int *op2, int *r) { + enum { N1_over = N1+(Den_s && (Num_s==2)) }; + if(N1_over==1 && N2==1) { + r[0] = op1[0] / op2[0]; + iv_extend(r+1, ((Num_s || Den_s) && (r[0] < 0)) ? ~0 : 0); + } + else if(N1_over==1 && N2==2) + iv_assign_int64(r, ( (Slong) op1[0]) / (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); + else if(N1_over==2 && N2==1) + if(N1 == 1) + iv_assign_int64(r, ( (Slong) op1[0]) / ( (Slong) op2[0]) ); + else + iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) / ( (Slong) op2[0]) ); + else if(N1_over==2 && N2==2) + if(N1 == 1) + iv_assign_int64(r, ( (Slong) op1[0]) / (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); + else + iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) / (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); + else if(!Num_s && !Den_s) { + iv_udiv(op1, op2, r, 0); + } + else { + enum { N1_neg = N1+(Num_s==2), N2_neg = N2+(Den_s==2)}; + int numerator[N1_neg]; + int denominator[N2_neg]; + int quotient[N1_neg]; + iv_abs(op1, numerator); + iv_abs(op2, denominator); + iv_udiv(numerator, denominator, quotient, 0); + if( (Num_s && op1[N1-1] < 0) ^ (Den_s && op2[N2-1] < 0) ) + iv_neg(quotient, r); + else { + iv_copy(quotient, r); + iv_extend(r+N1_neg, (Num_s || Den_s) && r[N1_neg-1] < 0 ? ~0 : 0); + } + } + } + + template + inline void iv_rem(const int *op1, const int *op2, int *r) { + enum { N1_over = N1+(Den_s && (Num_s==2)) }; // N1_over corresponds to the division + if(N1_over==1 && N2==1) { + r[0] = op1[0] % op2[0]; + iv_extend(r+1, Num_s && r[0] < 0 ? ~0 : 0); + } + else if(N1_over==1 && N2==2) + iv_assign_int64(r, ( (Slong) op1[0]) % (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); + else if(N1_over==2 && N2==1) + if(N1 == 1) + iv_assign_int64(r, ( (Slong) op1[0]) % ( (Slong) op2[0]) ); + else + iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) % ( (Slong) op2[0]) ); + else if(N1_over==2 && N2==2) + if(N1 == 1) + iv_assign_int64(r, ( (Slong) op1[0]) % (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); + else + iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) % (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); + else if(!Num_s && !Den_s) { + iv_udiv(op1, op2, 0, r); + } + else { + enum { N1_neg = N1+(Num_s==2), N2_neg = N2+(Den_s==2)}; + int numerator[N1_neg]; + int denominator[N2_neg]; + int remainder[N2]; + iv_abs(op1, numerator); + iv_abs(op2, denominator); + iv_udiv(numerator, denominator, 0, remainder); + if( (Num_s && op1[N1-1] < 0) ) + iv_neg(remainder, r); + else { + iv_copy(remainder, r); + iv_extend(r+N2, Num_s && r[N2-1] < 0 ? ~0 : 0); + } + } + } + + template + inline void iv_bitwise_complement_n(const int *op, int *r) { + for(int i=0; i < N; i++) + r[i] = ~op[i]; + } + template<> inline void iv_bitwise_complement_n<1>(const int *op, int *r) { + r[0] = ~op[0]; + } + template<> inline void iv_bitwise_complement_n<2>(const int *op, int *r) { + r[0] = ~op[0]; + r[1] = ~op[1]; + } + + template + inline void iv_bitwise_complement(const int *op, int *r) { + const int M = AC_MIN(N,Nr); + iv_bitwise_complement_n(op, r); + iv_extend(r+M, (r[M-1] < 0) ? 
~0 : 0); + } + + template + inline void iv_bitwise_and_n(const int *op1, const int *op2, int *r) { + for(int i=0; i < N; i++) + r[i] = op1[i] & op2[i]; + } + template<> inline void iv_bitwise_and_n<1>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] & op2[0]; + } + template<> inline void iv_bitwise_and_n<2>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] & op2[0]; + r[1] = op1[1] & op2[1]; + } + + template + inline void iv_bitwise_and(const int *op1, const int *op2, int *r) { + const int M1 = AC_MIN(AC_MAX(N1,N2), Nr); + const int M2 = AC_MIN(AC_MIN(N1,N2), Nr); + const int *OP1 = N1 > N2 ? op1 : op2; + const int *OP2 = N1 > N2 ? op2 : op1; + + iv_bitwise_and_n(op1, op2, r); + if(OP2[M2-1] < 0) + iv_copy(OP1+M2, r+M2); + else + iv_extend(r+M2, 0); + iv_extend(r+M1, (r[M1-1] < 0) ? ~0 : 0); + } + + template + inline void iv_bitwise_or_n(const int *op1, const int *op2, int *r) { + for(int i=0; i < N; i++) + r[i] = op1[i] | op2[i]; + } + template<> inline void iv_bitwise_or_n<1>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] | op2[0]; + } + template<> inline void iv_bitwise_or_n<2>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] | op2[0]; + r[1] = op1[1] | op2[1]; + } + + template + inline void iv_bitwise_or(const int *op1, const int *op2, int *r) { + const int M1 = AC_MIN(AC_MAX(N1,N2), Nr); + const int M2 = AC_MIN(AC_MIN(N1,N2), Nr); + const int *OP1 = N1 >= N2 ? op1 : op2; + const int *OP2 = N1 >= N2 ? op2 : op1; + + iv_bitwise_or_n(op1, op2, r); + if(OP2[M2-1] < 0) + iv_extend(r+M2, ~0); + else + iv_copy(OP1+M2, r+M2); + iv_extend(r+M1, (r[M1-1] < 0) ? ~0 : 0); + } + + template + inline void iv_bitwise_xor_n(const int *op1, const int *op2, int *r) { + for(int i=0; i < N; i++) + r[i] = op1[i] ^ op2[i]; + } + template<> inline void iv_bitwise_xor_n<1>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] ^ op2[0]; + } + template<> inline void iv_bitwise_xor_n<2>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] ^ op2[0]; + r[1] = op1[1] ^ op2[1]; + } + + template + inline void iv_bitwise_xor(const int *op1, const int *op2, int *r) { + const int M1 = AC_MIN(AC_MAX(N1,N2), Nr); + const int M2 = AC_MIN(AC_MIN(N1,N2), Nr); + const int *OP1 = N1 >= N2 ? op1 : op2; + const int *OP2 = N1 >= N2 ? op2 : op1; + + iv_bitwise_xor_n(op1, op2, r); + if(OP2[M2-1] < 0) + iv_bitwise_complement_n(OP1+M2, r+M2); + else + iv_copy(OP1+M2, r+M2); + iv_extend(r+M1, (r[M1-1] < 0) ? ~0 : 0); + } + + template + inline void iv_shift_l(const int *op1, unsigned op2, int *r) { + AC_ASSERT(Nr <= N, "iv_shift_l, incorrect usage Nr > N"); + unsigned s31 = op2 & 31; + unsigned ishift = (op2 >> 5) > Nr ? Nr : (op2 >> 5); + if(s31 && ishift!=Nr) { + unsigned lw = 0; + for(unsigned i=0; i < Nr; i++) { + unsigned hw = (i >= ishift) ? op1[i-ishift] : 0; + r[i] = (hw << s31) | (lw >> (32-s31)); + lw = hw; + } + } else { + for(unsigned i=0; i < Nr ; i++) + r[i] = (i >= ishift) ? op1[i-ishift] : 0; + } + } + + template + inline void iv_shift_r(const int *op1, unsigned op2, int *r) { + unsigned s31 = op2 & 31; + unsigned ishift = (op2 >> 5) > N ? N : (op2 >> 5); + int ext = op1[N-1] < 0 ? ~0 : 0; + if(s31 && ishift!=N) { + unsigned lw = (ishift < N) ? op1[ishift] : ext; + for(unsigned i=0; i < Nr; i++) { + unsigned hw = (i+ishift+1 < N) ? op1[i+ishift+1] : ext; + r[i] = (lw >> s31) | (hw << (32-s31)); + lw = hw; + } + } else { + for(unsigned i=0; i < Nr ; i++) + r[i] = (i+ishift < N) ? 
op1[i+ishift] : ext; + } + } + + template + inline void iv_shift_l2(const int *op1, signed op2, int *r) { + if(S && op2 < 0) + iv_shift_r(op1, -op2, r); + else + iv_shift_l(op1, op2, r); + } + + template<> inline void iv_shift_l2<1,1,false>(const int *op1, signed op2, int *r) { + r[0] = (op2 < 32) ? ( (unsigned) op1[0] << op2) : 0; + } + template<> inline void iv_shift_l2<1,1,true>(const int *op1, signed op2, int *r) { + r[0] = (op2 >= 0) ? + (op2 < 32) ? ( (unsigned) op1[0] << op2) : 0 : + (op2 > -32) ? (op1[0] >> -op2) : (op1[0] >> 31); + } + + template + inline void iv_shift_r2(const int *op1, signed op2, int *r) { + if(S && op2 < 0) + iv_shift_l(op1, -op2, r); + else + iv_shift_r(op1, op2, r); + } + + template<> inline void iv_shift_r2<1,1,false>(const int *op1, signed op2, int *r) { + r[0] = (op2 < 32) ? (op1[0] >> op2) : (op1[0] >> 31); + } + template<> inline void iv_shift_r2<1,1,true>(const int *op1, signed op2, int *r) { + r[0] = (op2 >= 0) ? + (op2 < 32) ? (op1[0] >> op2) : (op1[0] >> 31) : + (op2 > -32) ? ( (unsigned) op1[0] << -op2) : 0; + } + + template + inline void iv_const_shift_l(const int *op1, int *r) { + // B >= 0 + if(!B) { + const int M1 = AC_MIN(N,Nr); + iv_copy(op1, r); + iv_extend(r+M1, r[M1-1] < 0 ? -1 : 0); + } + else { + const unsigned s31 = B & 31; + const int ishift = (((B >> 5) > Nr) ? Nr : (B >> 5)); + iv_extend(r, 0); + const int M1 = AC_MIN(N+ishift,Nr); + if(s31) { + unsigned lw = 0; + for(int i=ishift; i < M1; i++) { + unsigned hw = op1[i-ishift]; + r[i] = (hw << s31) | (lw >> ((32-s31)&31)); // &31 is to quiet compilers + lw = hw; + } + if(Nr > M1) { + r[M1] = (signed) lw >> ((32-s31)&31); // &31 is to quiet compilers + iv_extend(r+M1+1, r[M1] < 0 ? ~0 : 0); + } + } else { + for(int i=ishift; i < M1 ; i++) + r[i] = op1[i-ishift]; + iv_extend(r+M1, r[M1-1] < 0 ? -1 : 0); + } + } + } + template<> inline void iv_const_shift_l<1,1,0>(const int *op1, int *r) { + r[0] = op1[0]; + } + template<> inline void iv_const_shift_l<2,1,0>(const int *op1, int *r) { + r[0] = op1[0]; + } + + template + inline void iv_const_shift_r(const int *op1, int *r) { + if(!B) { + const int M1 = AC_MIN(N,Nr); + iv_copy(op1, r); + iv_extend(r+M1, r[M1-1] < 0 ? ~0 : 0); + } + else { + const unsigned s31 = B & 31; + const int ishift = (((B >> 5) > N) ? N : (B >> 5)); + int ext = op1[N-1] < 0 ? ~0 : 0; + if(s31 && ishift!=N) { + unsigned lw = (ishift < N) ? op1[ishift] : ext; + for(int i=0; i < Nr; i++) { + unsigned hw = (i+ishift+1 < N) ? op1[i+ishift+1] : ext; + r[i] = (lw >> s31) | (hw << ((32-s31)&31)); // &31 is to quiet compilers + lw = hw; + } + } else { + for(int i=0; i < Nr ; i++) + r[i] = (i+ishift < N) ? op1[i+ishift] : ext; + } + } + } + template<> inline void iv_const_shift_r<1,1,0>(const int *op1, int *r) { + r[0] = op1[0]; + } + template<> inline void iv_const_shift_r<2,1,0>(const int *op1, int *r) { + r[0] = op1[0]; + } + + template + inline void iv_conv_from_fraction(double d, int *r, bool *qb, bool *rbits, bool *o) { + bool b = d < 0; + double d2 = b ? -d : d; + double dfloor = mgc_floor(d2); + *o = dfloor != 0.0; + d2 = d2 - dfloor; + for(int i=N-1; i >=0; i--) { + d2 *= (Ulong) 1 << 32; + unsigned k = (unsigned int) d2; + r[i] = b ? 
~k : k; + d2 -= k; + } + d2 *= 2; + bool k = ((int) d2) != 0; // is 0 or 1 + d2 -= k; + *rbits = d2 != 0.0; + *qb = (b && *rbits) ^ k; + if(b && !*rbits && !*qb) + iv_uadd_carry(r, true, r); + *o |= b ^ (r[N-1] < 0); + } + + template + inline int to_str(int *v, int w, bool left_just, char *r) { + const char digits[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; + const unsigned char B = b==AC_BIN ? 1 : (b==AC_OCT ? 3 : (b==AC_HEX ? 4 : 0)); + int k = (w+B-1)/B; + int n = (w+31) >> 5; + int bits = 0; + if(b != AC_BIN && left_just) { + if( (bits = -(w % B)) ) + r[--k] = 0; + } + for(int i = 0; i < n; i++) { + if (b != AC_BIN && bits < 0) + r[k] += (unsigned char) (( (unsigned) v[i] << (B+bits)) & (b-1)); + unsigned int m = (unsigned) v[i] >> -bits; + for(bits += 32; bits > 0 && k; bits -= B) { + r[--k] = (char) (m & (b-1)); + m >>= B; + } + } + for(int i=0; i < (w+B-1)/B; i++) + r[i] = digits[(int)r[i]]; + return (w+B-1)/B; + } + template<> inline int to_str(int *v, int w, bool left_just, char *r) { + int k = 0; + int msw = (w-1) >> 5; + if(left_just) { + unsigned bits_msw = w & 31; + if(bits_msw) { + unsigned left_shift = 32 - bits_msw; + for(int i=msw; i > 0; i--) + v[i] = (unsigned) v[i] << left_shift | (unsigned) v[i-1] >> bits_msw; + v[0] = (unsigned) v[0] << left_shift; + } + int lsw = 0; + while(lsw < msw || v[msw] ) { + Ulong l = 0; + for(int i=lsw; i <= msw; i++) { + l += (Ulong) (unsigned) v[i] * 10; + v[i] = l; + l >>= 32; + if(i==lsw && !v[i]) + lsw++; + } + r[k++] = (char) ('0' + (int) l); + } + } else { + const unsigned d = 1000000000; // 10E9 + for(; msw > 0 && !v[msw]; msw--) {} + while(msw >= 0) { + Ulong nl = 0; + for(int i = msw; i >= 0; i--) { + nl <<= 32; + nl |= (unsigned) v[i]; + unsigned q = nl/d; + nl -= (Ulong) q * d; + v[i] = q; + } + if(!v[msw]) + msw--; + bool last = msw == -1; + unsigned rem = (unsigned) nl; + for(int i=0; (i < 9 && !last) || rem; i++) { + r[k++] = (char) ('0' + (int) (rem % 10)); + rem /= 10; + } + } + for(int i=0; i < k/2; i++) { + char c = r[i]; + r[i] = r[k-1-i]; + r[k-1-i] = c; + } + } + r[k] = 0; + return k; + } + + inline int to_string(int *v, int w, bool sign_mag, ac_base_mode base, bool left_just, char *r) { + int n = (w+31) >> 5; + bool neg = !sign_mag && v[n-1] < 0; + if(!left_just) { + while(n-- && v[n] == (neg ? ~0 : 0)) {} + int w2 = 32*(n+1); + if(w2) { + int m = v[n]; + for(int i = 16; i > 0; i >>= 1) { + if((m >> i) == (neg ? ~0 : 0)) + w2 -= i; + else + m >>= i; + } + } + if(w2 < w) + w = w2; + w += !sign_mag; + } + if(base == AC_DEC) + return to_str(v, w, left_just, r); + else if (base == AC_HEX) + return to_str(v, w, left_just, r); + else if (base == AC_OCT) + return to_str(v, w, left_just, r); + else if (base == AC_BIN) + return to_str(v, w, left_just, r); + return 0; + } + + template + inline unsigned iv_leading_bits(const int *op, bool bit); + + template<> inline unsigned iv_leading_bits<1>(const int *op, bool bit) { + const unsigned char tab[] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}; + unsigned t = bit ? ~*op : *op; + unsigned cnt = 0; + if(t >> 16) + t >>= 16; + else + cnt += 16; + if(t >> 8) + t >>= 8; + else + cnt += 8; + if(t >> 4) + t >>= 4; + else + cnt += 4; + cnt += tab[t]; + return cnt; + } + + template + inline unsigned iv_leading_bits(const int *op, bool bit) { + int ext_sign = bit ? -1 : 0; + int k; + for(k = N-1; k >= 0 && op[k] == ext_sign; k--) {} + return 32*(N-1-k) + (k < 0 ? 
0 : iv_leading_bits<1>(op+k, bit)); + } + + ////////////////////////////////////////////////////////////////////////////// + // Integer Vector class: iv + ////////////////////////////////////////////////////////////////////////////// + template + class iv { + protected: + int v[N]; + public: + template friend class iv; + iv() {} + template + iv ( const iv &b ) { + const int M = AC_MIN(N,N2); + iv_copy(b.v, v); + iv_extend(v+M, (v[M-1] < 0) ? ~0 : 0); + } + iv ( Slong t) { + iv_assign_int64(v, t); + } + iv ( Ulong t) { + iv_assign_uint64(v, t); + } + iv ( int t) { + v[0] = t; + iv_extend(v+1, (t < 0) ? ~0 : 0); + } + iv ( unsigned int t) { + v[0] = t; + iv_extend(v+1, 0); + } + iv ( long t) { + if(long_w == 32) { + v[0] = t; + iv_extend(v+1, (t < 0) ? ~0 : 0); + } else + iv_assign_int64(v, t); + } + iv ( unsigned long t) { + if(long_w == 32) { + v[0] = t; + iv_extend(v+1, 0); + } else + iv_assign_uint64(v, t); + } + iv ( double d ) { + double d2 = ldexpr32<-N>(d); + bool qb, rbits, o; + iv_conv_from_fraction(d2, v, &qb, &rbits, &o); + } + + // Explicit conversion functions to C built-in types ------------- + inline Slong to_int64() const { return N==1 ? v[0] : ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; } + inline Ulong to_uint64() const { return N==1 ? (Ulong) v[0] : ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; } + inline double to_double() const { + double a = v[N-1]; + for(int i=N-2; i >= 0; i--) { + a *= (Ulong) 1 << 32; + a += (unsigned) v[i]; + } + return a; + } + inline void conv_from_fraction(double d, bool *qb, bool *rbits, bool *o) { + iv_conv_from_fraction(d, v, qb, rbits, o); + } + + template + inline void mult(const iv &op2, iv &r) const { + iv_mult(v, op2.v, r.v); + } + template + void add(const iv &op2, iv &r) const { + iv_add(v, op2.v, r.v); + } + template + void sub(const iv &op2, iv &r) const { + iv_sub(v, op2.v, r.v); + } + template + void div(const iv &op2, iv &r) const { + iv_div(v, op2.v, r.v); + } + template + void rem(const iv &op2, iv &r) const { + iv_rem(v, op2.v, r.v); + } + void increment() { + iv_uadd_carry(v, true, v); + } + void decrement() { + iv_sub_int_borrow(v, 0, true, v); + } + template + void neg(iv &r) const { + iv_neg(v, r.v); + } + template + void shift_l(unsigned op2, iv &r) const { + iv_shift_l(v, op2, r.v); + } + template + void shift_l2(signed op2, iv &r) const { + iv_shift_l2(v, op2, r.v); + } + template + void shift_r(unsigned op2, iv &r) const { + iv_shift_r(v, op2, r.v); + } + template + void shift_r2(signed op2, iv &r) const { + iv_shift_r2(v, op2, r.v); + } + template + void const_shift_l(iv &r) const { + iv_const_shift_l(v, r.v); + } + template + void const_shift_r(iv &r) const { + iv_const_shift_r(v, r.v); + } + template + void bitwise_complement(iv &r) const { + iv_bitwise_complement(v, r.v); + } + template + void bitwise_and(const iv &op2, iv &r) const { + iv_bitwise_and(v, op2.v, r.v); + } + template + void bitwise_or(const iv &op2, iv &r) const { + iv_bitwise_or(v, op2.v, r.v); + } + template + void bitwise_xor(const iv &op2, iv &r) const { + iv_bitwise_xor(v, op2.v, r.v); + } + template + bool equal(const iv &op2) const { + return iv_equal(v, op2.v); + } + template + bool greater_than(const iv &op2) const { + return iv_compare(v, op2.v); + } + template + bool less_than(const iv &op2) const { + return iv_compare(v, op2.v); + } + bool equal_zero() const { + return iv_equal_zero(v); + } + template + void set_slc(unsigned lsb, int WS, const iv &op2) { + AC_ASSERT((31+WS)/32 == N2, "Bad usage: WS greater than length of slice"); + 
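+    // The slice writes below use the read-modify-write merge idiom
+    //   x ^= (x ^ y) & mask;
+    // which copies into x exactly the bits of y selected by mask and
+    // leaves every other bit of x unchanged.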
unsigned msb = lsb+WS-1; + unsigned lsb_v = lsb >> 5; + unsigned lsb_b = lsb & 31; + unsigned msb_v = msb >> 5; + unsigned msb_b = msb & 31; + if(N2==1) { + if(msb_v == lsb_v) + v[lsb_v] ^= (v[lsb_v] ^ ((unsigned) op2.v[0] << lsb_b)) & (~(WS==32 ? 0 : all_ones<> 1) >> (31-lsb_b)); + v[msb_v] ^= (v[msb_v] ^ m) & ~((all_ones<<1)<> 1) >> (31-lsb_b)); + unsigned t = ((unsigned) op2.v[N2-1] << lsb_b) | (((unsigned) op2.v[N2-2] >> 1) >> (31-lsb_b)); + unsigned m; + if(msb_v-lsb_v == N2) { + v[msb_v-1] = t; + m = (((unsigned) op2.v[N2-1] >> 1) >> (31-lsb_b)); + } + else + m = t; + v[msb_v] ^= (v[msb_v] ^ m) & ~((all_ones<<1)<(v, bit); + } + }; + + template<> inline Slong iv<1>::to_int64() const { return v[0]; } + template<> inline Ulong iv<1>::to_uint64() const { return v[0]; } + + template<> inline Slong iv<2>::to_int64() const { + return ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; + } + template<> inline Ulong iv<2>::to_uint64() const { + return ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; + } + + template<> template<> inline void iv<1>::set_slc(unsigned lsb, int WS, const iv<1> &op2) { + v[0] ^= (v[0] ^ ((unsigned) op2.v[0] << lsb)) & (~(WS==32 ? 0 : all_ones< template<> inline void iv<2>::set_slc(unsigned lsb, int WS, const iv<1> &op2) { + Ulong l = to_uint64(); + Ulong l2 = op2.to_uint64(); + l ^= (l ^ (l2 << lsb)) & (~((~(Ulong)0)< template<> inline void iv<2>::set_slc(unsigned lsb, int WS, const iv<2> &op2) { + Ulong l = to_uint64(); + Ulong l2 = op2.to_uint64(); + l ^= (l ^ (l2 << lsb)) & (~(WS==64 ? (Ulong) 0 : ~(Ulong)0< + class iv_conv : public iv { + protected: + iv_conv() {} + template iv_conv(const T& t) : iv(t) {} + }; + + template + class iv_conv : public iv { + public: + operator Ulong () const { return iv::to_uint64(); } + protected: + iv_conv() {} + template iv_conv(const T& t) : iv(t) {} + }; + + template + class iv_conv : public iv { + public: + operator Slong () const { return iv::to_int64(); } + protected: + iv_conv() {} + template iv_conv(const T& t) : iv(t) {} + }; + + // Set default to promote to int as this is the case for almost all types + // create exceptions using specializations + template + struct c_prom { + typedef int promoted_type; + }; + template<> struct c_prom { + typedef unsigned promoted_type; + }; + template<> struct c_prom { + typedef long promoted_type; + }; + template<> struct c_prom { + typedef unsigned long promoted_type; + }; + template<> struct c_prom { + typedef Slong promoted_type; + }; + template<> struct c_prom { + typedef Ulong promoted_type; + }; + template<> struct c_prom { + typedef float promoted_type; + }; + template<> struct c_prom { + typedef double promoted_type; + }; + + template + struct c_arith { + // will error out for pairs of T and T2 that are not defined through specialization + }; + template struct c_arith { + typedef T arith_conv; + }; + + #define C_ARITH(C_TYPE1, C_TYPE2) \ + template<> struct c_arith { \ + typedef C_TYPE1 arith_conv; \ + }; \ + template<> struct c_arith { \ + typedef C_TYPE1 arith_conv; \ + }; + + C_ARITH(double, float) + C_ARITH(double, int) + C_ARITH(double, unsigned) + C_ARITH(double, long) + C_ARITH(double, unsigned long) + C_ARITH(double, Slong) + C_ARITH(double, Ulong) + C_ARITH(float, int) + C_ARITH(float, unsigned) + C_ARITH(float, long) + C_ARITH(float, unsigned long) + C_ARITH(float, Slong) + C_ARITH(float, Ulong) + + C_ARITH(Slong, int) + C_ARITH(Slong, unsigned) + C_ARITH(Ulong, int) + C_ARITH(Ulong, unsigned) + + template + struct map { + typedef T t; + }; + template + struct c_type_params 
{ + // will error out for T for which this template struct is not specialized + }; + + template inline const char *c_type_name() { return "unknown"; } + template<> inline const char *c_type_name() { return "bool";} + template<> inline const char *c_type_name() { return "char";} + template<> inline const char *c_type_name() { return "signed char";} + template<> inline const char *c_type_name() { return "unsigned char";} + template<> inline const char *c_type_name() { return "signed short";} + template<> inline const char *c_type_name() { return "unsigned short";} + template<> inline const char *c_type_name() { return "int";} + template<> inline const char *c_type_name() { return "unsigned";} + template<> inline const char *c_type_name() { return "signed long";} + template<> inline const char *c_type_name() { return "unsigned long";} + template<> inline const char *c_type_name() { return "signed long long";} + template<> inline const char *c_type_name() { return "unsigned long long";} + template<> inline const char *c_type_name() { return "float";} + template<> inline const char *c_type_name() { return "double";} + + template struct c_type; + + template + struct rt_c_type_T { + template + struct op1 { + typedef typename T::template rt_T< c_type >::mult mult; + typedef typename T::template rt_T< c_type >::plus plus; + typedef typename T::template rt_T< c_type >::minus2 minus; + typedef typename T::template rt_T< c_type >::minus minus2; + typedef typename T::template rt_T< c_type >::logic logic; + typedef typename T::template rt_T< c_type >::div2 div; + typedef typename T::template rt_T< c_type >::div div2; + }; + }; + template + struct c_type { + typedef typename c_prom::promoted_type c_prom_T; + struct rt_unary { + typedef c_prom_T neg; + typedef c_prom_T mag_sqr; + typedef c_prom_T mag; + template + struct set { + typedef c_prom_T sum; + }; + }; + template + struct rt_T { + typedef typename rt_c_type_T::template op1::mult mult; + typedef typename rt_c_type_T::template op1::plus plus; + typedef typename rt_c_type_T::template op1::minus minus; + typedef typename rt_c_type_T::template op1::minus2 minus2; + typedef typename rt_c_type_T::template op1::logic logic; + typedef typename rt_c_type_T::template op1::div div; + typedef typename rt_c_type_T::template op1::div2 div2; + }; + inline static std::string type_name() { + std::string r = c_type_name(); + return r; + } + + }; + // with T == c_type + template + struct rt_c_type_T< c_type > { + typedef typename c_prom::promoted_type c_prom_T; + template + struct op1 { + typedef typename c_prom::promoted_type c_prom_T2; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv mult; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv plus; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv minus; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv minus2; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv logic; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv div; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv div2; + }; + }; + + #define C_TYPE_MAP(C_TYPE) \ + template<> struct map { \ + typedef c_type t; \ + }; + + #define C_TYPE_PARAMS(C_TYPE, WI, SI) \ + template<> struct c_type_params { \ + enum { W = WI, I = WI, E = 0, S = SI, floating_point = 0 }; \ + }; + + #define C_TYPE_MAP_INT(C_TYPE, WI, SI) \ + C_TYPE_MAP(C_TYPE) \ + C_TYPE_PARAMS(C_TYPE, WI, SI) + + #define C_TYPE_MAP_FLOAT(C_TYPE, FP, WFP, IFP, EFP) \ + C_TYPE_MAP(C_TYPE) \ + template<> struct c_type_params { \ 
+ enum { W = WFP, I = IFP, E = EFP, S = true, floating_point = FP }; \ + }; + + C_TYPE_MAP_INT(bool, 1, false) + C_TYPE_MAP_INT(char, 8, true) + C_TYPE_MAP_INT(signed char, 8, true) + C_TYPE_MAP_INT(unsigned char, 8, false) + C_TYPE_MAP_INT(signed short, 16, true) + C_TYPE_MAP_INT(unsigned short, 16, false) + C_TYPE_MAP_INT(signed int, 32, true) + C_TYPE_MAP_INT(unsigned int, 32, false) + C_TYPE_MAP_INT(signed long, ac_private::long_w, true) + C_TYPE_MAP_INT(unsigned long, ac_private::long_w, false) + C_TYPE_MAP_INT(signed long long, 64, true) + C_TYPE_MAP_INT(unsigned long long, 64, false) + C_TYPE_MAP_FLOAT(float, 1, 25, 1, 8) + C_TYPE_MAP_FLOAT(double, 2, 54, 1, 11) + + #undef C_TYPE_INT + #undef C_TYPE_PARAMS + #undef C_TYPE_FLOAT + #undef C_TYPE_MAP + + // specializations for following struct declared/defined after definition of ac_int + template + struct rt_ac_int_T { + template + struct op1 { + typedef typename T::template rt_T< ac_int >::mult mult; + typedef typename T::template rt_T< ac_int >::plus plus; + typedef typename T::template rt_T< ac_int >::minus2 minus; + typedef typename T::template rt_T< ac_int >::minus minus2; + typedef typename T::template rt_T< ac_int >::logic logic; + typedef typename T::template rt_T< ac_int >::div2 div; + typedef typename T::template rt_T< ac_int >::div div2; + }; + }; +} + +namespace ac { + // compiler time constant for log2 like functions + template + struct nbits { + enum { val = X ? ac_private::s_N<16>::s_X::nbits : 1 }; + }; + + template + struct log2_floor { + enum { val = nbits::val - 1 }; + }; + + // log2 of 0 is not defined: generate compiler error + template<> struct log2_floor<0> {}; + + template + struct log2_ceil { + enum { lf = log2_floor::val, val = (X == (1 << lf) ? lf : lf+1) }; + }; + + // log2 of 0 is not defined: generate compiler error + template<> struct log2_ceil<0> {}; + + template + struct int_range { + enum { l_s = (LowerBound < 0), u_s = (UpperBound < 0), + signedness = l_s || u_s, + l_nbits = nbits::val, + u_nbits = nbits::val, + nbits = AC_MAX(l_nbits, u_nbits + (!u_s && signedness)) + }; + typedef ac_int type; + }; + + template + class sliceref { +# if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) +# pragma builtin +# endif + int *d_iv; + template friend class sliceref; + public: + sliceref( int *iv ) : d_iv(iv) {} + + inline const sliceref operator = ( const sliceref &val ) { + return operator=(val); + } + + template + inline const sliceref operator = ( const sliceref &val ) { + const int src_lsi = P2/32; + const int src_msi = (P2+W-1)/32; + const int trg_lsi = P/32; + const int trg_msi = (P+W-1)/32; + const int trg_lsb = P&31; + const int trg_msb = (P+W-1)&31; + const int N = src_msi-src_lsi+1; + const int Nr = trg_msi-trg_lsi+1; + const int rshift = (P2&31) - (P&31); + int shifted_src[Nr]; + int *aligned_src = val.d_iv+src_lsi; + if(rshift) { + if(rshift < 0) + ac_private::iv_shift_l(aligned_src, -rshift, shifted_src); + else + ac_private::iv_shift_r(aligned_src, rshift, shifted_src); + aligned_src = shifted_src; + } + unsigned mask_lsi = ac_private::all_ones << trg_lsb; + unsigned mask_msi = ac_private::all_ones >> (31-trg_msb); + if(Nr==1) + mask_lsi &= mask_msi; + int *v = d_iv+trg_lsi; + v[0] ^= (v[0] ^ ((unsigned) aligned_src[0])) & mask_lsi; + for(int k=1; k < Nr-1; k++) + v[k] = aligned_src[k]; + if(Nr > 1) + v[Nr-1] ^= (v[Nr-1] ^ ((unsigned) aligned_src[Nr-1])) & mask_msi; + if(Is_MSB) { + const unsigned rem = 31-trg_msb; + if(rem) { + v[Nr-1] = S ? 
((signed) ((unsigned) v[Nr-1] << rem) >> rem) + : ((unsigned) v[Nr-1] << rem) >> rem; + } else if(!S) { + v[Nr] = 0; + } + } + return *this; + } + }; +} + +enum ac_q_mode { AC_TRN, AC_RND, AC_TRN_ZERO, AC_RND_ZERO, AC_RND_INF, AC_RND_MIN_INF, AC_RND_CONV, AC_RND_CONV_ODD }; +enum ac_o_mode { AC_WRAP, AC_SAT, AC_SAT_ZERO, AC_SAT_SYM }; +template class ac_fixed; + +////////////////////////////////////////////////////////////////////////////// +// Arbitrary-Length Integer: ac_int +////////////////////////////////////////////////////////////////////////////// + +template +class ac_int : public ac_private::iv_conv<(W+31+!S)/32, S, W<=64> +#ifndef __SYNTHESIS__ +__AC_INT_UTILITY_BASE +#endif +{ +#if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) +#pragma builtin +#endif + + enum {N=(W+31+!S)/32}; + typedef ac_private::iv_conv ConvBase; + typedef ac_private::iv Base; + + inline void bit_adjust() { + const unsigned rem = (32-W)&31; + Base::v[N-1] = S ? ((signed) ((unsigned) Base::v[N-1] << rem) >> rem) : (rem ? + ((unsigned) Base::v[N-1] << rem) >> rem : 0); + } + + inline bool is_neg() const { return S && Base::v[N-1] < 0; } + + // returns false if number is denormal + template + bool normalize_private(ac_int &exp, bool reserved_min_exp=false) { + int expt = exp; + int lshift = leading_sign(); + bool fully_normalized = true; + ac_int min_exp; + min_exp.template set_val(); + int max_shift = exp - min_exp - reserved_min_exp; + if(lshift > max_shift) { + lshift = ac_int(max_shift); + expt = min_exp + reserved_min_exp; + fully_normalized = false; + } else { + expt -= lshift; + } + if(Base::equal_zero()) { + expt = 0; + fully_normalized = true; + } + exp = expt; + Base r; + Base::shift_l(lshift, r); + Base::operator=(r); + bit_adjust(); + return fully_normalized; + } + +public: + static const int width = W; + static const int i_width = W; + static const bool sign = S; + static const ac_q_mode q_mode = AC_TRN; + static const ac_o_mode o_mode = AC_WRAP; + static const int e_width = 0; + + template + struct rt { + enum { + mult_w = W+W2, + mult_s = S||S2, + plus_w = AC_MAX(W+(S2&&!S),W2+(S&&!S2))+1, + plus_s = S||S2, + minus_w = AC_MAX(W+(S2&&!S),W2+(S&&!S2))+1, + minus_s = true, + div_w = W+S2, + div_s = S||S2, + mod_w = AC_MIN(W,W2+(!S2&&S)), + mod_s = S, + logic_w = AC_MAX(W+(S2&&!S),W2+(S&&!S2)), + logic_s = S||S2 + }; + typedef ac_int mult; + typedef ac_int plus; + typedef ac_int minus; + typedef ac_int logic; + typedef ac_int div; + typedef ac_int mod; + typedef ac_int arg1; + }; + + template + struct rt_T { + typedef typename ac_private::map::t map_T; + typedef typename ac_private::rt_ac_int_T::template op1::mult mult; + typedef typename ac_private::rt_ac_int_T::template op1::plus plus; + typedef typename ac_private::rt_ac_int_T::template op1::minus minus; + typedef typename ac_private::rt_ac_int_T::template op1::minus2 minus2; + typedef typename ac_private::rt_ac_int_T::template op1::logic logic; + typedef typename ac_private::rt_ac_int_T::template op1::div div; + typedef typename ac_private::rt_ac_int_T::template op1::div2 div2; + typedef ac_int arg1; + }; + + struct rt_unary { + enum { + neg_w = W+1, + neg_s = true, + mag_sqr_w = 2*W-S, + mag_sqr_s = false, + mag_w = W+S, + mag_s = false, + leading_sign_w = ac::log2_ceil::val, + leading_sign_s = false + }; + typedef ac_int neg; + typedef ac_int mag_sqr; + typedef ac_int mag; + typedef ac_int leading_sign; + template + struct set { + enum { sum_w = W + ac::log2_ceil::val, sum_s = S}; + typedef ac_int sum; + }; + }; + + template friend 
class ac_int; + template friend class ac_fixed; + ac_int() { +#if !defined(__SYNTHESIS__) && defined(AC_DEFAULT_IN_RANGE) + bit_adjust(); +#endif + } + template + inline ac_int (const ac_int &op) { + Base::operator =(op); + bit_adjust(); + } + + inline ac_int( bool b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( char b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( signed char b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( unsigned char b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( signed short b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( unsigned short b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( signed int b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( unsigned int b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( signed long b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( unsigned long b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( Slong b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( Ulong b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( double d ) : ConvBase(d) { bit_adjust(); } + + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( push ) +#pragma warning( disable: 4700 ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wuninitialized" +#endif + template + inline ac_int &set_val() { + const unsigned int all_ones = (unsigned) ~0; + if(V == AC_VAL_DC) { + ac_int r; + Base::operator =(r); + bit_adjust(); + } + else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + Base::operator =(0); + if(S && V == AC_VAL_MIN) { + const unsigned int rem = (W-1)&31; + Base::v[N-1] = (all_ones << rem); + } else if(V == AC_VAL_QUANTUM) + Base::v[0] = 1; + } + else { // AC_VAL_MAX + Base::operator =(-1); + const unsigned int rem = (32-W - !S )&31; + Base::v[N-1] = (all_ones >> 1) >> rem; + } + return *this; + } +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( pop ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + + // Explicit conversion functions to C built-in types ------------- + inline int to_int() const { return Base::v[0]; } + inline unsigned to_uint() const { return Base::v[0]; } + inline long to_long() const { + return ac_private::long_w == 32 ? (long) Base::v[0] : (long) Base::to_int64(); + } + inline unsigned long to_ulong() const { + return ac_private::long_w == 32 ? (unsigned long) Base::v[0] : (unsigned long) Base::to_uint64(); + } + inline Slong to_int64() const { return Base::to_int64(); } + inline Ulong to_uint64() const { return Base::to_uint64(); } + inline double to_double() const { return Base::to_double(); } + + inline int length() const { return W; } + + inline std::string to_string(ac_base_mode base_rep, bool sign_mag = false) const { + // base_rep == AC_DEC => sign_mag == don't care (always print decimal in sign magnitude) + char r[N*32+4] = {0}; + int i = 0; + if(sign_mag) + r[i++] = is_neg() ? '-' : '+'; + else if (base_rep == AC_DEC && is_neg()) + r[i++] = '-'; + if(base_rep != AC_DEC) { + r[i++] = '0'; + r[i++] = base_rep == AC_BIN ? 'b' : (base_rep == AC_OCT ? 
'o' : 'x'); + } + int str_w; + if( (base_rep == AC_DEC || sign_mag) && is_neg() ) { + ac_int mag = operator -(); + str_w = ac_private::to_string(mag.v, W+1, sign_mag, base_rep, false, r+i); + } else { + ac_int tmp = *this; + str_w = ac_private::to_string(tmp.v, W+!S, sign_mag, base_rep, false, r+i); + } + if(!str_w) { + r[i] = '0'; + r[i+1] = 0; + } + return std::string(r); + } + inline static std::string type_name() { + const char *tf[] = {",false>", ",true>"}; + std::string r = "ac_int<"; + r += ac_int<32,true>(W).to_string(AC_DEC); + r += tf[S]; + return r; + } + + // Arithmetic : Binary ---------------------------------------------------- + template + typename rt::mult operator *( const ac_int &op2) const { + typename rt::mult r; + Base::mult(op2, r); + return r; + } + template + typename rt::plus operator +( const ac_int &op2) const { + typename rt::plus r; + Base::add(op2, r); + return r; + } + template + typename rt::minus operator -( const ac_int &op2) const { + typename rt::minus r; + Base::sub(op2, r); + return r; + } +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wenum-compare" +#endif + template + typename rt::div operator /( const ac_int &op2) const { + typename rt::div r; + enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, + num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = rt::div::N }; + Base::template div(op2, r); + return r; + } + template + typename rt::mod operator %( const ac_int &op2) const { + typename rt::mod r; + enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, + num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = rt::mod::N }; + Base::template rem(op2, r); + return r; + } +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif + // Arithmetic assign ------------------------------------------------------ + template + ac_int &operator *=( const ac_int &op2) { + Base r; + Base::mult(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator +=( const ac_int &op2) { + Base r; + Base::add(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator -=( const ac_int &op2) { + Base r; + Base::sub(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wenum-compare" +#endif + template + ac_int &operator /=( const ac_int &op2) { + enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, + num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = N }; + Base r; + Base::template div(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator %=( const ac_int &op2) { + enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, + num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = N }; + Base r; + Base::template rem(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif + // Arithmetic prefix increment, decrement ---------------------------------- + ac_int &operator ++() { + Base::increment(); + bit_adjust(); + return *this; + } + ac_int &operator --() { + 
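+    // As in the arithmetic operators above, bit_adjust() is called after the
+    // raw update so the top storage word is re-sign-extended (signed) or its
+    // unused high bits are zeroed (unsigned), keeping the stored value
+    // within the declared width W.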
Base::decrement(); + bit_adjust(); + return *this; + } + // Arithmetic postfix increment, decrement --------------------------------- + const ac_int operator ++(int) { + ac_int t = *this; + Base::increment(); + bit_adjust(); + return t; + } + const ac_int operator --(int) { + ac_int t = *this; + Base::decrement(); + bit_adjust(); + return t; + } + // Arithmetic Unary -------------------------------------------------------- + ac_int operator +() { + return *this; + } + typename rt_unary::neg operator -() const { + typename rt_unary::neg r; + Base::neg(r); + r.bit_adjust(); + return r; + } + // ! ------------------------------------------------------------------------ + bool operator ! () const { + return Base::equal_zero(); + } + + // Bitwise (arithmetic) unary: complement ----------------------------- + ac_int operator ~() const { + ac_int r; + Base::bitwise_complement(r); + return r; + } + // Bitwise (non-arithmetic) bit_complement ----------------------------- + ac_int bit_complement() const { + ac_int r; + Base::bitwise_complement(r); + r.bit_adjust(); + return r; + } + // Bitwise (arithmetic): and, or, xor ---------------------------------- + template + typename rt::logic operator & ( const ac_int &op2) const { + typename rt::logic r; + Base::bitwise_and(op2, r); + return r; + } + template + typename rt::logic operator | ( const ac_int &op2) const { + typename rt::logic r; + Base::bitwise_or(op2, r); + return r; + } + template + typename rt::logic operator ^ ( const ac_int &op2) const { + typename rt::logic r; + Base::bitwise_xor(op2, r); + return r; + } + // Bitwise assign (not arithmetic): and, or, xor ---------------------------- + template + ac_int &operator &= ( const ac_int &op2 ) { + Base r; + Base::bitwise_and(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator |= ( const ac_int &op2 ) { + Base r; + Base::bitwise_or(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator ^= ( const ac_int &op2 ) { + Base r; + Base::bitwise_xor(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } + // Shift (result constrained by left operand) ------------------------------- + template + ac_int operator << ( const ac_int &op2 ) const { + ac_int r; + Base::shift_l2(op2.to_int(), r); + r.bit_adjust(); + return r; + } + template + ac_int operator << ( const ac_int &op2 ) const { + ac_int r; + Base::shift_l(op2.to_uint(), r); + r.bit_adjust(); + return r; + } + template + ac_int operator >> ( const ac_int &op2 ) const { + ac_int r; + Base::shift_r2(op2.to_int(), r); + r.bit_adjust(); + return r; + } + template + ac_int operator >> ( const ac_int &op2 ) const { + ac_int r; + Base::shift_r(op2.to_uint(), r); + r.bit_adjust(); + return r; + } + // Shift assign ------------------------------------------------------------ + template + ac_int &operator <<= ( const ac_int &op2 ) { + Base r; + Base::shift_l2(op2.to_int(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator <<= ( const ac_int &op2 ) { + Base r; + Base::shift_l(op2.to_uint(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator >>= ( const ac_int &op2 ) { + Base r; + Base::shift_r2(op2.to_int(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator >>= ( const ac_int &op2 ) { + Base r; + Base::shift_r(op2.to_uint(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + // Relational 
--------------------------------------------------------------- + template + bool operator == ( const ac_int &op2) const { + return Base::equal(op2); + } + template + bool operator != ( const ac_int &op2) const { + return !Base::equal(op2); + } + template + bool operator < ( const ac_int &op2) const { + return Base::less_than(op2); + } + template + bool operator >= ( const ac_int &op2) const { + return !Base::less_than(op2); + } + template + bool operator > ( const ac_int &op2) const { + return Base::greater_than(op2); + } + template + bool operator <= ( const ac_int &op2) const { + return !Base::greater_than(op2); + } + + // Bit and Slice Select ----------------------------------------------------- + template + inline const ac_int slc(const ac_int &index) const { + ac_int r; + AC_ASSERT(index.to_int() >= 0, "Attempting to read slc with negative indeces"); + unsigned uindex = ac_int(index).to_uint(); + Base::shift_r(uindex, r); + r.bit_adjust(); + return r; + } + + template + inline const ac_int slc(signed index) const { + ac_int r; + AC_ASSERT(index >= 0, "Attempting to read slc with negative indeces"); + unsigned uindex = index & ((unsigned)~0 >> 1); + Base::shift_r(uindex, r); + r.bit_adjust(); + return r; + } + template + inline const ac_int slc(unsigned uindex) const { + ac_int r; + Base::shift_r(uindex, r); + r.bit_adjust(); + return r; + } + + template + inline ac_int &set_slc(const ac_int lsb, const ac_int &slc) { + AC_ASSERT(lsb.to_int() + W2 <= W && lsb.to_int() >= 0, "Out of bounds set_slc"); + if(W == W2) + Base::operator =(slc); + else { + unsigned ulsb = ac_int(lsb).to_uint(); + Base::set_slc(ulsb, W2, (ac_int) slc); + } + bit_adjust(); // in case sign bit was assigned + return *this; + } + template + inline ac_int &set_slc(signed lsb, const ac_int &slc) { + AC_ASSERT(lsb + W2 <= W && lsb >= 0, "Out of bounds set_slc"); + if(W == W2) + Base::operator =(slc); + else { + unsigned ulsb = lsb & ((unsigned)~0 >> 1); + Base::set_slc(ulsb, W2, (ac_int) slc); + } + bit_adjust(); // in case sign bit was assigned + return *this; + } + template + inline ac_int &set_slc(unsigned ulsb, const ac_int &slc) { + AC_ASSERT(ulsb + W2 <= W, "Out of bounds set_slc"); + if(W == W2) + Base::operator =(slc); + else + Base::set_slc(ulsb, W2, (ac_int) slc); + bit_adjust(); // in case sign bit was assigned + return *this; + } + + template + inline ac::sliceref range() { + #if __cplusplus > 199711L + static_assert(Msb-Lsb+1 > 0, "Range length not positive: MSB < LSB"); + static_assert(Lsb >= 0, "LSB is negative"); + static_assert(Msb < W, "MSB >= W"); + #endif + return ac::sliceref(Base::v); + } + + class ac_bitref { +# if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) +# pragma builtin +# endif + ac_int &d_bv; + unsigned d_index; + public: + ac_bitref( ac_int *bv, unsigned index=0 ) : d_bv(*bv), d_index(index) {} + operator bool () const { return (d_index < W) ? 
(d_bv.v[d_index>>5]>>(d_index&31) & 1) : 0; } + + template + operator ac_int () const { return operator bool (); } + + inline ac_bitref operator = ( int val ) { + // lsb of int (val&1) is written to bit + if(d_index < W) { + int *pval = &d_bv.v[d_index>>5]; + *pval ^= (*pval ^ ( (unsigned) val << (d_index&31) )) & 1 << (d_index&31); + d_bv.bit_adjust(); // in case sign bit was assigned + } + return *this; + } + template + inline ac_bitref operator = ( const ac_int &val ) { + return operator =(val.to_int()); + } + inline ac_bitref operator = ( const ac_bitref &val ) { + return operator =((int) (bool) val); + } + }; + + ac_bitref operator [] ( unsigned int uindex) { + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + ac_bitref bvh( this, uindex ); + return bvh; + } + ac_bitref operator [] ( int index) { + AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); + unsigned uindex = index & ((unsigned)~0 >> 1); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + ac_bitref bvh( this, uindex ); + return bvh; + } + template + ac_bitref operator [] ( const ac_int &index) { + AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); + unsigned uindex = ac_int(index).to_uint(); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + ac_bitref bvh( this, uindex ); + return bvh; + } + bool operator [] ( unsigned int uindex) const { + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; + } + bool operator [] ( int index) const { + AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); + unsigned uindex = index & ((unsigned)~0 >> 1); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; + } + template + bool operator [] ( const ac_int &index) const { + AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); + unsigned uindex = ac_int(index).to_uint(); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + return (uindex < W) ? 
(Base::v[uindex>>5]>>(uindex&31) & 1) : 0; + } + + typename rt_unary::leading_sign leading_sign() const { + unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; + return ls; + } + typename rt_unary::leading_sign leading_sign(bool &all_sign) const { + unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; + all_sign = (ls == W-S); + return ls; + } + // returns false if number is denormal + template + bool normalize(ac_int &exp) { + return normalize_private(exp); + } + // returns false if number is denormal, minimum exponent is reserved (usually for encoding special values/errors) + template + bool normalize_RME(ac_int &exp) { + return normalize_private(exp, true); + } + bool and_reduce() const { + return ac_private::iv_equal_ones_to(Base::v); + } + bool or_reduce() const { + return !Base::equal_zero(); + } + bool xor_reduce() const { + unsigned r = Base::v[N-1]; + if(S) { + const unsigned rem = (32-W)&31; + r = (r << rem) >> rem; + } + if(N > 1) + r ^= Base::v[N-2]; + if(N > 2) { + for(int i=0; i 16) + r ^= r >> 16; + if(W > 8) + r ^= r >> 8; + if(W > 4) + r ^= r >> 4; + if(W > 2) + r ^= r >> 2; + if(W > 1) + r ^= r >> 1; + return r&1; + } + + inline void bit_fill_hex(const char *str) { + // Zero Pads if str is too short, throws ms bits away if str is too long + // Asserts if anything other than 0-9a-fA-F is encountered + ac_int res = 0; + while(*str) { + char c = *str; + int h = 0; + if(c >= '0' && c <= '9') + h = c - '0'; + else if(c >= 'A' && c <= 'F') + h = c - 'A' + 10; + else if(c >= 'a' && c <= 'f') + h = c - 'a' + 10; + else { + AC_ASSERT(!c, "Invalid hex digit"); + break; + } + res <<= ac_int<3,false>(4); + res |= ac_int<4,false>(h); + str++; + } + *this = res; + } + + template + inline void bit_fill(const int (&ivec)[Na], bool bigendian=true) { + // bit_fill from integer vector + // if W > N*32, missing most significant bits are zeroed + // if W < N*32, additional bits in ivec are ignored (no overflow checking) + // Example: + // ac_int<80,false> x; int vec[] = { 0xffffa987, 0x6543210f, 0xedcba987 }; + // x.bit_fill(vec); // vec[0] fill bits 79-64 + enum { N0 = (W+31)/32, M = AC_MIN(N0,Na) }; + ac_int res = 0; + for(int i=0; i < M; i++) + res.set_slc(i*32, ac_int<32>(ivec[bigendian ? 
M-1-i : i])); + *this = res; + } +}; + +namespace ac { + template + struct rt_2T { + typedef typename ac_private::map::t map_T; + typedef typename ac_private::map::t map_T2; + typedef typename map_T::template rt_T< map_T2 >::mult mult; + typedef typename map_T::template rt_T< map_T2 >::plus plus; + typedef typename map_T::template rt_T< map_T2 >::minus minus; + typedef typename map_T::template rt_T< map_T2 >::minus2 minus2; + typedef typename map_T::template rt_T< map_T2 >::logic logic; + typedef typename map_T::template rt_T< map_T2 >::div div; + typedef typename map_T::template rt_T< map_T2 >::div2 div2; + }; +} + +namespace ac { + template + struct ac_int_represent { + enum { t_w = ac_private::c_type_params::W, t_s = ac_private::c_type_params::S }; + typedef ac_int type; + }; + template<> struct ac_int_represent {}; + template<> struct ac_int_represent {}; + template + struct ac_int_represent< ac_int > { + typedef ac_int type; + }; +} + +namespace ac_private { + template + struct rt_ac_int_T< ac_int > { + typedef ac_int i2_t; + template + struct op1 { + typedef ac_int i_t; + typedef typename i_t::template rt::mult mult; + typedef typename i_t::template rt::plus plus; + typedef typename i_t::template rt::minus minus; + typedef typename i2_t::template rt::minus minus2; + typedef typename i_t::template rt::logic logic; + typedef typename i_t::template rt::div div; + typedef typename i2_t::template rt::div div2; + typedef typename i_t::template rt::mod mod; + typedef typename i2_t::template rt::mod mod2; + }; + }; + + template + struct rt_ac_int_T< c_type > { + typedef typename ac::ac_int_represent::type i2_t; + enum { W2 = i2_t::width, S2 = i2_t::sign }; + template + struct op1 { + typedef ac_int i_t; + typedef typename i_t::template rt::mult mult; + typedef typename i_t::template rt::plus plus; + typedef typename i_t::template rt::minus minus; + typedef typename i2_t::template rt::minus minus2; + typedef typename i_t::template rt::logic logic; + typedef typename i_t::template rt::div div; + typedef typename i2_t::template rt::div div2; + typedef typename i_t::template rt::mod mod; + typedef typename i2_t::template rt::mod mod2; + }; + }; +} + + +// Specializations for constructors on integers that bypass bit adjusting +// and are therefore more efficient +template<> inline ac_int<1,true>::ac_int( bool b ) { v[0] = b ? 
-1 : 0; }
+
+template<> inline ac_int<1,false>::ac_int( bool b ) { v[0] = b; }
+template<> inline ac_int<1,false>::ac_int( signed char b ) { v[0] = b&1; }
+template<> inline ac_int<1,false>::ac_int( unsigned char b ) { v[0] = b&1; }
+template<> inline ac_int<1,false>::ac_int( signed short b ) { v[0] = b&1; }
+template<> inline ac_int<1,false>::ac_int( unsigned short b ) { v[0] = b&1; }
+template<> inline ac_int<1,false>::ac_int( signed int b ) { v[0] = b&1; }
+template<> inline ac_int<1,false>::ac_int( unsigned int b ) { v[0] = b&1; }
+template<> inline ac_int<1,false>::ac_int( signed long b ) { v[0] = b&1; }
+template<> inline ac_int<1,false>::ac_int( unsigned long b ) { v[0] = b&1; }
+template<> inline ac_int<1,false>::ac_int( Ulong b ) { v[0] = (int) b&1; }
+template<> inline ac_int<1,false>::ac_int( Slong b ) { v[0] = (int) b&1; }
+
+template<> inline ac_int<8,true>::ac_int( bool b ) { v[0] = b; }
+template<> inline ac_int<8,false>::ac_int( bool b ) { v[0] = b; }
+template<> inline ac_int<8,true>::ac_int( signed char b ) { v[0] = b; }
+template<> inline ac_int<8,false>::ac_int( unsigned char b ) { v[0] = b; }
+template<> inline ac_int<8,true>::ac_int( unsigned char b ) { v[0] = (signed char) b; }
+template<> inline ac_int<8,false>::ac_int( signed char b ) { v[0] = (unsigned char) b; }
+
+template<> inline ac_int<16,true>::ac_int( bool b ) { v[0] = b; }
+template<> inline ac_int<16,false>::ac_int( bool b ) { v[0] = b; }
+template<> inline ac_int<16,true>::ac_int( signed char b ) { v[0] = b; }
+template<> inline ac_int<16,false>::ac_int( unsigned char b ) { v[0] = b; }
+template<> inline ac_int<16,true>::ac_int( unsigned char b ) { v[0] = b; }
+template<> inline ac_int<16,false>::ac_int( signed char b ) { v[0] = (unsigned short) b; }
+template<> inline ac_int<16,true>::ac_int( signed short b ) { v[0] = b; }
+template<> inline ac_int<16,false>::ac_int( unsigned short b ) { v[0] = b; }
+template<> inline ac_int<16,true>::ac_int( unsigned short b ) { v[0] = (signed short) b; }
+template<> inline ac_int<16,false>::ac_int( signed short b ) { v[0] = (unsigned short) b; }
+
+template<> inline ac_int<32,true>::ac_int( signed int b ) { v[0] = b; }
+template<> inline ac_int<32,true>::ac_int( unsigned int b ) { v[0] = b; }
+template<> inline ac_int<32,false>::ac_int( signed int b ) { v[0] = b; v[1] = 0;}
+template<> inline ac_int<32,false>::ac_int( unsigned int b ) { v[0] = b; v[1] = 0;}
+
+template<> inline ac_int<32,true>::ac_int( Slong b ) { v[0] = (int) b; }
+template<> inline ac_int<32,true>::ac_int( Ulong b ) { v[0] = (int) b; }
+template<> inline ac_int<32,false>::ac_int( Slong b ) { v[0] = (int) b; v[1] = 0;}
+template<> inline ac_int<32,false>::ac_int( Ulong b ) { v[0] = (int) b; v[1] = 0;}
+
+template<> inline ac_int<64,true>::ac_int( Slong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); }
+template<> inline ac_int<64,true>::ac_int( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32);}
+template<> inline ac_int<64,false>::ac_int( Slong b ) { v[0] = (int) b; v[1] = (int) ((Ulong) b >> 32); v[2] = 0; }
+template<> inline ac_int<64,false>::ac_int( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); v[2] = 0; }
+
+// Stream --------------------------------------------------------------------
+
+template<int W, bool S>
+inline std::ostream& operator << (std::ostream &os, const ac_int<W,S> &x) {
+#ifndef __SYNTHESIS__
+  if ((os.flags() & std::ios::hex) != 0) {
+    os << x.to_string(AC_HEX);
+  } else if ((os.flags() & std::ios::oct) != 0) {
+    os << x.to_string(AC_OCT);
+  } else {
+    os << x.to_string(AC_DEC);
+  }
+#endif
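+  // Under __SYNTHESIS__ the formatting body above is compiled out, so
+  // streaming an ac_int is a simulation-only convenience; os is returned
+  // unchanged in hardware builds.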
+  return os;
+}
+
+// Macros for Binary Operators with Integers --------------------------------------------
+
+#define BIN_OP_WITH_INT(BIN_OP, C_TYPE, WI, SI, RTYPE) \
+  template<int W, bool S> \
+  inline typename ac_int<WI,SI>::template rt<W,S>::RTYPE operator BIN_OP ( C_TYPE i_op, const ac_int<W,S> &op) { \
+    return ac_int<WI,SI>(i_op).operator BIN_OP (op); \
+  } \
+  template<int W, bool S> \
+  inline typename ac_int<W,S>::template rt<WI,SI>::RTYPE operator BIN_OP ( const ac_int<W,S> &op, C_TYPE i_op) { \
+    return op.operator BIN_OP (ac_int<WI,SI>(i_op)); \
+  }
+
+#define REL_OP_WITH_INT(REL_OP, C_TYPE, W2, S2) \
+  template<int W, bool S> \
+  inline bool operator REL_OP ( const ac_int<W,S> &op, C_TYPE op2) { \
+    return op.operator REL_OP (ac_int<W2,S2>(op2)); \
+  } \
+  template<int W, bool S> \
+  inline bool operator REL_OP ( C_TYPE op2, const ac_int<W,S> &op) { \
+    return ac_int<W2,S2>(op2).operator REL_OP (op); \
+  }
+
+#define ASSIGN_OP_WITH_INT(ASSIGN_OP, C_TYPE, W2, S2) \
+  template<int W, bool S> \
+  inline ac_int<W,S> &operator ASSIGN_OP ( ac_int<W,S> &op, C_TYPE op2) { \
+    return op.operator ASSIGN_OP (ac_int<W2,S2>(op2)); \
+  }
+
+#define OPS_WITH_INT(C_TYPE, WI, SI) \
+  BIN_OP_WITH_INT(*, C_TYPE, WI, SI, mult) \
+  BIN_OP_WITH_INT(+, C_TYPE, WI, SI, plus) \
+  BIN_OP_WITH_INT(-, C_TYPE, WI, SI, minus) \
+  BIN_OP_WITH_INT(/, C_TYPE, WI, SI, div) \
+  BIN_OP_WITH_INT(%, C_TYPE, WI, SI, mod) \
+  BIN_OP_WITH_INT(>>, C_TYPE, WI, SI, arg1) \
+  BIN_OP_WITH_INT(<<, C_TYPE, WI, SI, arg1) \
+  BIN_OP_WITH_INT(&, C_TYPE, WI, SI, logic) \
+  BIN_OP_WITH_INT(|, C_TYPE, WI, SI, logic) \
+  BIN_OP_WITH_INT(^, C_TYPE, WI, SI, logic) \
+  \
+  REL_OP_WITH_INT(==, C_TYPE, WI, SI) \
+  REL_OP_WITH_INT(!=, C_TYPE, WI, SI) \
+  REL_OP_WITH_INT(>, C_TYPE, WI, SI) \
+  REL_OP_WITH_INT(>=, C_TYPE, WI, SI) \
+  REL_OP_WITH_INT(<, C_TYPE, WI, SI) \
+  REL_OP_WITH_INT(<=, C_TYPE, WI, SI) \
+  \
+  ASSIGN_OP_WITH_INT(+=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(-=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(*=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(/=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(%=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(>>=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(<<=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(&=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(|=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(^=, C_TYPE, WI, SI)
+
+// ------------------------------------- End of Macros for Binary Operators with Integers
+
+// for backward compatibility with v3.9.0 and earlier, define the following macro
+#ifdef AC_INT_NS_FOR_MIXED_OPERATORS
+namespace ac {
+  namespace ops_with_other_types {
+#endif
+// Mixed Operators with Integers -----------------------------------------------
+OPS_WITH_INT(bool, 1, false)
+OPS_WITH_INT(char, 8, true)
+OPS_WITH_INT(signed char, 8, true)
+OPS_WITH_INT(unsigned char, 8, false)
+OPS_WITH_INT(short, 16, true)
+OPS_WITH_INT(unsigned short, 16, false)
+OPS_WITH_INT(int, 32, true)
+OPS_WITH_INT(unsigned int, 32, false)
+OPS_WITH_INT(long, ac_private::long_w, true)
+OPS_WITH_INT(unsigned long, ac_private::long_w, false)
+OPS_WITH_INT(Slong, 64, true)
+OPS_WITH_INT(Ulong, 64, false)
+// ----------------------------------------- End of Mixed Operators with Integers
+#ifdef AC_INT_NS_FOR_MIXED_OPERATORS
+  } // ops_with_other_types namespace
+}
+using namespace ac::ops_with_other_types;
+#endif
+
+namespace ac {
+  // Functions to fill bits
+
+  template<typename T>
+  inline T bit_fill_hex(const char *str) {
+    T res;
+    res.bit_fill_hex(str);
+    return res;
+  }
+
+  // returns bit_fill for type
+  // example:
+  //   ac_int<80,false> x = ac::bit_fill< ac_int<80,false> > ((int [3]) {0xffffa987, 0x6543210f, 0xedcba987 });
+  template<typename T, int N>
+  inline T bit_fill(const int (&ivec)[N], bool bigendian=true)
{ + T res; + res.bit_fill(ivec, bigendian); + return res; + } + +} // ac namespace + +// Mixed Operators with Pointers ----------------------------------------------- + +// Addition of ac_int and pointer +template +T *operator +(T *ptr, const ac_int &op2) { + return ptr + op2.to_int64(); +} +template +T *operator +(const ac_int &op2, T *ptr) { + return ptr + op2.to_int64(); +} +// Subtraction of ac_int from pointer +template +T *operator -(T *ptr, const ac_int &op2) { + return ptr - op2.to_int64(); +} +// ----------------------------------------- End of Mixed Operators with Pointers + +namespace ac_intN { + /////////////////////////////////////////////////////////////////////////////// + // Predefined for ease of use + /////////////////////////////////////////////////////////////////////////////// + typedef ac_int<1, true> int1; + typedef ac_int<1, false> uint1; + typedef ac_int<2, true> int2; + typedef ac_int<2, false> uint2; + typedef ac_int<3, true> int3; + typedef ac_int<3, false> uint3; + typedef ac_int<4, true> int4; + typedef ac_int<4, false> uint4; + typedef ac_int<5, true> int5; + typedef ac_int<5, false> uint5; + typedef ac_int<6, true> int6; + typedef ac_int<6, false> uint6; + typedef ac_int<7, true> int7; + typedef ac_int<7, false> uint7; + typedef ac_int<8, true> int8; + typedef ac_int<8, false> uint8; + typedef ac_int<9, true> int9; + typedef ac_int<9, false> uint9; + typedef ac_int<10, true> int10; + typedef ac_int<10, false> uint10; + typedef ac_int<11, true> int11; + typedef ac_int<11, false> uint11; + typedef ac_int<12, true> int12; + typedef ac_int<12, false> uint12; + typedef ac_int<13, true> int13; + typedef ac_int<13, false> uint13; + typedef ac_int<14, true> int14; + typedef ac_int<14, false> uint14; + typedef ac_int<15, true> int15; + typedef ac_int<15, false> uint15; + typedef ac_int<16, true> int16; + typedef ac_int<16, false> uint16; + typedef ac_int<17, true> int17; + typedef ac_int<17, false> uint17; + typedef ac_int<18, true> int18; + typedef ac_int<18, false> uint18; + typedef ac_int<19, true> int19; + typedef ac_int<19, false> uint19; + typedef ac_int<20, true> int20; + typedef ac_int<20, false> uint20; + typedef ac_int<21, true> int21; + typedef ac_int<21, false> uint21; + typedef ac_int<22, true> int22; + typedef ac_int<22, false> uint22; + typedef ac_int<23, true> int23; + typedef ac_int<23, false> uint23; + typedef ac_int<24, true> int24; + typedef ac_int<24, false> uint24; + typedef ac_int<25, true> int25; + typedef ac_int<25, false> uint25; + typedef ac_int<26, true> int26; + typedef ac_int<26, false> uint26; + typedef ac_int<27, true> int27; + typedef ac_int<27, false> uint27; + typedef ac_int<28, true> int28; + typedef ac_int<28, false> uint28; + typedef ac_int<29, true> int29; + typedef ac_int<29, false> uint29; + typedef ac_int<30, true> int30; + typedef ac_int<30, false> uint30; + typedef ac_int<31, true> int31; + typedef ac_int<31, false> uint31; + typedef ac_int<32, true> int32; + typedef ac_int<32, false> uint32; + typedef ac_int<33, true> int33; + typedef ac_int<33, false> uint33; + typedef ac_int<34, true> int34; + typedef ac_int<34, false> uint34; + typedef ac_int<35, true> int35; + typedef ac_int<35, false> uint35; + typedef ac_int<36, true> int36; + typedef ac_int<36, false> uint36; + typedef ac_int<37, true> int37; + typedef ac_int<37, false> uint37; + typedef ac_int<38, true> int38; + typedef ac_int<38, false> uint38; + typedef ac_int<39, true> int39; + typedef ac_int<39, false> uint39; + typedef ac_int<40, true> int40; + typedef 
ac_int<40, false> uint40; + typedef ac_int<41, true> int41; + typedef ac_int<41, false> uint41; + typedef ac_int<42, true> int42; + typedef ac_int<42, false> uint42; + typedef ac_int<43, true> int43; + typedef ac_int<43, false> uint43; + typedef ac_int<44, true> int44; + typedef ac_int<44, false> uint44; + typedef ac_int<45, true> int45; + typedef ac_int<45, false> uint45; + typedef ac_int<46, true> int46; + typedef ac_int<46, false> uint46; + typedef ac_int<47, true> int47; + typedef ac_int<47, false> uint47; + typedef ac_int<48, true> int48; + typedef ac_int<48, false> uint48; + typedef ac_int<49, true> int49; + typedef ac_int<49, false> uint49; + typedef ac_int<50, true> int50; + typedef ac_int<50, false> uint50; + typedef ac_int<51, true> int51; + typedef ac_int<51, false> uint51; + typedef ac_int<52, true> int52; + typedef ac_int<52, false> uint52; + typedef ac_int<53, true> int53; + typedef ac_int<53, false> uint53; + typedef ac_int<54, true> int54; + typedef ac_int<54, false> uint54; + typedef ac_int<55, true> int55; + typedef ac_int<55, false> uint55; + typedef ac_int<56, true> int56; + typedef ac_int<56, false> uint56; + typedef ac_int<57, true> int57; + typedef ac_int<57, false> uint57; + typedef ac_int<58, true> int58; + typedef ac_int<58, false> uint58; + typedef ac_int<59, true> int59; + typedef ac_int<59, false> uint59; + typedef ac_int<60, true> int60; + typedef ac_int<60, false> uint60; + typedef ac_int<61, true> int61; + typedef ac_int<61, false> uint61; + typedef ac_int<62, true> int62; + typedef ac_int<62, false> uint62; + typedef ac_int<63, true> int63; + typedef ac_int<63, false> uint63; +} // namespace ac_intN + +#ifndef AC_NOT_USING_INTN +using namespace ac_intN; +#endif + +/////////////////////////////////////////////////////////////////////////////// + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( disable: 4700 ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wuninitialized" +#endif + +// Global templatized functions for easy initialization to special values +template +inline ac_int value(ac_int) { + ac_int r; + return r.template set_val(); +} +// forward declaration, otherwise GCC errors when calling init_array +template +inline ac_fixed value(ac_fixed); + +#define SPECIAL_VAL_FOR_INTS_DC(C_TYPE, WI, SI) \ +template<> inline C_TYPE value(C_TYPE) { C_TYPE x; return x; } + +// -- C int types ----------------------------------------------------------------- +#define SPECIAL_VAL_FOR_INTS(C_TYPE, WI, SI) \ +template inline C_TYPE value(C_TYPE); \ +template<> inline C_TYPE value(C_TYPE) { return (C_TYPE)0; } \ +SPECIAL_VAL_FOR_INTS_DC(C_TYPE, WI, SI) \ +template<> inline C_TYPE value(C_TYPE) { return (C_TYPE)1; } \ +template<> inline C_TYPE value(C_TYPE) { return (C_TYPE)(SI ? ~(((C_TYPE) 1) << (WI-1)) : (C_TYPE) -1); } \ +template<> inline C_TYPE value(C_TYPE) { return (C_TYPE)(SI ? 
((C_TYPE) 1) << (WI-1) : (C_TYPE) 0); } + +SPECIAL_VAL_FOR_INTS(bool, 1, false) +SPECIAL_VAL_FOR_INTS(char, 8, true) +SPECIAL_VAL_FOR_INTS(signed char, 8, true) +SPECIAL_VAL_FOR_INTS(unsigned char, 8, false) +SPECIAL_VAL_FOR_INTS(short, 16, true) +SPECIAL_VAL_FOR_INTS(unsigned short, 16, false) +SPECIAL_VAL_FOR_INTS(int, 32, true) +SPECIAL_VAL_FOR_INTS(unsigned int, 32, false) +SPECIAL_VAL_FOR_INTS(long, ac_private::long_w, true) +SPECIAL_VAL_FOR_INTS(unsigned long, ac_private::long_w, false) +SPECIAL_VAL_FOR_INTS(Slong, 64, true) +SPECIAL_VAL_FOR_INTS(Ulong, 64, false) + +#define INIT_ARRAY_SPECIAL_VAL_FOR_INTS(C_TYPE) \ + template \ + inline bool init_array(C_TYPE *a, int n) { \ + C_TYPE t = value((C_TYPE) 0); \ + for(int i=0; i < n; i++) \ + a[i] = t; \ + return true; \ + } + +namespace ac { +// PUBLIC FUNCTIONS +// function to initialize (or uninitialize) arrays + template + inline bool init_array(ac_int *a, int n) { + ac_int t; + t.template set_val(); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } + + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(bool) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(char) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed char) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned char) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed short) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned short) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed int) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned int) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed long) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned long) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed long long) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned long long) +} + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( pop ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +#ifdef __AC_NAMESPACE +} +#endif + +#endif // __AC_INT_H diff --git a/hls4ml/templates/quartus/ac_types/ac_sc.h b/hls4ml/templates/quartus/ac_types/ac_sc.h index 01601a5a4a..0921471dc9 100644 --- a/hls4ml/templates/quartus/ac_types/ac_sc.h +++ b/hls4ml/templates/quartus/ac_types/ac_sc.h @@ -1,552 +1,552 @@ -/************************************************************************** - * * - * Algorithmic C (tm) Datatypes * - * * - * Software Version: 4.0 * - * * - * Release Date : Sat Jun 13 12:35:18 PDT 2020 * - * Release Type : Production Release * - * Release Build : 4.0.0 * - * * - * Copyright 2004-2019, Mentor Graphics Corporation, * - * * - * All Rights Reserved. * - * * - ************************************************************************** - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * - * implied. * - * See the License for the specific language governing permissions and * - * limitations under the License. * - ************************************************************************** - * * - * The most recent version of this package is available at github. 
* - * * - *************************************************************************/ - -#ifndef __AC_SC_H -#define __AC_SC_H - -#ifndef __cplusplus -#error C++ is required to include this header file -#endif - -#if !defined(IEEE_1666_SYSTEMC) && !defined(SYSTEMC_VERSION) && !defined(SC_API_VERSION_STRING) -#error SystemC header file needs to be included before the ac_sc is included -#endif - -#include - -#ifdef __AC_NAMESPACE -namespace __AC_NAMESPACE { -#endif - -// Explicit conversion functions from ac to sc and viceversa -template -ac_int to_ac(const sc_dt::sc_bigint &val){ - enum {N = (W+31)/32 }; - sc_dt::sc_bigint v = val; - ac_int r = 0; -#ifdef __SYNTHESIS__ -#pragma UNROLL y -#endif - for(int i = 0; i < N; i++) { - r.set_slc(i*32, ac_int<32,true>(v.to_int())); - v >>= 32; - } - return ac_int(r); -} - -template -ac_int to_ac(const sc_dt::sc_biguint &val){ - enum {N = (W+31)/32 }; - sc_dt::sc_biguint v = val; - ac_int r = 0; -#ifdef __SYNTHESIS__ -#pragma UNROLL y -#endif - for(int i = 0; i < N; i++) { - r.set_slc(i*32, ac_int<32,true>(v.to_int())); - v >>= 32; - } - return ac_int(r); -} - -template -sc_dt::sc_bigint to_sc(const ac_int &val) { - enum {N = (W+31)/32 }; - ac_int v = val; - sc_dt::sc_bigint r; -#ifdef __SYNTHESIS__ -#pragma UNROLL y -#endif - for(int i = N-1; i >= 0; i--) { - r <<= 32; - r.range(31, 0) = (v.template slc<32>(i*32)).to_int(); - } - return sc_dt::sc_bigint(r); -} - -template -sc_dt::sc_biguint to_sc(const ac_int &val) { - enum {N = (W+31)/32 }; - ac_int v = val; - sc_dt::sc_biguint r; -#ifdef __SYNTHESIS__ -#pragma UNROLL y -#endif - for(int i = N-1; i >= 0; i--) { - r <<= 32; - r.range(31, 0) = (v.template slc<32>(i*32)).to_int(); - } - return sc_dt::sc_biguint(r); -} - -#ifdef SC_INCLUDE_FX -template -ac_fixed to_ac(const sc_dt::sc_fixed &val){ - ac_fixed r = 0; - sc_dt::sc_fixed fv; - fv.range(W-1,0) = val.range(W-1,0); - sc_dt::sc_bigint v(fv); - r.set_slc(0, to_ac(v)); - return r; -} - -template -ac_fixed to_ac(const sc_dt::sc_ufixed &val){ - ac_fixed r = 0; - sc_dt::sc_ufixed fv; - fv.range(W-1,0) = val.range(W-1,0); - sc_dt::sc_biguint v(fv); - r.set_slc(0, to_ac(v)); - return r; -} - -template -sc_dt::sc_fixed to_sc(const ac_fixed &val) { - ac_int v = val.template slc(0); - sc_dt::sc_bigint i = to_sc(v); - sc_dt::sc_fixed f(i); - sc_dt::sc_fixed r; - r.range(W-1,0) = f.range(W-1,0); - return r; -} - -template -sc_dt::sc_ufixed to_sc(const ac_fixed &val) { - ac_int v = val.template slc(0); - sc_dt::sc_biguint i = to_sc(v); - sc_dt::sc_ufixed f(i); - sc_dt::sc_ufixed r; - r.range(W-1,0) = f.range(W-1,0); - return r; -} -#endif - -// Utility global functions for initialization - -template -inline sc_dt::sc_int value(sc_dt::sc_int) { - sc_dt::sc_int r; - if(V == AC_VAL_DC) { - int t; - r = t; - } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - r = 0; - if(V == AC_VAL_MIN) - r[W-1] = 1; - else if(V == AC_VAL_QUANTUM) - r[0] = 1; - } else if(AC_VAL_MAX) { - r = -1; - r[W-1] = 0; - } - return r; -} - -template -inline sc_dt::sc_uint value(sc_dt::sc_uint) { - sc_dt::sc_uint r; - if(V == AC_VAL_DC) { - int t; - r = t; - } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - r = 0; - if(V == AC_VAL_QUANTUM) - r[0] = 1; - } else if(AC_VAL_MAX) - r = -1; - return r; -} - -template -inline sc_dt::sc_bigint value(sc_dt::sc_bigint) { - sc_dt::sc_bigint r; - if(V == AC_VAL_DC) { - int t; - r = t; - } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - r = 0; - if(V == AC_VAL_MIN) - r[W-1] = 1; - else if(V == 
AC_VAL_QUANTUM) - r[0] = 1; - } else if(AC_VAL_MAX) { - r = -1; - r[W-1] = 0; - } - return r; -} - -template -inline sc_dt::sc_biguint value(sc_dt::sc_biguint) { - sc_dt::sc_biguint r; - if(V == AC_VAL_DC) { - int t; - r = t; - } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - r = 0; - if(V == AC_VAL_QUANTUM) - r[0] = 1; - } else if(AC_VAL_MAX) - r = -1; - return r; -} - -#ifdef SC_INCLUDE_FX -template -inline sc_dt::sc_fixed value(sc_dt::sc_fixed) { - sc_dt::sc_fixed r; - if(V == AC_VAL_DC) { - int t; - r = t; - } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - r = 0; - if(V == AC_VAL_MIN) - r[W-1] = 1; - else if(V == AC_VAL_QUANTUM) - r[0] = 1; - } else if(AC_VAL_MAX) { - r = ~ (sc_dt::sc_fixed) 0; - r[W-1] = 0; - } - return r; -} - -template -inline sc_dt::sc_ufixed value(sc_dt::sc_ufixed) { - sc_dt::sc_ufixed r; - if(V == AC_VAL_DC) { - int t; - r = t; - } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - r = 0; - if(V == AC_VAL_QUANTUM) - r[0] = 1; - } else if(AC_VAL_MAX) - r = ~ (sc_dt::sc_ufixed) 0; - return r; -} -#endif - - -namespace ac { -// PUBLIC FUNCTIONS -// function to initialize (or uninitialize) arrays - template - inline bool init_array(sc_dt::sc_int *a, int n) { - sc_dt::sc_int t = value(*a); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } - template - inline bool init_array(sc_dt::sc_uint *a, int n) { - sc_dt::sc_uint t = value(*a); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } - template - inline bool init_array(sc_dt::sc_bigint *a, int n) { - sc_dt::sc_bigint t = value(*a); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } - template - inline bool init_array(sc_dt::sc_biguint *a, int n) { - sc_dt::sc_biguint t = value(*a); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } -#ifdef SC_INCLUDE_FX - template - inline bool init_array(sc_dt::sc_fixed *a, int n) { - sc_dt::sc_fixed t = value(*a); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } - template - inline bool init_array(sc_dt::sc_ufixed *a, int n) { - sc_dt::sc_ufixed t = value(*a); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } -#endif -} - -#ifdef __AC_NAMESPACE -} -#endif - - -// TRACE FUNCTIONS - -// SystemC Versions - 2.2.0 20070314 -// 2.3.0 20120701 -// 2.3.1 20140417 -// 2.3.2 20171012 - -#if !defined(NCSC) -#if (SYSTEMC_VERSION >= 20140417) && !defined(SC_TRACE_FILE_BASE_H_INCLUDED_) -namespace sc_core { -class vcd_trace; -class sc_trace_file_base - : public sc_trace_file -{ -public: - enum vcd_enum {VCD_WIRE=0, VCD_REAL, VCD_EVENT, VCD_TIME, VCD_LAST}; - virtual void do_initialize() = 0; - FILE* fp; -#if (SYSTEMC_VERSION >= 20171012) - sc_time::value_type trace_unit_fs, kernel_unit_fs; -#else - double timescale_unit; -#endif - bool timescale_set_by_user; - std::string filename_; - bool initialized_; - bool trace_delta_cycles_; - virtual ~sc_trace_file_base(); -}; -class vcd_trace_file - : public sc_trace_file_base -{ -public: - ~vcd_trace_file(); - std::string obtain_name(); - virtual void do_initialize(); - unsigned vcd_name_index; -#if (SYSTEMC_VERSION >= 20171012) - sc_time::value_type previous_time_units_low, previous_time_units_high; -#else - unsigned previous_time_units_low, previous_time_units_high; -#endif - std::vector traces; -}; -} -#endif - -namespace sc_core { -//============================================================================== -// The following block of code is copied from the file sc_vcd_trace.cpp in the -// SystemC distribution. 
This code should have been placed in the file -// sc_vcd_trace.h to allow proper C++ derivation. -class vcd_trace -{ -public: - vcd_trace(const std::string& name_, const std::string& vcd_name_); - virtual void write(FILE* f) = 0; - virtual void set_width(); - virtual bool changed() = 0; -#if (SYSTEMC_VERSION >= 20171012) - virtual void print_variable_declaration_line(FILE* f, const char* scoped_name); -#else - virtual void print_variable_declaration_line(FILE* f); -#endif - void compose_data_line(char* rawdata, char* compdata); - -#if (SYSTEMC_VERSION >= 20140417) - std::string compose_line(const std::string& data); -#else - std::string compose_line(const std::string data); -#endif - virtual ~vcd_trace(); - const std::string name; - const std::string vcd_name; -#if (SYSTEMC_VERSION >= 20171012) - vcd_trace_file::vcd_enum vcd_var_type; -#else - const char* vcd_var_typ_name; -#endif - int bit_width; -}; -} -#endif - -#ifdef __AC_NAMESPACE -namespace __AC_NAMESPACE { -#endif - -namespace ac_tracing { - -//============================================================================== -// TRACING SUPPORT FOR AC_INT -template -class vcd_ac_int_trace : public sc_core::vcd_trace -{ -public: - vcd_ac_int_trace(const ac_int &object_, const std::string& name_, const std::string& vcd_name_) : - vcd_trace(name_, vcd_name_), object(object_) - { -#if (SYSTEMC_VERSION >= 20171012) - vcd_var_type = sc_core::vcd_trace_file::VCD_WIRE; -#else - vcd_var_typ_name = "wire"; // SystemC does not expose vcd_types[] in sc_vcd_trace.h -#endif - bit_width = W; // bit_width defined in base class 'vcd_trace' - } - - virtual void write(FILE* f) { - // The function to_string(AC_BIN) returns a string with the zero-radix prefix (i.e. "0b"). - // Strip that prefix off because compose_line will add its own. - std::fprintf(f, "%s", compose_line(((ac_int)object).to_string(AC_BIN,true).substr(3)).c_str()); - old_value = object; - } - - virtual void set_width() { bit_width = W; } - - // Comparison function needs to be pure virtual too - virtual bool changed() { return !(object == old_value); } - - virtual ~vcd_ac_int_trace() {} -protected: - const ac_int &object; - ac_int old_value; -}; - -template -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_int &a, const std::string &name) -{ - using namespace sc_core; - if (tf) { - vcd_trace *t = (vcd_trace*) new vcd_ac_int_trace(a,name,((vcd_trace_file*)tf)->obtain_name()); - ((vcd_trace_file*)tf)->traces.push_back(t); - } -} -//============================================================================== - -#if !defined(__AC_FIXED_MTI_H) -// The ac_fixed.h shipped with ModelSim/QuestaSim has a stub for sc_trace() for ac_fixed so -// this code is not used. The stub should be removed in a future release of the simulator. 
-#if defined(__AC_FIXED_H) && !defined(SC_TRACE_AC_FIXED) -#define SC_TRACE_AC_FIXED -//============================================================================== -// TRACING SUPPORT FOR AC_FIXED -template -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_fixed &a, const std::string &name) -{ - const int iv_N = (W+31+!S)/32; - typedef typename ac_private::template iv CommonBase_t; - sc_trace(tf, *(const ac_int*)(const CommonBase_t*) &a, name); -} -//============================================================================== -#endif -#endif - -#if defined(__AC_FLOAT_H) && !defined(SC_TRACE_AC_FLOAT) -#define SC_TRACE_AC_FLOAT -//============================================================================== -// TRACING SUPPORT FOR AC_FLOAT -template -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_float &a, const std::string &name) -{ - sc_trace(tf, a.m, name + ".m"); - sc_trace(tf, a.e, name + ".e"); -} -//============================================================================== -#endif - -#if defined(__AC_STD_FLOAT_H) && !defined(SC_TRACE_AC_STD_FLOAT) -#define SC_TRACE_AC_STD_FLOAT -//============================================================================== -// TRACING SUPPORT FOR AC_STD_FLOAT -template -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_std_float &a, const std::string &name) -{ - sc_trace(tf, a.data(), name + ".d"); -} -//============================================================================== -//============================================================================== -// TRACING SUPPORT FOR AC_IEEE_FLOAT -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) -{ - sc_trace(tf, a.data(), name + ".d"); -} -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) -{ - sc_trace(tf, *(const int*) &a.data(), name + ".d"); -} -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) -{ - sc_trace(tf, *(const long long*) &a.data(), name + ".d"); -} -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) -{ - sc_trace(tf, ((const long long*) &a.data())[0], name + ".d0"); - sc_trace(tf, ((const long long*) &a.data())[1], name + ".d1"); -} -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) -{ - sc_trace(tf, ((const long long*) &a.data())[0], name + ".d0"); - sc_trace(tf, ((const long long*) &a.data())[1], name + ".d1"); - sc_trace(tf, ((const long long*) &a.data())[2], name + ".d2"); - sc_trace(tf, ((const long long*) &a.data())[3], name + ".d3"); -} -// TRACING SUPPORT FOR AC::BFLOAT16 -inline void sc_trace(sc_core::sc_trace_file *tf, const ac::bfloat16 &a, const std::string &name) -{ - sc_trace(tf, a.data(), name + ".d"); -} -//============================================================================== -#endif - -#if defined(__AC_COMPLEX_H) && !defined(SC_TRACE_AC_COMPLEX) -#define SC_TRACE_AC_COMPLEX -//============================================================================== -// TRACING SUPPORT FOR AC_COMPLEX -template -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_complex &a, const std::string &name) -{ - sc_trace(tf, a.real(), name + ".r"); - sc_trace(tf, a.imag(), name + ".i"); -} -#endif - -} // namespace ac_tracing - -#ifdef __AC_NAMESPACE -} -#endif - -namespace sc_core { -#ifdef __AC_NAMESPACE - using __AC_NAMESPACE::ac_tracing::sc_trace; -#else - using ac_tracing::sc_trace; -#endif 
-} - -#endif +/************************************************************************** + * * + * Algorithmic C (tm) Datatypes * + * * + * Software Version: 4.0 * + * * + * Release Date : Sat Jun 13 12:35:18 PDT 2020 * + * Release Type : Production Release * + * Release Build : 4.0.0 * + * * + * Copyright 2004-2019, Mentor Graphics Corporation, * + * * + * All Rights Reserved. * + * * + ************************************************************************** + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * + * implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + ************************************************************************** + * * + * The most recent version of this package is available at github. * + * * + *************************************************************************/ + +#ifndef __AC_SC_H +#define __AC_SC_H + +#ifndef __cplusplus +#error C++ is required to include this header file +#endif + +#if !defined(IEEE_1666_SYSTEMC) && !defined(SYSTEMC_VERSION) && !defined(SC_API_VERSION_STRING) +#error SystemC header file needs to be included before the ac_sc is included +#endif + +#include + +#ifdef __AC_NAMESPACE +namespace __AC_NAMESPACE { +#endif + +// Explicit conversion functions from ac to sc and viceversa +template +ac_int to_ac(const sc_dt::sc_bigint &val){ + enum {N = (W+31)/32 }; + sc_dt::sc_bigint v = val; + ac_int r = 0; +#ifdef __SYNTHESIS__ +#pragma UNROLL y +#endif + for(int i = 0; i < N; i++) { + r.set_slc(i*32, ac_int<32,true>(v.to_int())); + v >>= 32; + } + return ac_int(r); +} + +template +ac_int to_ac(const sc_dt::sc_biguint &val){ + enum {N = (W+31)/32 }; + sc_dt::sc_biguint v = val; + ac_int r = 0; +#ifdef __SYNTHESIS__ +#pragma UNROLL y +#endif + for(int i = 0; i < N; i++) { + r.set_slc(i*32, ac_int<32,true>(v.to_int())); + v >>= 32; + } + return ac_int(r); +} + +template +sc_dt::sc_bigint to_sc(const ac_int &val) { + enum {N = (W+31)/32 }; + ac_int v = val; + sc_dt::sc_bigint r; +#ifdef __SYNTHESIS__ +#pragma UNROLL y +#endif + for(int i = N-1; i >= 0; i--) { + r <<= 32; + r.range(31, 0) = (v.template slc<32>(i*32)).to_int(); + } + return sc_dt::sc_bigint(r); +} + +template +sc_dt::sc_biguint to_sc(const ac_int &val) { + enum {N = (W+31)/32 }; + ac_int v = val; + sc_dt::sc_biguint r; +#ifdef __SYNTHESIS__ +#pragma UNROLL y +#endif + for(int i = N-1; i >= 0; i--) { + r <<= 32; + r.range(31, 0) = (v.template slc<32>(i*32)).to_int(); + } + return sc_dt::sc_biguint(r); +} + +#ifdef SC_INCLUDE_FX +template +ac_fixed to_ac(const sc_dt::sc_fixed &val){ + ac_fixed r = 0; + sc_dt::sc_fixed fv; + fv.range(W-1,0) = val.range(W-1,0); + sc_dt::sc_bigint v(fv); + r.set_slc(0, to_ac(v)); + return r; +} + +template +ac_fixed to_ac(const sc_dt::sc_ufixed &val){ + ac_fixed r = 0; + sc_dt::sc_ufixed fv; + fv.range(W-1,0) = val.range(W-1,0); + sc_dt::sc_biguint v(fv); + r.set_slc(0, to_ac(v)); + return r; +} + +template +sc_dt::sc_fixed to_sc(const ac_fixed &val) { + ac_int v = val.template slc(0); + sc_dt::sc_bigint i = to_sc(v); + sc_dt::sc_fixed f(i); + sc_dt::sc_fixed r; + 
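// [Editor's illustrative sketch -- not part of the vendored header.]
// to_ac()/to_sc() above move the raw two's-complement words between the
// SystemC and AC worlds 32 bits at a time (set_slc/slc plus to_int), so a
// round trip is value-preserving for equal widths. Assuming the usual
// template parameters (lost in this copy of the diff), hypothetically:
//
//   sc_dt::sc_biguint<40> sb = 12345;
//   ac_int<40,false> ai = to_ac(sb);        // SystemC -> AC
//   sc_dt::sc_biguint<40> sb2 = to_sc(ai);  // AC -> SystemC, sb2 == sb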
r.range(W-1,0) = f.range(W-1,0); + return r; +} + +template +sc_dt::sc_ufixed to_sc(const ac_fixed &val) { + ac_int v = val.template slc(0); + sc_dt::sc_biguint i = to_sc(v); + sc_dt::sc_ufixed f(i); + sc_dt::sc_ufixed r; + r.range(W-1,0) = f.range(W-1,0); + return r; +} +#endif + +// Utility global functions for initialization + +template +inline sc_dt::sc_int value(sc_dt::sc_int) { + sc_dt::sc_int r; + if(V == AC_VAL_DC) { + int t; + r = t; + } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + r = 0; + if(V == AC_VAL_MIN) + r[W-1] = 1; + else if(V == AC_VAL_QUANTUM) + r[0] = 1; + } else if(AC_VAL_MAX) { + r = -1; + r[W-1] = 0; + } + return r; +} + +template +inline sc_dt::sc_uint value(sc_dt::sc_uint) { + sc_dt::sc_uint r; + if(V == AC_VAL_DC) { + int t; + r = t; + } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + r = 0; + if(V == AC_VAL_QUANTUM) + r[0] = 1; + } else if(AC_VAL_MAX) + r = -1; + return r; +} + +template +inline sc_dt::sc_bigint value(sc_dt::sc_bigint) { + sc_dt::sc_bigint r; + if(V == AC_VAL_DC) { + int t; + r = t; + } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + r = 0; + if(V == AC_VAL_MIN) + r[W-1] = 1; + else if(V == AC_VAL_QUANTUM) + r[0] = 1; + } else if(AC_VAL_MAX) { + r = -1; + r[W-1] = 0; + } + return r; +} + +template +inline sc_dt::sc_biguint value(sc_dt::sc_biguint) { + sc_dt::sc_biguint r; + if(V == AC_VAL_DC) { + int t; + r = t; + } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + r = 0; + if(V == AC_VAL_QUANTUM) + r[0] = 1; + } else if(AC_VAL_MAX) + r = -1; + return r; +} + +#ifdef SC_INCLUDE_FX +template +inline sc_dt::sc_fixed value(sc_dt::sc_fixed) { + sc_dt::sc_fixed r; + if(V == AC_VAL_DC) { + int t; + r = t; + } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + r = 0; + if(V == AC_VAL_MIN) + r[W-1] = 1; + else if(V == AC_VAL_QUANTUM) + r[0] = 1; + } else if(AC_VAL_MAX) { + r = ~ (sc_dt::sc_fixed) 0; + r[W-1] = 0; + } + return r; +} + +template +inline sc_dt::sc_ufixed value(sc_dt::sc_ufixed) { + sc_dt::sc_ufixed r; + if(V == AC_VAL_DC) { + int t; + r = t; + } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + r = 0; + if(V == AC_VAL_QUANTUM) + r[0] = 1; + } else if(AC_VAL_MAX) + r = ~ (sc_dt::sc_ufixed) 0; + return r; +} +#endif + + +namespace ac { +// PUBLIC FUNCTIONS +// function to initialize (or uninitialize) arrays + template + inline bool init_array(sc_dt::sc_int *a, int n) { + sc_dt::sc_int t = value(*a); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } + template + inline bool init_array(sc_dt::sc_uint *a, int n) { + sc_dt::sc_uint t = value(*a); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } + template + inline bool init_array(sc_dt::sc_bigint *a, int n) { + sc_dt::sc_bigint t = value(*a); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } + template + inline bool init_array(sc_dt::sc_biguint *a, int n) { + sc_dt::sc_biguint t = value(*a); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } +#ifdef SC_INCLUDE_FX + template + inline bool init_array(sc_dt::sc_fixed *a, int n) { + sc_dt::sc_fixed t = value(*a); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } + template + inline bool init_array(sc_dt::sc_ufixed *a, int n) { + sc_dt::sc_ufixed t = value(*a); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } +#endif +} + +#ifdef __AC_NAMESPACE +} +#endif + + +// TRACE FUNCTIONS + +// SystemC Versions - 2.2.0 20070314 +// 2.3.0 20120701 +// 2.3.1 20140417 +// 2.3.2 20171012 + +#if 
!defined(NCSC) +#if (SYSTEMC_VERSION >= 20140417) && !defined(SC_TRACE_FILE_BASE_H_INCLUDED_) +namespace sc_core { +class vcd_trace; +class sc_trace_file_base + : public sc_trace_file +{ +public: + enum vcd_enum {VCD_WIRE=0, VCD_REAL, VCD_EVENT, VCD_TIME, VCD_LAST}; + virtual void do_initialize() = 0; + FILE* fp; +#if (SYSTEMC_VERSION >= 20171012) + sc_time::value_type trace_unit_fs, kernel_unit_fs; +#else + double timescale_unit; +#endif + bool timescale_set_by_user; + std::string filename_; + bool initialized_; + bool trace_delta_cycles_; + virtual ~sc_trace_file_base(); +}; +class vcd_trace_file + : public sc_trace_file_base +{ +public: + ~vcd_trace_file(); + std::string obtain_name(); + virtual void do_initialize(); + unsigned vcd_name_index; +#if (SYSTEMC_VERSION >= 20171012) + sc_time::value_type previous_time_units_low, previous_time_units_high; +#else + unsigned previous_time_units_low, previous_time_units_high; +#endif + std::vector traces; +}; +} +#endif + +namespace sc_core { +//============================================================================== +// The following block of code is copied from the file sc_vcd_trace.cpp in the +// SystemC distribution. This code should have been placed in the file +// sc_vcd_trace.h to allow proper C++ derivation. +class vcd_trace +{ +public: + vcd_trace(const std::string& name_, const std::string& vcd_name_); + virtual void write(FILE* f) = 0; + virtual void set_width(); + virtual bool changed() = 0; +#if (SYSTEMC_VERSION >= 20171012) + virtual void print_variable_declaration_line(FILE* f, const char* scoped_name); +#else + virtual void print_variable_declaration_line(FILE* f); +#endif + void compose_data_line(char* rawdata, char* compdata); + +#if (SYSTEMC_VERSION >= 20140417) + std::string compose_line(const std::string& data); +#else + std::string compose_line(const std::string data); +#endif + virtual ~vcd_trace(); + const std::string name; + const std::string vcd_name; +#if (SYSTEMC_VERSION >= 20171012) + vcd_trace_file::vcd_enum vcd_var_type; +#else + const char* vcd_var_typ_name; +#endif + int bit_width; +}; +} +#endif + +#ifdef __AC_NAMESPACE +namespace __AC_NAMESPACE { +#endif + +namespace ac_tracing { + +//============================================================================== +// TRACING SUPPORT FOR AC_INT +template +class vcd_ac_int_trace : public sc_core::vcd_trace +{ +public: + vcd_ac_int_trace(const ac_int &object_, const std::string& name_, const std::string& vcd_name_) : + vcd_trace(name_, vcd_name_), object(object_) + { +#if (SYSTEMC_VERSION >= 20171012) + vcd_var_type = sc_core::vcd_trace_file::VCD_WIRE; +#else + vcd_var_typ_name = "wire"; // SystemC does not expose vcd_types[] in sc_vcd_trace.h +#endif + bit_width = W; // bit_width defined in base class 'vcd_trace' + } + + virtual void write(FILE* f) { + // The function to_string(AC_BIN) returns a string with the zero-radix prefix (i.e. "0b"). + // Strip that prefix off because compose_line will add its own. 
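// [Editor's illustrative sketch -- not part of the vendored header.]
// vcd_ac_int_trace plugs ac_int into SystemC's VCD machinery: the sc_trace()
// overload defined a little further below allocates one of these per traced
// variable, and the kernel then polls changed()/write() on it. Hypothetical
// usage with the standard SystemC tracing API:
//
//   sc_core::sc_trace_file *tf = sc_core::sc_create_vcd_trace_file("waves");
//   ac_int<12,true> sig;
//   sc_trace(tf, sig, "sig");   // registers a vcd_ac_int_trace for sig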
+ std::fprintf(f, "%s", compose_line(((ac_int)object).to_string(AC_BIN,true).substr(3)).c_str()); + old_value = object; + } + + virtual void set_width() { bit_width = W; } + + // Comparison function needs to be pure virtual too + virtual bool changed() { return !(object == old_value); } + + virtual ~vcd_ac_int_trace() {} +protected: + const ac_int &object; + ac_int old_value; +}; + +template +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_int &a, const std::string &name) +{ + using namespace sc_core; + if (tf) { + vcd_trace *t = (vcd_trace*) new vcd_ac_int_trace(a,name,((vcd_trace_file*)tf)->obtain_name()); + ((vcd_trace_file*)tf)->traces.push_back(t); + } +} +//============================================================================== + +#if !defined(__AC_FIXED_MTI_H) +// The ac_fixed.h shipped with ModelSim/QuestaSim has a stub for sc_trace() for ac_fixed so +// this code is not used. The stub should be removed in a future release of the simulator. +#if defined(__AC_FIXED_H) && !defined(SC_TRACE_AC_FIXED) +#define SC_TRACE_AC_FIXED +//============================================================================== +// TRACING SUPPORT FOR AC_FIXED +template +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_fixed &a, const std::string &name) +{ + const int iv_N = (W+31+!S)/32; + typedef typename ac_private::template iv CommonBase_t; + sc_trace(tf, *(const ac_int*)(const CommonBase_t*) &a, name); +} +//============================================================================== +#endif +#endif + +#if defined(__AC_FLOAT_H) && !defined(SC_TRACE_AC_FLOAT) +#define SC_TRACE_AC_FLOAT +//============================================================================== +// TRACING SUPPORT FOR AC_FLOAT +template +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_float &a, const std::string &name) +{ + sc_trace(tf, a.m, name + ".m"); + sc_trace(tf, a.e, name + ".e"); +} +//============================================================================== +#endif + +#if defined(__AC_STD_FLOAT_H) && !defined(SC_TRACE_AC_STD_FLOAT) +#define SC_TRACE_AC_STD_FLOAT +//============================================================================== +// TRACING SUPPORT FOR AC_STD_FLOAT +template +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_std_float &a, const std::string &name) +{ + sc_trace(tf, a.data(), name + ".d"); +} +//============================================================================== +//============================================================================== +// TRACING SUPPORT FOR AC_IEEE_FLOAT +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) +{ + sc_trace(tf, a.data(), name + ".d"); +} +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) +{ + sc_trace(tf, *(const int*) &a.data(), name + ".d"); +} +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) +{ + sc_trace(tf, *(const long long*) &a.data(), name + ".d"); +} +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) +{ + sc_trace(tf, ((const long long*) &a.data())[0], name + ".d0"); + sc_trace(tf, ((const long long*) &a.data())[1], name + ".d1"); +} +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) +{ + sc_trace(tf, ((const long long*) &a.data())[0], name + ".d0"); + sc_trace(tf, ((const long long*) &a.data())[1], name + ".d1"); + sc_trace(tf, ((const long long*) 
&a.data())[2], name + ".d2"); + sc_trace(tf, ((const long long*) &a.data())[3], name + ".d3"); +} +// TRACING SUPPORT FOR AC::BFLOAT16 +inline void sc_trace(sc_core::sc_trace_file *tf, const ac::bfloat16 &a, const std::string &name) +{ + sc_trace(tf, a.data(), name + ".d"); +} +//============================================================================== +#endif + +#if defined(__AC_COMPLEX_H) && !defined(SC_TRACE_AC_COMPLEX) +#define SC_TRACE_AC_COMPLEX +//============================================================================== +// TRACING SUPPORT FOR AC_COMPLEX +template +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_complex &a, const std::string &name) +{ + sc_trace(tf, a.real(), name + ".r"); + sc_trace(tf, a.imag(), name + ".i"); +} +#endif + +} // namespace ac_tracing + +#ifdef __AC_NAMESPACE +} +#endif + +namespace sc_core { +#ifdef __AC_NAMESPACE + using __AC_NAMESPACE::ac_tracing::sc_trace; +#else + using ac_tracing::sc_trace; +#endif +} + +#endif diff --git a/hls4ml/templates/quartus/ac_types/ac_std_float.h b/hls4ml/templates/quartus/ac_types/ac_std_float.h index 3b335b971b..25ce8afc38 100644 --- a/hls4ml/templates/quartus/ac_types/ac_std_float.h +++ b/hls4ml/templates/quartus/ac_types/ac_std_float.h @@ -1,2318 +1,2318 @@ -/************************************************************************** - * * - * Algorithmic C (tm) Datatypes * - * * - * Software Version: 4.0 * - * * - * Release Date : Sat Jun 13 12:35:18 PDT 2020 * - * Release Type : Production Release * - * Release Build : 4.0.0 * - * * - * Copyright 2018-2020, Mentor Graphics Corporation, * - * * - * All Rights Reserved. * - * * - ************************************************************************** - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * - * implied. * - * See the License for the specific language governing permissions and * - * limitations under the License. * - ************************************************************************** - * * - * The most recent version of this package is available at github. * - * * - *************************************************************************/ - -/* Source: ac_std_float.h - * Description: class for floating point operation handling in C++ - * Author: Andres Takach, Ph.D. 
-
-Overview: this header defines three classes
-
-  ac_ieee_float
-    Meant to store floats in IEEE standard binary format
-    Format indicates width:
-      binary16: (half float) uses short
-      binary32: (float) uses int
-      binary64: (double) uses array of long long with one element
-      binary128: (long double in some platforms) uses array of long long with two elements
-      binary256: uses array of long long with four elements
-
-  ac::bfloat16
-    Implements Google's tensorflow::bfloat16
-    Stores data as "short"
-
-  ac_std_float
-    Superset of ac_ieee_float in that any bit width and exponent width is
-    allowed
-    This is used by ac_ieee_float and ac::bfloat16
-
-    Uses an ac_int that holds the bit pattern for a standard (IEEE) style binary
-    float:
-      1) sign-magnitude representation, sign is MSB
-      2) mantissa (significand) with implied bit for normal numbers
-      3) E is not restricted to IEEE widths, another class ac_ieee_float does that
-
-    Provides an easy way to convert to/from the closest covering ac_float:
-      Constructor from ac_float
-        The two most negative exponents of ac_float are not representable: shift
-          significand further to the right (for now no attempt to round)
-        Most negative mantissa of ac_float (in two's complement) when converted
-          to sign-magnitude requires a right shift (add +1 to exponent)
-          If exponent is already max, two alternatives:
-            - "saturate" (store most negative number)
-            - Store as -Inf (currently this option not available)
-        Exponent is offset
-        Mantissa implied bit is removed from normal numbers
-
-      Explicit conversion to_ac_float
-        Ignores exceptions (Inf, NaN)
-        Does inverse as above to obtain ac_float
-*/
-
-#ifndef __AC_STD_FLOAT_H
-#define __AC_STD_FLOAT_H
-#include <ac_int.h>
-#include <ac_float.h>
-// Inclusion of cmath undefs all macros such as signbit etc that some parsers may define for C
-#include <cmath>
-
-#ifdef __SYNTHESIS__
-#ifdef AC_IEEE_FLOAT_USE_BUILTIN
-#undef AC_IEEE_FLOAT_USE_BUILTIN
-#endif
-#endif
-
-#ifdef __AC_NAMESPACE
-namespace __AC_NAMESPACE {
-#endif
-
-// For now make data members public since SCVerify needs it
-//#ifdef __AC_MAKE_PRIVATE_DATA_PUBLIC
-#if 1
-#define __AC_DATA_PRIVATE public:
-#else
-#define __AC_DATA_PRIVATE private:
-#endif
-
-namespace ac_private {
-  template
-  struct check_rounding { enum {Only_symmetrical_roundings_or_truncations_supported}; };
-  template<> struct check_rounding {};
-
-  template
-  void check_supported() {
-    // only symmetrical roundings supported
-    const bool supported = Q==AC_RND_CONV || Q==AC_TRN_ZERO || Q==AC_RND_INF || Q == AC_RND_CONV_ODD;
-#if __cplusplus > 199711L
-    static_assert(supported, "Only symmetrical roundings/truncations supported");
-#else
-    (void) check_rounding::Only_symmetrical_roundings_or_truncations_supported;
-#endif
-  }
-
-  template
-  struct check_rounding2 { enum {Only_round_to_even_supported_when_using_BUILTIN}; };
-  template<> struct check_rounding2 {};
-
-  template
-  void check_supported2() {
-#ifdef AC_IEEE_FLOAT_USE_BUILTIN
-    const bool supported = Q==AC_RND_CONV;
-#if __cplusplus > 199711L
-    static_assert(supported, "Only round to even supported");
-#else
-    (void) check_rounding2::Only_round_to_even_supported_when_using_BUILTIN;
-#endif
-#endif
-  }
-
-  template
-  struct rt_closed_T {
-  };
-  template
-  struct rt_closed_T {
-    typedef T type;
-  };
-}
-
-namespace ac {
-  #pragma hls_design ccore
-  #pragma hls_ccore_type sequential
-  template
-  void fx_div(ac_int op1, ac_int op2, ac_int &quotient, bool &exact) {
-    ac_int R = op1;
-    bool R_neg = false;
-    ac_int D = op2;
-    ac_int neg_D = -D;
-    ac_int Q = 0;
-    for(int i=0; i < W+2; i++) {
-      // take MSB of N, shift it in from right to R
-      R += ( R_neg ? (ac_int) D : neg_D );
-      Q = (Q << 1) | ((R >= 0) & 1);
-      R_neg = R[W];
-      R <<= 1;
-    }
-    quotient = Q;
-    exact = !R | R_neg & (R >> 1) == neg_D;
-  }
-
-  template
-  void fx_div_sim(ac_int op1, ac_int op2, ac_int &quotient, bool &exact) {
-    // need to compute extra rnd bit,
-    //   +2 because we may need to shift left by 1 (mant divisor > mant dividend)
-    ac_int<2*W+1,false> op1_mi = op1;
-    op1_mi <<= W+1;
-    // +1 bit to compute rnd bit
-    quotient = (op1_mi / op2);
-    exact = !(op1_mi % op2);
-  }
-
-  #pragma hls_design ccore
-  #pragma hls_ccore_type sequential
-  template
-  bool fx_sqrt( ac_int x, ac_int &sqrt) {
-    // x is ac_fixed, sqrt is ac_fixed
-    const bool W_odd = W&1;
-    const int ZW = W + W_odd;  // make it even
-    ac_int z = x;
-    z <<= W_odd;
-    // masks used only to hint synthesis on precision
-    ac_int mask_d = 0;
-    ac_int d = 0;
-    ac_int r = 0;
-    unsigned int z_shift = ZW-2;
-    for(int i = WR-1; i >= 0; i--) {
-      r <<= 1;
-      mask_d = (mask_d << 2) | 0x3;
-      d = (mask_d & (d << 2)) | ((z >> z_shift) & 0x3 );
-      ac_int t = d - (( ((ac_int)r) << 1) | 0x1);
-      if( !t[WR+1] ) {  // since t is unsigned, look at MSB
-        r |= 0x1;
-        d = mask_d & t;
-      }
-      z <<= 2;
-    }
-
-    bool rem = (d != 0) || ((z >> 2*W) != 0);
-    sqrt = r;
-    return rem;
-  }
-}
-
-#ifndef AC_STD_FLOAT_FX_DIV_OVERRIDE
-#ifdef __SYNTHESIS__
-#define AC_STD_FLOAT_FX_DIV_OVERRIDE ac::fx_div
-#else
-#define AC_STD_FLOAT_FX_DIV_OVERRIDE ac::fx_div_sim
-#endif
-#endif
-
-template class ac_std_float;
-
-#ifdef __AC_NAMESPACE
-}
-#endif
-
-#ifdef AC_STD_FLOAT_OVERRIDE_NAMESPACE
-#define AC_STD_FLOAT_OVERRIDE_NS ::AC_STD_FLOAT_OVERRIDE_NAMESPACE::
-namespace AC_STD_FLOAT_OVERRIDE_NAMESPACE {
-#ifdef __AC_NAMESPACE
-  using __AC_NAMESPACE::ac_q_mode;
-  using __AC_NAMESPACE::ac_std_float;
-#endif
-#else
-#define AC_STD_FLOAT_OVERRIDE_NS
-#endif
-
-#ifdef AC_STD_FLOAT_ADD_OVERRIDE
-template
-ac_std_float AC_STD_FLOAT_ADD_OVERRIDE(const ac_std_float &op, const ac_std_float &op2);
-#endif
-
-#ifdef AC_STD_FLOAT_MULT_OVERRIDE
-template
-ac_std_float AC_STD_FLOAT_MULT_OVERRIDE(const ac_std_float &op, const ac_std_float &op2);
-#endif
-
-#ifdef AC_STD_FLOAT_DIV_OVERRIDE
-template
-ac_std_float AC_STD_FLOAT_DIV_OVERRIDE(const ac_std_float &op, const ac_std_float &op2);
-#endif
-
-#ifdef AC_STD_FLOAT_FMA_OVERRIDE
-template
-ac_std_float AC_STD_FLOAT_FMA_OVERRIDE(const ac_std_float &op, const ac_std_float &op2, const ac_std_float &op3);
-#endif
-
-#ifdef AC_STD_FLOAT_SQRT_OVERRIDE
-template
-ac_std_float AC_STD_FLOAT_SQRT_OVERRIDE(const ac_std_float &op);
-#endif
-
-#ifdef AC_STD_FLOAT_OVERRIDE_NAMESPACE
-}
-#endif
-
-#ifdef __AC_NAMESPACE
-namespace __AC_NAMESPACE {
-#endif
-
-namespace ac {
-  inline void copy_bits(float a, float *b) { *b = a; }
-  inline void copy_bits(double a, double *b) { *b = a; }
-
-  inline void copy_bits(short a, short *b) { *b = a; }
-  inline void copy_bits(const ac_int<16,true> &a, short *b) { *b = (short) a.to_int(); }
-  inline void copy_bits(short a, ac_int<16,true> *b) { *b = ac_int<16,true>(a); }
-  inline void copy_bits(int a, int *b) { *b = a; }
-  inline void copy_bits(const ac_int<32,true> &a, int *b) { *b = a.to_int(); }
-  inline void copy_bits(int a, ac_int<32,true> *b) { *b = ac_int<32,true>(a); }
-  inline void copy_bits(long long a, long long *b) { *b = a; }
-  inline void copy_bits(const ac_int<64,true> &a, long long *b) { *b = a.to_int64(); }
-  inline void copy_bits(long long a, ac_int<64,true> *b) { *b = ac_int<64,true>(a); }
-  inline void
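// [Editor's illustrative sketch -- not part of the vendored header.]
// fx_div (synthesizable, shift-and-subtract) and fx_div_sim (simulation)
// compute the same thing: the quotient of the dividend pre-shifted left by
// W+1, plus an exactness flag that later feeds the sticky/rounding bit.
// For small W the reference behavior is, in effect:
//
//   // unsigned long long n = (unsigned long long)op1 << (W + 1);
//   // quotient = n / op2;          // extra low bits give the rounding position
//   // exact    = (n % op2) == 0;   // zero remainder => no sticky bit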
copy_bits(const long long a[2], long long (*b)[2]) { - (*b)[0] = a[0]; - (*b)[1] = a[1]; - } - inline void copy_bits(const ac_int<128,true> &a, long long (*b)[2]) { - (*b)[0] = a.to_int64(); - (*b)[1] = a.slc<64>(64).to_int64(); - } - inline void copy_bits(const long long a[2], ac_int<128,true> *b) { - *b = 0; - b->set_slc(0,ac_int<64,true>(a[0])); - b->set_slc(64,ac_int<64,true>(a[1])); - } - inline void copy_bits(const long long a[4], long long (*b)[4]) { - (*b)[0] = a[0]; - (*b)[1] = a[1]; - (*b)[2] = a[2]; - (*b)[3] = a[3]; - } - inline void copy_bits(const ac_int<256,true> &a, long long (*b)[4]) { - (*b)[0] = a.to_int64(); - (*b)[1] = a.slc<64>(64).to_int64(); - (*b)[2] = a.slc<64>(128).to_int64(); - (*b)[3] = a.slc<64>(192).to_int64(); - } - inline void copy_bits(const long long a[4], ac_int<256,true> *b) { - *b = 0; - b->set_slc(0,ac_int<64,true>(a[0])); - b->set_slc(64,ac_int<64,true>(a[1])); - b->set_slc(128,ac_int<64,true>(a[2])); - b->set_slc(192,ac_int<64,true>(a[3])); - } - inline void copy_bits(float f, int *x); - inline void copy_bits(double f, long long *x); - inline void copy_bits(int x, float *f); - inline void copy_bits(long long x, double *f); - - inline void copy_bits(float f, ac_int<32,true> *x) { - int x_i; - copy_bits(f, &x_i); - *x = x_i; - } - inline void copy_bits(double f, ac_int<64,true> *x) { - long long x_i; - copy_bits(f, &x_i); - *x = x_i; - } - inline void copy_bits(const ac_int<32,true> &x, float *f) { copy_bits(x.to_int(), f); } - inline void copy_bits(const ac_int<64,true> &x, double *f) { copy_bits(x.to_int64(), f); } -} - -enum ac_ieee_float_format { binary16, binary32, binary64, binary128, binary256}; - -// Forward declarations for ac_ieee_float and bfloat16 -template -class ac_ieee_float; -namespace ac { - class bfloat16; -} - -template -class ac_std_float { -__AC_DATA_PRIVATE - ac_int d; -public: - static const int width = W; - static const int e_width = E; - static const int mant_bits = W - E - 1; - static const int exp_bias = (1 << (E-1)) - 1; - static const int min_exp = -exp_bias + 1; - static const int max_exp = exp_bias; - static const int mu_bits = mant_bits + 1; -private: - typedef ac_int mu_t; - typedef ac_int mu1_t; - typedef ac_int mu2_t; - typedef ac_int m_t; // mantissa in two's complement representation -public: - typedef ac_int e_t; - typedef ac_float ac_float_t; - static ac_std_float nan() { - ac_std_float r; - r.d = 0; - r.d.set_slc(mant_bits-1, ac_int(-1)); - return r; - } - static ac_std_float inf() { - ac_std_float r; - r.d = 0; - r.d.set_slc(mant_bits, ac_int(-1)); - return r; - } - static ac_std_float denorm_min() { // smallest positive non zero value (subnorm if supported) - ac_std_float r; - r.d = 1; - return r; - } - static ac_std_float min() { // smallest NORMAL positive non zero value - ac_std_float r; - r.d = 0; - r.d[width-1-e_width] = true; - return r; - } - static ac_std_float max() { // largest pos finite value - ac_std_float r; - r.d = -1; - r.d[width-1] = false; - r.d[width-1-e_width] = false; - return r; - } - static ac_std_float epsilon() { - ac_int exp = -mant_bits + exp_bias; - ac_std_float r; - r.d = 0; - r.d.set_slc(mant_bits, exp); - return r; - } - ac_std_float() {} - ac_std_float(const ac_std_float &f) : d(f.d) {} - template - ac_std_float convert() const { - ac_private::check_supported(); - ac_std_float r; - if(W <= WR) { - r.d = 0; - r.d.set_slc(WR-W, d); - } else { - typedef ac_std_float r_t; - const int r_mant_bits = r_t::mant_bits; - const int r_mu_bits = r_t::mu_bits; - e_t f_e = d.template 
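// [Editor's illustrative sketch -- not part of the vendored header.]
// The static generators above build the special encodings directly in the
// underlying ac_int bit pattern (all-ones exponent for inf/nan, lowest bit
// for denorm_min, and so on). For an IEEE-like single-precision instance:
//
//   typedef ac_std_float<32,8> f32;   // W=32, E=8 => 23 mantissa bits
//   f32 big  = f32::max();            // largest finite positive value
//   f32 tiny = f32::denorm_min();     // smallest positive subnormal
//   f32 bad  = f32::nan();            // quiet-NaN style bit pattern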
slc(mant_bits); - bool f_normal = !!f_e; - mu_t mu = d; - mu[r_mant_bits] = f_normal; - ac_fixed r_rnd = mu; - bool rnd_ovf = r_rnd[r_mu_bits]; - ac_int m_r = r_rnd.template slc(0); - e_t e_r = f_e + rnd_ovf; - r.d = m_r; - r.d.set_slc(r_mant_bits, e_r); - r.d[WR-1] = d[W-1]; - } - return r; - } - - template - ac_fixed convert_to_ac_fixed(bool map_inf=false) const { - static const bool rnd = QFX!=AC_TRN && QFX!=AC_TRN_ZERO; - static const bool need_rnd_bit = QFX != AC_TRN; - static const bool need_rem_bits = need_rnd_bit && QFX != AC_RND; - static const bool need_ovf = OFX != AC_WRAP; - static const int t_width = AC_MAX(mu_bits+1, WFX+!SFX) + need_rnd_bit + need_ovf; - - bool f_sign, f_normal, f_zero, f_inf, f_nan; - mu_t f_mu; - e_t f_e; - extract(f_mu, f_e, f_sign, f_normal, f_zero, f_inf, f_nan); - if(map_inf) { - ac_fixed rv; - if(f_sign) - rv.template set_val(); - else - rv.template set_val(); - return rv; - } - AC_ASSERT(!f_inf && !f_nan, "Expects finite float (not Nan or Inf)"); - m_t f_m = f_sign ? m_t(-f_mu) : m_t(f_mu); - typedef ac_int t_t; - typedef ac_int t2_t; - t_t t = f_m; - t <<= need_rnd_bit; - static const int lsb_src = -mant_bits; - static const int lsb_trg = IFX-WFX; - int rshift = lsb_trg - lsb_src - (int)f_e; - - bool sticky_bit_rnd = false; - bool rshift_neg = rshift < 0; - if(need_rem_bits) { - t_t shifted_out_bits = t; - typedef ac_int< ac::template nbits< AC_MAX(lsb_trg - lsb_src - min_exp,1) >::val, false> shift_ut; - shifted_out_bits &= ~(t_t(0).bit_complement() << (shift_ut) rshift); - sticky_bit_rnd = !!shifted_out_bits & !rshift_neg; - } - bool ovf = false; - if(need_ovf) { - t_t shifted_out_bits = t < 0 ? t_t(~t) : t; - // shift right by -rshift + 1 - // +1 is OK since added extra MSB - typedef ac_int< ac::template nbits< AC_MAX(-(lsb_trg - lsb_src - max_exp + 1),1) >::val, false> shift_ut; - shifted_out_bits &= ~((t_t(0).bit_complement() >> 2) >> (shift_ut) ~rshift); - ovf = !!shifted_out_bits & rshift_neg; - } - - t >>= rshift; - - t[t_width-1] = t[t_width-1] ^ (ovf & (t[t_width-1] ^ f_sign)); - t[t_width-2] = t[t_width-2] ^ (ovf & (t[t_width-2] ^ !f_sign)); - t2_t t2 = t; - if(need_rem_bits) { - t2 <<= 1; - t2[0] = t2[0] | sticky_bit_rnd; - } - - ac_fixed ri = t2; - ac_fixed r = 0; - r.set_slc(0,ri.template slc(0)); - return r; - } - - template - explicit ac_std_float(const ac_std_float &f) { - *this = f.template convert(); - } - template - ac_std_float convert() const { - ac_private::check_supported(); - typedef ac_std_float r_t; - typedef typename r_t::e_t r_e_t; - int const r_mu_bits = r_t::mu_bits; - int const r_mant_bits = r_t::mant_bits; - int const r_min_exp = r_t::min_exp; - int const r_max_exp = r_t::max_exp; - int const r_exp_bias = r_t::exp_bias; - bool f_sign, f_normal, f_zero, f_inf, f_nan; - mu_t f_mu; - e_t f_e; - r_t r; - extract(f_mu, f_e, f_sign, f_normal, f_zero, f_inf, f_nan); - int exp = f_e; - ac_fixed r_rnd; - if(ER >= E) { - if(ER > E && !f_normal) { - int ls = f_mu.leading_sign(); - int max_shift_left = f_e - r_min_exp + 1; - bool shift_exponent_limited = ls >= max_shift_left; - int shift_l = shift_exponent_limited ? 
max_shift_left : ls; - f_mu <<= shift_l; - exp -= shift_l; - } - r_rnd = f_mu; - } else { - int shift_r = r_min_exp - f_e; - typedef ac_fixed t_t; - t_t r_t = f_mu; - bool sticky_bit = !!(f_mu & ~((~mu_t(0)) << mant_bits-r_mant_bits-1)); - if(shift_r > 0) { - t_t shifted_out_bits = r_t; - shifted_out_bits &= ~((~t_t(0)) << shift_r); - sticky_bit |= !!shifted_out_bits; - r_t >>= shift_r; - exp += shift_r; - } - ac_fixed r_t2 = r_t; - r_t2[0] = sticky_bit; - r_rnd = r_t2; - } - bool rnd_ovf = r_rnd[r_mu_bits]; - ac_int r_m = r_rnd.template slc(0); - bool r_normal = r_rnd[r_mant_bits] | rnd_ovf; - exp += rnd_ovf; - bool exception = f_inf | f_nan | (exp > r_max_exp); - r_e_t r_e = exception ? -1 : (f_zero | !r_normal) ? 0 : exp + r_exp_bias; - if(exception) { - r_m = 0; - r_m[r_mant_bits-1] = f_nan; - } - r.d = r_m; - r.d.set_slc(r_mant_bits, r_e); - r.d[WR-1] = d[W-1]; - return r; - } - template - explicit ac_std_float(const ac_std_float &f) { - *this = f.template convert(); - } - template - explicit ac_std_float(const ac_ieee_float &f); - - explicit ac_std_float(const ac::bfloat16 &f); - - template - explicit ac_std_float(const ac_float &f) { - bool sign = f.mantissa() < 0; - m_t m_s = f.m.template slc(0); - mu1_t m_u = sign ? (mu1_t) -m_s : (mu1_t) m_s; - bool most_neg_m = m_u[mu_bits]; - bool is_max_exp = f.exp() == (1 << (E-1)) - 1; - ac_int e = f.exp() + exp_bias + (most_neg_m & !is_max_exp); - mu_t m = m_u | ac_int<1,true>(most_neg_m & is_max_exp); - m[mant_bits] = m[mant_bits] | most_neg_m; - bool exp_dont_map = !e | e==-1; - m >>= !e; - m >>= 2*(e==-1); - // exp_dont_map guarantees subnornal => e = 0 - e &= ac_int<1,true>(!exp_dont_map & !!m); - d = m.template slc(0); - d.set_slc(mant_bits, e); - d[W-1] = sign; - } - template - void assign_from(const ac_fixed &fx) { - ac_private::check_supported(); - bool sign = fx < 0.0; - ac_fixed x = 0; - x.set_slc(0,fx.template slc(0)); - bool all_sign; - int ls = x.leading_sign(all_sign); - int max_shift_left = IFX-1 - min_exp + 1; - bool shift_exponent_limited = ls >= max_shift_left; - int shift_l = shift_exponent_limited ? max_shift_left : ls; - ac_fixed x_u = sign ? (ac_fixed) -x : (ac_fixed) x; - x_u <<= shift_l; - int exp = IFX-1; - exp -= shift_l; - ac_fixed m_rnd = x_u; - mu1_t m_u = 0; m_u.set_slc(0, m_rnd.template slc(0)); - bool shiftr1 = m_u[mu_bits]; // msb - bool r_normal = m_u[mu_bits] | m_u[mu_bits-1]; - m_u >>= shiftr1; - exp += shiftr1; - bool fx_zero = all_sign & !sign; - bool r_inf = (exp > max_exp) & !fx_zero; - if(Q==AC_TRN_ZERO) { - exp = r_inf ? max_exp + exp_bias : exp; - m_u |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf - r_inf = false; - } - e_t e = r_inf ? -1 : (!r_normal) ? 
0 : exp + exp_bias; - m_u &= ac_int<1,true>(!r_inf); - e &= ac_int<1,true>(r_normal); - d = m_u.template slc(0); - d.set_slc(mant_bits, e); - d[W-1] = sign; - } - template - void assign_from(const ac_int &x) { - this->template assign_from(ac_fixed(x)); - } - template - explicit ac_std_float(const ac_fixed &fx) { - assign_from(fx); - } - explicit ac_std_float(float f) { - const int w_bits = sizeof(f)*8; - const int m_bits = std::numeric_limits::digits; - const int e_bits = w_bits - m_bits; - ac_int t_i; - ac::copy_bits(f, &t_i); - ac_std_float t; - t.set_data(t_i); - *this = ac_std_float(t); - } - explicit ac_std_float(double f) { - const int w_bits = sizeof(f)*8; - const int m_bits = std::numeric_limits::digits; - const int e_bits = w_bits - m_bits; - ac_int t_i; - ac::copy_bits(f, &t_i); - ac_std_float t; - t.set_data(t_i); - *this = ac_std_float(t); - } - explicit ac_std_float(int x) { - *this = ac_std_float(ac_fixed<32,32,true>(x)); - } - explicit ac_std_float(long long x) { - *this = ac_std_float(ac_fixed<64,64,true>(x)); - } - const ac_int &data() const { return d; } - void set_data(const ac_int &data, bool assert_on_nan=false, bool assert_on_inf=false) { - d = data; - if(assert_on_nan) - AC_ASSERT(!isnan(), "Float is NaN"); - if(assert_on_inf) - AC_ASSERT(!isinf(), "Float is Inf"); - } - int fpclassify() const { - ac_int e = d.template slc(mant_bits); - if(e) { - if(e == -1) - return !(ac_int)d ? FP_INFINITE : FP_NAN; - else - return FP_NORMAL; - } - else - return !(ac_int)d ? FP_ZERO : FP_SUBNORMAL; - } - bool isfinite() const { - ac_int e = d.template slc(mant_bits); - return e != -1; - } - bool isnormal() const { - ac_int e = d.template slc(mant_bits); - return (e || !(ac_int)d)&& e != -1; - } - bool isnan() const { - if(isfinite()) - return false; - ac_int m = d; - return !!m; - } - bool isinf() const { - if(isfinite()) - return false; - ac_int m = d; - return !m; - } - const ac_float to_ac_float() const { - ac_int e = d.template slc(mant_bits); - bool normal = !!e; - bool sign = d[W-1]; - bool inf = e == -1; - ac_int m = d; - ac_int m1 = m; - m1[mant_bits] = normal; - ac_int m_s = sign ? -m1 : (ac_int) m1; - ac_fixed fx = 0; - fx.set_slc(0, m_s); - e -= exp_bias; - // if number is subnormal, e will be MIN_EXP + 1 (10...01), but it needs to be - // MIN_EXP + 2 (10...010) - e[0] = e[0] & normal; - e[1] = e[1] | !normal; - // normalization by at most 2 places - bool shiftl1 = !(fx[mant_bits+1] ^ fx[mant_bits]); - bool shiftl2 = shiftl1 & !(fx[mant_bits+1] ^ fx[mant_bits-1]); - fx <<= shiftl1; - fx <<= shiftl2; - e -= shiftl1 + shiftl2; - e = inf ? value(e) : e; - fx = inf ? (sign ? 
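// [Editor's illustrative sketch -- not part of the vendored header.]
// fpclassify()/isfinite()/isnormal()/isnan()/isinf() above mirror the <cmath>
// predicates by decoding the stored exponent and mantissa fields directly:
//
//   ac_std_float<32,8> f(1.5f);   // via the explicit float constructor
//   int c = f.fpclassify();       // FP_NORMAL for 1.5
//   bool ok = f.isfinite() && !f.isnan() && !f.isinf();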
value(fx) : value(fx)) : fx; - return ac_float(fx, e, false); - } - float to_float() const { - ac_std_float<32,8> t(*this); - float f; - ac::copy_bits(t.d, &f); - return f; - } - double to_double() const { - ac_std_float<64,11> t(*this); - double f; - ac::copy_bits(t.d, &f); - return f; - } -private: - void extract(mu_t &m, e_t &e, bool &sign, bool &normal, bool &zero, bool &inf, bool &nan, bool biased_exp=false, bool no_subnormals=false) const { - e = d.template slc(mant_bits); - bool exception = e == -1; - normal = !!e | no_subnormals; - m = d; - bool m_zero = !m.template slc(0); - zero = (!e) & (no_subnormals | m_zero); - m[mant_bits] = !!e; - if(!biased_exp) { - e -= exp_bias; - e += !normal; - } - sign = d[W-1]; - inf = exception & m_zero; - nan = exception & !m_zero; - } -public: - static ac_std_float zero() { - ac_std_float r; - r.d = 0; - return r; - } - static ac_std_float one() { - ac_std_float r; - r.d = 0; - r.d.set_slc(mant_bits, ac_int(exp_bias)); - return r; - } - template - ac_std_float add_generic(const ac_std_float &op2) const { - ac_private::check_supported(); - // +1 for possible negation, +1 for bit growth due to addition - const int tr_t_iwidth = mu_bits + 1 + 1; - // extra bit for rounding, extra bit for left shift - const int tr_t_width = tr_t_iwidth + 1 + 1; - typedef ac_fixed add_t; - typedef ac_fixed r_un_t; - e_t op1_e, op2_e; - bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero; - bool op1_inf, op1_nan, op2_inf, op2_nan; - mu_t op1_mu, op2_mu; - extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); - m_t op1_m = op1_sign ? m_t(-op1_mu) : m_t(op1_mu); - op1_m &= m_t(No_SubNormals & op1_zero ? 0 : -1); - op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); - m_t op2_m = op2_sign ? m_t(-op2_mu) : m_t(op2_mu); - op2_m &= m_t(No_SubNormals & op2_zero ? 0 : -1); - - unsigned op1_e_b = ac_int(op1_e) + !op1_normal; - unsigned op2_e_b = ac_int(op2_e) + !op2_normal; - int e_dif = op1_e_b - op2_e_b; - bool e1_lt_e2 = e_dif < 0; - e_dif = (op1_zero | op2_zero) ? 0 : e1_lt_e2 ? -e_dif : e_dif; - - add_t op_lshift = e1_lt_e2 ? op1_m : op2_m; - m_t op_no_shift = e1_lt_e2 ? op2_m : op1_m; - add_t shifted_out_bits = op_lshift; - shifted_out_bits &= ~((~add_t(0)) << (unsigned) e_dif); - bool sticky_bit = !!shifted_out_bits; - - op_lshift >>= (unsigned) e_dif; - add_t add_r = op_lshift + op_no_shift; - int exp = ( (e1_lt_e2 & !op2_zero) | op1_zero ? op2_e_b : op1_e_b); - bool all_sign; - int ls = add_r.leading_sign(all_sign); - bool r_zero = !add_r[0] & all_sign; - // +1 to account for bit growth of add_r - int max_shift_left = exp + (- min_exp - exp_bias + 1); - bool shift_exponent_limited = ls >= max_shift_left; - int shift_l = shift_exponent_limited ? max_shift_left : ls; - add_r <<= shift_l; - add_r[0] = add_r[0] | sticky_bit; - ac_fixed r_rnd = add_r; - typedef ac_int t_h; - t_h t = add_r.to_ac_int(); - bool rnd_ovf = QR == AC_RND_CONV && t == t_h(-1); - bool r_sign = r_rnd[mu_bits] ^ rnd_ovf; - bool shift_r = rnd_ovf | (r_sign & !r_rnd.template slc(0)); - r_un_t r_un = r_sign ? (r_un_t) -r_rnd : (r_un_t) r_rnd; - // get rid of implied bit, assign to ac_int - bool r_normal = r_un[mant_bits] | shift_r; - r_zero |= No_SubNormals & !r_normal; - ac_int m_r = r_un.template slc(0); - exp = (shift_exponent_limited ? min_exp + exp_bias : exp - ls + 1) + shift_r; - bool r_inf = exp > max_exp + exp_bias; - if(QR==AC_TRN_ZERO) { - exp = r_inf ? 
max_exp + exp_bias : exp; - m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf - r_inf = false; - } - bool r_nan = op1_nan | op2_nan | ((op1_inf & op2_inf) & (op1_sign ^ op2_sign)); - bool exception = op1_inf | op2_inf | op1_nan | op2_nan | r_inf; - ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; - if(exception | r_zero) { - m_r = 0; - m_r[mant_bits-1] = r_nan; - } - ac_int d_r = m_r; - d_r.set_slc(mant_bits, e_r); - d_r[W-1] = r_sign; - ac_std_float r; - r.set_data(d_r); - return r; - } - template - ac_std_float add(const ac_std_float &op2) const { -#ifndef AC_STD_FLOAT_ADD_OVERRIDE - return add_generic(op2); -#else - return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_ADD_OVERRIDE(*this, op2); -#endif - } - template - ac_std_float sub(const ac_std_float &op2) const { - return add(-op2); - } - template - ac_std_float mult_generic(const ac_std_float &op2) const { - ac_private::check_supported(); - e_t op1_e, op2_e; - bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero; - bool op1_inf, op1_nan, op2_inf, op2_nan; - mu_t op1_mu, op2_mu; - extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); - op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); - bool r_sign = op1_sign ^ op2_sign; - bool r_nan = op1_nan | op2_nan | (op1_inf & op2_zero) | (op1_zero & op2_inf); - bool r_zero = op1_zero | op2_zero; // r_nan takes precedence later on - int exp = ac_int(op1_e) + ac_int(op2_e) + !op1_normal + !op2_normal - exp_bias; - ac_int<2*mu_bits,false> p = op1_mu * op2_mu; - int max_shift_left = exp + (- min_exp - exp_bias + 1); - int shift_l = 0; - bool shift_l_1 = false; - typedef ac_int t_h; - typedef ac_int t_l; - t_h p_h; - t_l p_l = p; - bool r_normal; - bool r_inf; - ac_fixed r_rnd; - ac_int m_r; - if(max_shift_left >= 0) { - r_inf = exp > max_exp + exp_bias; - bool exp_is_max = exp == max_exp + exp_bias; - bool exp_is_max_m1 = exp == max_exp + exp_bias - 1; - unsigned ls = No_SubNormals ? 0 : (unsigned) (op1_normal ? op2_mu : op1_mu).leading_sign(); - bool shift_exponent_limited = ls >= (unsigned) max_shift_left; - shift_l = shift_exponent_limited ? (unsigned) max_shift_left : ls; - p <<= (unsigned) shift_l; - exp -= shift_l; - shift_l_1 = !(shift_exponent_limited | p[2*mu_bits-1]); - p = shift_l_1 ? p << 1 : p; - exp += !shift_l_1; - p_h = p >> (mu_bits-1); - p_l &= (t_l(-1) >> shift_l) >> shift_l_1; - ac_int p_bef_rnd = p_h; - p_bef_rnd <<= 1; - p_bef_rnd[0] = !!p_l; - r_rnd = p_bef_rnd; - m_r = r_rnd.template slc(0); - bool rnd_ovf = QR == AC_RND_CONV && p_h == t_h(-1); - exp += rnd_ovf; - r_inf |= (exp_is_max & (!shift_l_1 | rnd_ovf)) | (exp_is_max_m1 & !shift_l_1 & rnd_ovf); - r_normal = r_rnd[mant_bits] | rnd_ovf; - r_zero |= !r_normal & No_SubNormals; - if(QR==AC_TRN_ZERO) { - exp = r_inf ? max_exp + exp_bias : exp; - m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf - r_inf = false; - } - } else { - shift_l = max_shift_left; - exp -= shift_l; - unsigned shift_r_m1 = ~shift_l; - p_h = p >> (mu_bits-1); - t_h shifted_out_bits = p_h; - shifted_out_bits &= ~((~t_h(1)) << shift_r_m1); - p_h >>= shift_r_m1; - p_h >>= 1; - ac_int p_bef_rnd = p_h; - p_bef_rnd <<= 1; - p_bef_rnd[0] = !!p_l | !!shifted_out_bits; - r_rnd = p_bef_rnd; - m_r = r_rnd.template slc(0); - r_normal = false; - r_inf = false; - r_zero |= No_SubNormals; - } - bool exception = op1_inf | op2_inf | op1_nan | op2_nan | r_inf; - ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 
0 : exp; - if(exception | r_zero) { - m_r = 0; - m_r[mant_bits-1] = r_nan; - } - ac_int d_r = m_r; - d_r.set_slc(mant_bits, e_r); - d_r[W-1] = r_sign; - ac_std_float r; - r.set_data(d_r); - return r; - } - template - ac_std_float mult(const ac_std_float &op2) const { -#ifndef AC_STD_FLOAT_MULT_OVERRIDE - return mult_generic(op2); -#else - return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_MULT_OVERRIDE(*this, op2); -#endif - } - template - ac_std_float div_generic(const ac_std_float &op2) const { - ac_private::check_supported(); - e_t op1_e, op2_e; - bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero; - bool op1_inf, op1_nan, op2_inf, op2_nan; - mu_t op1_mu, op2_mu; - extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); - op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); - bool r_sign = op1_sign ^ op2_sign; - int ls_op1 = No_SubNormals ? 0 : (unsigned) op1_mu.leading_sign(); - op1_mu <<= ls_op1; - int ls_op2 = No_SubNormals ? 0 : (unsigned) op2_mu.leading_sign(); - op2_mu <<= ls_op2; - int exp = ac_int(op1_e) - ac_int(op2_e) + !op1_normal - !op2_normal - ls_op1 + ls_op2 + exp_bias; - ac_int q0 = 0; - bool exact = true; - bool div_by_zero = op2_zero; -#ifdef __SYNTHESIS__ - div_by_zero = false; -#endif - if(!div_by_zero) { - AC_STD_FLOAT_FX_DIV_OVERRIDE(op1_mu, op2_mu, q0, exact); - } - ac_int q = q0; - q <<= 1; - int shift_r = min_exp + exp_bias - exp; - bool sticky_bit = !exact; - if(shift_r >= 0) { - typedef ac_int t_t; - t_t shifted_out_bits = q; - shifted_out_bits &= ~((~t_t(0)) << shift_r); - sticky_bit |= !!shifted_out_bits; - q >>= shift_r; - exp += shift_r; - } else { - bool shift_l = !q[mu_bits+2]; - q <<= shift_l; - exp -= shift_l; - } - q[0] = q[0] | sticky_bit; - ac_fixed r_rnd = q; - bool rnd_ovf = r_rnd[mu_bits]; - ac_int m_r = r_rnd.template slc(0); - bool r_normal = r_rnd[mant_bits] | rnd_ovf; - bool r_nan = op1_nan | op2_nan | (op1_zero & op2_zero) | (op1_inf & op2_inf); - bool r_zero = op1_zero | op2_inf; - r_zero |= !r_normal & No_SubNormals; - exp += rnd_ovf; - bool r_inf0 = op1_inf | op2_zero; // this is not affected by rounding - bool r_inf = (!r_zero & (exp > max_exp + exp_bias)) | r_inf0; - if(QR==AC_TRN_ZERO && !r_inf0) { - exp = r_inf ? max_exp + exp_bias : exp; - m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf - r_inf = false; - } - bool exception = r_nan | r_inf; - ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 
0 : exp; - if(exception | r_zero) { - m_r = 0; - m_r[mant_bits-1] = r_nan; - } - ac_int d_r = m_r; - d_r.set_slc(mant_bits, e_r); - d_r[W-1] = r_sign; - ac_std_float r; - r.set_data(d_r); - return r; - } - template - ac_std_float div(const ac_std_float &op2) const { -#ifndef AC_STD_FLOAT_DIV_OVERRIDE - return div_generic(op2); -#else - return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_DIV_OVERRIDE(*this, op2); -#endif - } - template - ac_std_float fma_generic(const ac_std_float &op2, const ac_std_float &op3) const { - ac_private::check_supported(); - e_t op1_e, op2_e, op3_e; - bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero, op3_normal, op3_sign, op3_zero; - bool op1_inf, op1_nan, op2_inf, op2_nan, op3_inf, op3_nan; - mu_t op1_mu, op2_mu, op3_mu; - extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); - op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); - op3.extract(op3_mu, op3_e, op3_sign, op3_normal, op3_zero, op3_inf, op3_nan, true, No_SubNormals); - if(No_SubNormals) - op3_mu &= mu_t(op3_zero ? 0 : -1); - bool mult_sign = (op1_sign ^ op2_sign) | (op1_zero & op2_inf) | (op1_inf & op1_zero); - bool mult_nan = op1_nan | op2_nan | (op1_zero & op2_inf) | (op1_inf & op2_zero); - bool mult_zero = op1_zero | op2_zero; // mult_nan has precedence later on - int mult_exp_b = ac_int(op1_e) + ac_int(op2_e) + !op1_normal + !op2_normal - exp_bias; - mult_exp_b |= ac_int( op1_inf | op2_inf ? -1 : 0 ); - ac_int<2*mu_bits,false> p = op1_mu * op2_mu; - if(No_SubNormals) - p &= ac_int<2*mu_bits,false>(mult_zero ? 0 : -1); - bool mult_inf = op1_inf | op2_inf; - - bool diff_signs = mult_sign ^ op3_sign; - bool toggle_r_sign = mult_sign; - m_t op3_m = diff_signs ? m_t(-op3_mu) : m_t(op3_mu); - unsigned op3_e_b = ac_int(op3_e) + !op3_normal; - - int e_dif = mult_exp_b - op3_e_b; - bool emult_lt_e3 = e_dif < 0; - e_dif = (mult_zero | op3_zero) ? 0 : emult_lt_e3 ? -e_dif : e_dif; - - typedef ac_int<2*mu_bits+4,true> add_t; - add_t op3_m_s = op3_m; - op3_m_s <<= mu_bits+1; // mult: ii.ffff, op3: i.ff - add_t p_s = p; - p_s <<= 2; - add_t op_lshift = emult_lt_e3 ? p_s : op3_m_s; - add_t op_no_shift = emult_lt_e3 ? op3_m_s : p_s; - - add_t shifted_out_bits = op_lshift; - shifted_out_bits &= ~((~add_t(0)) << (unsigned) e_dif); - bool sticky_bit = !!shifted_out_bits; - - op_lshift >>= (unsigned) e_dif; - add_t add_r = op_lshift + op_no_shift; - int exp = ( (emult_lt_e3 & !op3_zero) | mult_zero ? op3_e_b : mult_exp_b); - - bool all_sign; - int ls = add_r.leading_sign(all_sign); - // no bit growth of add_r - int max_shift_left = exp + (- min_exp - exp_bias + 2); - bool shift_exponent_limited = ls >= max_shift_left; - int shift_l = shift_exponent_limited ? max_shift_left : ls; - add_r <<= shift_l; - add_r[0] = add_r[0] | sticky_bit; - - ac_fixed r_rnd = add_r; - - typedef ac_int t_h; - t_h t = add_r.template slc(mu_bits+2); - bool rnd_ovf = QR == AC_RND_CONV && !add_r[2*mu_bits+3] && t == t_h(-1); - bool r_neg = r_rnd[mu_bits] ^ rnd_ovf; - bool r_sign = op3_inf ? op3_sign : mult_inf ? mult_sign : r_neg ^ toggle_r_sign; - ac_int r_rnd_i = r_rnd.template slc(0); - bool r_zero = !rnd_ovf & !r_rnd_i; - bool shift_r = rnd_ovf | (r_neg & !r_rnd_i.template slc(0)); - typedef ac_int r_un_t; - r_un_t r_un = r_neg ? 
(r_un_t) -r_rnd_i : (r_un_t) r_rnd_i; - // get rid of implied bit, assign to ac_int - bool r_normal = r_un[mant_bits] | shift_r; - r_zero |= No_SubNormals & !r_normal; - ac_int m_r = r_un.template slc(0); - exp = (shift_exponent_limited ? min_exp + exp_bias : exp - ls + 2) + shift_r; - bool r_inf = mult_inf | op3_inf | (exp > max_exp + exp_bias); - if(QR==AC_TRN_ZERO) { - exp = r_inf ? max_exp + exp_bias : exp; - m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf - r_inf = false; - } - bool r_nan = op3_nan | mult_nan | ((op3_inf & (op1_inf | op2_inf)) & (op3_sign ^ mult_sign)); - bool exception = op3_inf | mult_inf | op3_nan | mult_nan | r_inf; - ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; - if(exception | r_zero) { - m_r = 0; - m_r[mant_bits-1] = r_nan; - } - ac_int d_r = m_r; - d_r.set_slc(mant_bits, e_r); - d_r[W-1] = r_sign; - ac_std_float r; - r.set_data(d_r); - return r; - } - template - ac_std_float fma(const ac_std_float &op2, const ac_std_float &op3) const { -#ifndef AC_STD_FLOAT_FMA_OVERRIDE - return fma_generic(op2,op3); -#else - return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_FMA_OVERRIDE(*this,op2,op3); -#endif - } - template - ac_std_float sqrt_generic() const { - ac_private::check_supported(); - const bool rnd = QR != AC_TRN_ZERO; // need msb(rounded bits) - const bool rbits = QR != AC_TRN_ZERO; // need bits after msb(rounded bits) - e_t op1_e; - bool op1_normal, op1_sign, op1_zero; - bool op1_inf, op1_nan; - mu_t op1_mu; - extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); - int ls_op1 = No_SubNormals ? 0 : (unsigned) op1_mu.leading_sign(); - op1_mu <<= ls_op1; - op1_mu[mu_bits-1] = true; // Since it is normalized, zero is captured by op1_zero - - bool exp_odd = (op1_e ^ !op1_normal ^ ls_op1 ^ exp_bias) & 1; - - int exp = ac_int(op1_e) + !op1_normal - ls_op1 - exp_bias; - exp >>= 1; // divide by 2, truncate towards -inf - - ac_int op1_mi = op1_mu; - op1_mi <<= exp_odd; - ac_int sq_rt; - bool sticky_bit = ac::fx_sqrt(op1_mi, sq_rt); - bool r_normal = true; // true for most practical cases on W,E - if(mant_bits > -min_exp) { - int exp_over = min_exp - exp; - if(exp_over > 0) { - if(rbits) { - typedef ac_int t_t; - t_t shifted_out_bits = sq_rt; - shifted_out_bits &= ~((~t_t(0)) << exp_over); - sticky_bit |= !!shifted_out_bits; - } - sq_rt >>= exp_over; - exp = min_exp; - r_normal = false; - } - } - // rounding should not trigger overflow (unless truncate towards +inf which is currently not supported) - ac_fixed sq_rt_rnd = 0; - if(rbits) - sq_rt_rnd[0] = sq_rt_rnd[0] | sticky_bit; - sq_rt_rnd.set_slc(rbits, sq_rt); - ac_fixed sq_rt_fx = sq_rt_rnd; - - ac_int m_r = sq_rt_fx.template slc(0); - bool r_nan = op1_nan | (op1_sign & !op1_zero); - bool r_zero = op1_zero; - r_zero |= !r_normal & No_SubNormals; - bool r_inf = op1_inf; - bool exception = r_nan | r_inf; - exp += exp_bias; - ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 
0 : exp; - if(exception | r_zero) { - m_r = 0; - m_r[mant_bits-1] = r_nan; - } - ac_int d_r = m_r; - d_r.set_slc(mant_bits, e_r); - ac_std_float r; - r.set_data(d_r); - return r; - } - template - ac_std_float sqrt() const { -#ifndef AC_STD_FLOAT_SQRT_OVERRIDE - return sqrt_generic(); -#else - return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_SQRT_OVERRIDE(*this); -#endif - } - ac_std_float operator +(const ac_std_float &op2) const { - return add(op2); - } - ac_std_float operator -(const ac_std_float &op2) const { - return sub(op2); - } - ac_std_float operator *(const ac_std_float &op2) const { - return mult(op2); - } - ac_std_float operator /(const ac_std_float &op2) const { - return div(op2); - } - ac_std_float &operator +=(const ac_std_float &op2) { - *this = operator +(op2); - return *this; - } - ac_std_float &operator -=(const ac_std_float &op2) { - *this = operator -(op2); - return *this; - } - ac_std_float &operator *=(const ac_std_float &op2) { - *this = operator *(op2); - } - ac_std_float &operator /=(const ac_std_float &op2) { - *this = operator /(op2); - return *this; - } - bool operator ==(const ac_std_float &op2) const { - return ((d == op2.d) && !isnan()) || (operator !() && op2.operator !()); - } - bool operator !=(const ac_std_float &op2) const { - return !operator ==(op2); - } - bool magnitude_lt(const ac_std_float &op2) const { - return ac_int(d) < ac_int(op2.d); - } - bool neg() const { return d[W-1]; } - bool operator <(const ac_std_float &op2) const { - return - operator !=(op2) && ( (neg() && !op2.neg()) || (!(neg() ^ op2.neg()) && neg() ^ magnitude_lt(op2)) ) - && !isnan() && !op2.isnan(); - } - bool operator >=(const ac_std_float &op2) const { - return - (operator ==(op2) || (!neg() && op2.neg()) || (!(neg() ^ op2.neg()) && !neg() ^ magnitude_lt(op2)) ) - && !isnan() && !op2.isnan(); - } - bool operator >(const ac_std_float &op2) const { - return - operator !=(op2) - && ( (!neg() && op2.neg()) || (!(neg() ^ op2.neg()) && !neg() ^ magnitude_lt(op2)) ) - && !isnan() && !op2.isnan(); - } - bool operator <=(const ac_std_float &op2) const { - return - (operator == (op2) || (neg() && !op2.neg()) || (!neg() ^ op2.neg() && neg() ^ magnitude_lt(op2)) ) - && !isnan() && !op2.isnan(); - } - bool operator !() const { return !ac_int(d); } - ac_std_float operator -() const { - ac_std_float r(*this); - r.d[W-1] = !d[W-1]; - return r; - } - ac_std_float operator +() const { - return ac_std_float(*this); - } - ac_std_float abs() const { - ac_std_float r(*this); - r.d[W-1] = false; - return r; - } - ac_std_float copysign(const ac_std_float &op2) const { - ac_std_float r(*this); - r.d[W-1] = op2.d[W-1]; - return r; - } - bool signbit() const { - return d[W-1]; - } - void set_signbit(bool s) { - d[W-1] = s; - } - ac_std_float ceil() const { - ac_int e = d.template slc(mant_bits); - bool sign = d[W-1]; - if(!d.template slc(0)) - return *this; - if(e < exp_bias) { - return sign ? 
zero() : one(); - } else { - ac_std_float r(*this); - int e_dif = mant_bits + exp_bias - e; - if((e_dif < 0) | (e == ac_int(-1))) - return r; - else { - typedef ac_int mant_t; - mant_t m = d; - mant_t mask = (~mant_t(0)) << e_dif; - bool non_zero_fractional = !!(m & ~mask); - if(!sign) { - m |= ~mask; - mu_t mu = m + mant_t(non_zero_fractional); - e += mu[mant_bits]; - r.d.set_slc(mant_bits, e); - m = mu; - } - m &= mask; // truncate fractional bits - r.d.set_slc(0, m); - return r; - } - } - } - ac_std_float floor() const { - ac_int e = d.template slc(mant_bits); - bool sign = d[W-1]; - if(!d.template slc(0)) - return *this; - if(e < exp_bias) { - return sign ? -one() : zero(); - } else { - ac_std_float r(*this); - int e_dif = mant_bits + exp_bias - e; - if((e_dif < 0) | (e == ac_int(-1))) - return r; - else { - typedef ac_int mant_t; - mant_t m = d; - mant_t mask = (~mant_t(0)) << e_dif; - bool non_zero_fractional = !!(m & ~mask); - if(sign) { - m |= ~mask; - mu_t mu = m + mant_t(non_zero_fractional); - e += mu[mant_bits]; - r.d.set_slc(mant_bits, e); - m = mu; - } - m &= mask; // truncate fractional bits - r.d.set_slc(0, m); - return r; - } - } - } - ac_std_float trunc() const { - ac_int e = d.template slc(mant_bits); - if(e < exp_bias) { - return zero(); - } else { - ac_std_float r(*this); - int e_dif = mant_bits + exp_bias - e; - if((e_dif < 0) | (e == ac_int(-1))) - return r; - else { - typedef ac_int mant_t; - mant_t m = d; - mant_t mask = (~mant_t(0)) << e_dif; - m &= mask; // truncate fractional bits - r.d.set_slc(0, m); - return r; - } - } - } - ac_std_float round() const { - ac_int e = d.template slc(mant_bits); - if(e < exp_bias-1) { - return zero(); - } else { - ac_std_float r(*this); - int e_dif = mant_bits + exp_bias -1 - e; - if((e_dif < 0) | (e == ac_int(-1))) - return r; - else { - typedef ac_int mant_t; - mant_t m = d; - mant_t mask = (~mant_t(0)) << e_dif; - m |= ~mask; - mu_t mu = m + mant_t(1); - e += mu[mant_bits]; - r.d.set_slc(mant_bits, e); - m = mu; - m &= mask << 1; // truncate fractional bits - r.d.set_slc(0, m); - return r; - } - } - } -}; - -template -inline std::ostream& operator << (std::ostream &os, const ac_std_float &x) { - // for now just print the raw ac_int for it - os << x.data().to_string(AC_HEX); - return os; -} - -namespace ac { - // Type punning: using memcpy to avoid strict aliasing - inline void copy_bits(float f, int *x) { - std::memcpy(x, &f, sizeof(int)); - } - inline void copy_bits(double f, long long *x) { - std::memcpy(x, &f, sizeof(long long)); - } - inline void copy_bits(int x, float *f) { - std::memcpy(f, &x, sizeof(float)); - } - inline void copy_bits(long long x, double *f) { - std::memcpy(f, &x, sizeof(double)); - } - - inline void copy_bits(const ac_std_float<32,8> &x, float *f) { - copy_bits(x.data().to_int(), f); - } - inline void copy_bits(const ac_std_float<64,11> &x, double *f) { - copy_bits(x.data().to_int64(), f); - } -} - -template -class ac_ieee_float_base { -public: - static const int width = 1 << ((int)Format + 4); - // exponents are {5,8,11,15,19}, but the first three are specialized elsewhere - static const int e_width = 11 + ((int)Format - binary64)*4; // 11, 15, 19 - static const int lls = width >> 6; - typedef long long (data_t)[lls]; - typedef ac_std_float ac_std_float_t; - typedef ac_std_float helper_t; - typedef ac_float ac_float_t; - data_t d; - ac_ieee_float_base() {} - ac_ieee_float_base(const ac_ieee_float_base &f) { - ac::copy_bits(f.d, &d); - } - explicit ac_ieee_float_base(const helper_t &op) { - 
ac::copy_bits(op.data(), &d); - } - explicit ac_ieee_float_base(double f); -protected: - helper_t to_helper_t() const { - ac_int dat; - ac::copy_bits(d, &dat); - helper_t x; - x.set_data(dat); - return x; - } -public: - void set_data(const data_t &op) { ac::copy_bits(op, &d); } - void set_data(const ac_int &op) { ac::copy_bits(op, &d); } - const data_t &data() const { return d; } - ac_int data_ac_int() const { - ac_int x; - ac::copy_bits(d, &x); - return x; - } - bool signbit() const { return d[lls-1] < 0; } - void set_signbit(bool s) { - ac_int<64,true> t(d[lls-1]); - t[63] = s; - d[lls-1] = t.to_int64(); - } -}; - -template -inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { - // for now print the 128 and 256 as raw ac_int - os << x.data_ac_int().to_string(AC_HEX); - return os; -} - -template<> class ac_ieee_float_base { -public: - static const int width = 16; - static const int e_width = 5; - typedef ac_std_float ac_std_float_t; - typedef short data_t; - typedef ac_std_float helper_t; - typedef ac_float ac_float_t; - data_t d; - ac_ieee_float_base() {} - ac_ieee_float_base(const ac_ieee_float_base &f) : d(f.d) {} - explicit ac_ieee_float_base(const helper_t &op) : d(op.data()) {} - explicit ac_ieee_float_base(float f) : d((short)ac_std_float(f).data().to_int()) {} -protected: - helper_t to_helper_t() const { - helper_t x; - x.set_data(d); - return x; - } -public: - float to_float() const { - ac_std_float_t t; - t.set_data(this->data_ac_int()); - return t.to_float(); - } -#if __cplusplus > 199711L - explicit operator float() const { return to_float(); } -#endif - void set_data(short op) { ac::copy_bits(op, &d); } - void set_data(const ac_int &op) { ac::copy_bits(op, &d); } - const data_t &data() const { return d; } - ac_int data_ac_int() const { - ac_int x; - ac::copy_bits(d, &x); - return x; - } - bool signbit() const { return d < 0; } - void set_signbit(bool s) { - ac_int t(d); - t[width-1] = s; - d = t; - } -}; - -inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { - os << x.to_float(); - return os; -} - -struct float_helper { - float d; - float_helper() {} - float_helper(float f) { d = f; } - float_helper(const float_helper &f) { d = f.d; } - float_helper(const float_helper &f, bool no_subnormals) { - d = no_subnormals && f.fpclassify() == FP_SUBNORMAL ? std::signbit(f.d) ? 
-0.0 : 0.0 : f.d; - } - float_helper(const ac_std_float<32,8> &f) { set_data(f.data().to_int()); } - template - float_helper(const ac_float<25,2,8,Q> &f) : d(f.to_float()) {} - const float &data() const { return d; } - void set_data(int data) { ac::copy_bits(data, &d); } - void set_data(float data) { d = data; } - operator float() const { return d; } - float to_float() const { return d; } - int fpclassify() const { return std::fpclassify(d); } - bool isfinite() const { return std::isfinite(d); } - bool isnormal() const { return std::isnormal(d); } - bool isinf() const { return std::isinf(d); } - bool isnan() const { return std::isnan(d); } - static float nan() { return ac_std_float<32,8>::nan().to_float(); } - static float inf() { return ac_std_float<32,8>::inf().to_float(); } - static float denorm_min() { return ac_std_float<32,8>::denorm_min().to_float(); } - static float min() { return ac_std_float<32,8>::min().to_float(); } - static float max() { return ac_std_float<32,8>::max().to_float(); } - static float epsilon() { return ac_std_float<32,8>::epsilon().to_float(); } - template - float_helper add(const float_helper &op2) const { - ac_private::check_supported2(); - return float_helper( float_helper(*this, No_SubNormals) + float_helper(op2, No_SubNormals), No_SubNormals); - } - template - float_helper sub(const float_helper &op2) const { - ac_private::check_supported2(); - return float_helper( float_helper(*this, No_SubNormals) - float_helper(op2, No_SubNormals), No_SubNormals); - } - template - float_helper mult(const float_helper &op2) const { - ac_private::check_supported2(); - return float_helper( float_helper(*this, No_SubNormals) * float_helper(op2, No_SubNormals), No_SubNormals); - } - template - float_helper div(const float_helper &op2) const { - ac_private::check_supported2(); - return float_helper( float_helper(*this, No_SubNormals) / float_helper(op2, No_SubNormals), No_SubNormals); - } - template - float_helper fma(const float_helper &op2, const float_helper &op3) const { - ac_private::check_supported2(); - return float_helper( ::fmaf(float_helper(*this, No_SubNormals), float_helper(op2, No_SubNormals), float_helper(op3, No_SubNormals)), No_SubNormals); - } - template - float_helper sqrt() const { - ac_private::check_supported2(); - return float_helper( ::sqrtf(float_helper(*this, No_SubNormals)), No_SubNormals); - } - float_helper ceil() const { return float_helper(std::ceil(d)); } - float_helper floor() const { return float_helper(std::floor(d)); } - float_helper trunc() const { return float_helper(::truncf(d)); } - float_helper round() const { return float_helper(::roundf(d)); } -}; - -template<> class ac_ieee_float_base { -public: - static const int width = 32; - static const int e_width = 8; - typedef ac_std_float ac_std_float_t; -#ifdef AC_IEEE_FLOAT_USE_BUILTIN - typedef float data_t; - typedef float_helper helper_t; -#else - typedef int data_t; - typedef ac_std_float helper_t; -#endif - typedef ac_float ac_float_t; - data_t d; - ac_ieee_float_base() {} - ac_ieee_float_base(const ac_ieee_float_base &f) : d(f.d) {} - explicit ac_ieee_float_base(const helper_t &op) : d(op.data()) {} - explicit ac_ieee_float_base(float f) { ac::copy_bits(f, &d); } -protected: - helper_t to_helper_t() const { - helper_t x; - x.set_data(d); - return x; - } -public: -#if __cplusplus > 199711L - explicit operator float() const { - float f; - ac::copy_bits(d, &f); - return f; - } -#endif - float to_float() const { - float f; - ac::copy_bits(d, &f); - return f; - } - void set_data(int op) { 
ac::copy_bits(op, &d); } - void set_data(float op) { ac::copy_bits(op, &d); } - void set_data(const ac_int &op) { ac::copy_bits(op, &d); } - const data_t &data() const { return d; } - ac_int data_ac_int() const { - ac_int x; - ac::copy_bits(d, &x); - return x; - } - bool signbit() const { - int x; ac::copy_bits(d, &x); - return x < 0; - } - void set_signbit(bool s) { - ac_int t; - ac::copy_bits(d, &t); - t[width-1] = s; - ac::copy_bits(t, &d); - } -}; - -inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { - os << x.to_float(); - return os; -} - -struct double_helper { - double d; - double_helper() {} - double_helper(double f) { d = f; } - double_helper(const float_helper &f) { d = f.d; } - double_helper(const double_helper &f, bool no_subnormals) { - d = no_subnormals && f.fpclassify() == FP_SUBNORMAL ? std::signbit(f.d) ? -0.0 : 0.0 : f.d; - } - double_helper(const ac_std_float<64,11> &f) { set_data(f.data().to_int64()); } - template - double_helper(const ac_float<54,2,11,Q> &f) : d(f.to_double()) {} - const double &data() const { return d; } - void set_data(long long data) { - ac::copy_bits(data, &d); - } - void set_data(double data) { d = data; } - operator double() const { return d; } - double to_double() const { return d; } - int fpclassify() const { return std::fpclassify(d); } - bool isfinite() const { return std::isfinite(d); } - bool isnormal() const { return std::isnormal(d); } - bool isinf() const { return std::isinf(d); } - bool isnan() const { return std::isnan(d); } - static double nan() { return ac_std_float<64,11>::nan().to_double(); } - static double inf() { return ac_std_float<64,11>::inf().to_double(); } - static double denorm_min() { return ac_std_float<64,11>::denorm_min().to_double(); } - static double min() { return ac_std_float<64,11>::min().to_double(); } - static double max() { return ac_std_float<64,11>::max().to_double(); } - static double epsilon() { return ac_std_float<64,11>::epsilon().to_double(); } - template - double_helper add(const double_helper &op2) const { - ac_private::check_supported2(); - return double_helper( double_helper(*this, No_SubNormals) + double_helper(op2, No_SubNormals), No_SubNormals); - } - template - double_helper sub(const double_helper &op2) const { - ac_private::check_supported2(); - return double_helper( double_helper(*this, No_SubNormals) - double_helper(op2, No_SubNormals), No_SubNormals); - } - template - double_helper mult(const double_helper &op2) const { - ac_private::check_supported2(); - return double_helper( double_helper(*this, No_SubNormals) * double_helper(op2, No_SubNormals), No_SubNormals); - } - template - double_helper div(const double_helper &op2) const { - ac_private::check_supported2(); - return double_helper( double_helper(*this, No_SubNormals) / double_helper(op2, No_SubNormals), No_SubNormals); - } - template - double_helper fma(const double_helper &op2, const double_helper &op3) const { - ac_private::check_supported2(); - return double_helper( ::fma((double) double_helper(*this, No_SubNormals), (double) double_helper(op2, No_SubNormals), (double) double_helper(op3, No_SubNormals)), No_SubNormals); - } - template - double_helper sqrt() const { - ac_private::check_supported2(); - return double_helper( ::sqrt((double) double_helper(*this, No_SubNormals)), No_SubNormals); - } - double_helper ceil() const { return double_helper(std::ceil(d)); } - double_helper floor() const { return double_helper(std::floor(d)); } - double_helper trunc() const { return double_helper(::trunc(d)); } - 
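For intuition: both float_helper above and this double_helper flush subnormal operands and results to a signed zero whenever No_SubNormals is requested, before handing the values to the native arithmetic. A minimal standalone sketch of just that flushing step, using flush_subnormal as a hypothetical stand-in for the (value, no_subnormals) constructors:

    #include <cmath>
    #include <cstdio>

    // Hypothetical stand-in for the (value, no_subnormals) constructors above:
    // subnormal inputs collapse to a signed zero, everything else passes through.
    static double flush_subnormal(double x) {
      if (std::fpclassify(x) == FP_SUBNORMAL)
        return std::signbit(x) ? -0.0 : 0.0;
      return x;
    }

    int main() {
      double tiny = 4.9e-324;                      // smallest subnormal double
      std::printf("%g\n", flush_subnormal(tiny));  // prints 0: flushed
      std::printf("%g\n", flush_subnormal(1.5));   // prints 1.5: unchanged
      return 0;
    }

Keeping the sign of the flushed zero matters so that later sign-dependent steps (comparisons, copysign) still see the original sign bit.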
double_helper round() const { return double_helper(::round(d)); } -}; - -template<> class ac_ieee_float_base { -public: - static const int width = 64; - static const int e_width = 11; - typedef ac_std_float ac_std_float_t; -#ifdef AC_IEEE_FLOAT_USE_BUILTIN - typedef double data_t; - typedef double_helper helper_t; -#else - typedef long long data_t; - typedef ac_std_float helper_t; -#endif - typedef ac_float ac_float_t; - data_t d; - ac_ieee_float_base() {} - ac_ieee_float_base(const ac_ieee_float_base &f) : d(f.d) {} - explicit ac_ieee_float_base(const helper_t &op) : d(op.data()) {} - explicit ac_ieee_float_base(double f) { ac::copy_bits(f, &d); } -protected: - helper_t to_helper_t() const { - helper_t x; - x.set_data(d); - return x; - } -public: -#if __cplusplus > 199711L - explicit operator double() const { - double f; - ac::copy_bits(d, &f); - return f; - } -#endif - double to_double() const { - double f; - ac::copy_bits(d, &f); - return f; - } - void set_data(long long op) { ac::copy_bits(op, &d); } - void set_data(double op) { ac::copy_bits(op, &d); } - void set_data(const ac_int &op) { ac::copy_bits(op, &d); } - const data_t &data() const { return d; } - ac_int data_ac_int() const { - ac_int x; - ac::copy_bits(d, &x); - return x; - } - bool signbit() const { - long long x; ac::copy_bits(d, &x); - return x < 0; - } - void set_signbit(bool s) { - ac_int t; - ac::copy_bits(d, &t); - t[width-1] = s; - ac::copy_bits(t, &d); - } -}; - -inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { - os << x.to_double(); - return os; -} - -namespace ac_private { - template - struct ac_ieee_float_constructor {}; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; -} - -template -class ac_ieee_float : public ac_ieee_float_base { -public: - typedef ac_ieee_float_base Base; - template - struct rt_T { - typedef typename ac_private::rt_closed_T::type mult; - typedef typename ac_private::rt_closed_T::type plus; - typedef typename ac_private::rt_closed_T::type minus; - typedef typename ac_private::rt_closed_T::type minus2; - typedef typename ac_private::rt_closed_T::type logic; - typedef typename ac_private::rt_closed_T::type div; - typedef typename ac_private::rt_closed_T::type div2; - }; - struct rt_unary { - typedef ac_ieee_float neg; - typedef ac_ieee_float mag_sqr; - typedef ac_ieee_float mag; - }; - static const int width = Base::width; - static const int e_width = Base::e_width; - static const int lls = width >> 6; - typedef typename Base::data_t data_t; - typedef typename Base::helper_t helper_t; - typedef typename Base::ac_float_t ac_float_t; - typedef ac_std_float ac_std_float_t; -public: - static ac_ieee_float nan() { return ac_ieee_float(helper_t::nan()); } - static ac_ieee_float inf() { return ac_ieee_float(helper_t::inf()); } - 
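These static accessors simply forward to the helper_t constants and mirror std::numeric_limits. As a quick sanity check, a small sketch (assuming the AC datatypes headers are on the include path as <ac_std_float.h>) comparing the 32-bit ac_std_float constants with the native float limits:

    #include <ac_std_float.h>
    #include <limits>
    #include <cassert>

    int main() {
      typedef ac_std_float<32,8> F;  // same layout as IEEE binary32
      // Each constant should match the native float limits bit for bit.
      assert(F::epsilon().to_float()    == std::numeric_limits<float>::epsilon());
      assert(F::min().to_float()        == std::numeric_limits<float>::min());
      assert(F::max().to_float()        == std::numeric_limits<float>::max());
      assert(F::denorm_min().to_float() == std::numeric_limits<float>::denorm_min());
      return 0;
    }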
static ac_ieee_float denorm_min() { return ac_ieee_float(helper_t::denorm_min()); } - static ac_ieee_float min() { return ac_ieee_float(helper_t::min()); } - static ac_ieee_float max() { return ac_ieee_float(helper_t::max()); } - static ac_ieee_float epsilon() { return ac_ieee_float(helper_t::epsilon()); } - static ac_ieee_float zero() { return ac_ieee_float(ac_std_float_t::zero()); } - static ac_ieee_float one() { return ac_ieee_float(ac_std_float_t::one()); } - ac_ieee_float() {} -private: - ac_ieee_float(const Base &f) : Base(f) {} -public: - ac_ieee_float(const ac_std_float &f) : Base(f) {} - ac_ieee_float(const ac_ieee_float &f) : Base(f) {} - template - explicit ac_ieee_float(const ac_ieee_float &f) : Base(ac_std_float_t(f.to_ac_std_float())) {} - template - explicit ac_ieee_float(const ac_std_float &f) : Base(ac_std_float_t(f)) {} - explicit ac_ieee_float(const ac::bfloat16 &f); - explicit ac_ieee_float(const ac_float_t &f) : Base(ac_std_float_t(f)) {} - template - explicit ac_ieee_float(const ac_fixed &fx) : Base(ac_std_float_t(fx)) {} - template - explicit ac_ieee_float(const ac_float &f) : Base(ac_std_float_t(f)) {} - template - ac_ieee_float to_ac_ieee_float() const { return ac_ieee_float(*this); } - const ac_float_t to_ac_float() const { - return to_ac_std_float().to_ac_float(); - } - const ac_std_float to_ac_std_float() const { - ac_std_float r; - r.set_data(data_ac_int()); - return r; - } - template - ac_fixed convert_to_ac_fixed(bool map_inf=false) const { - return to_ac_std_float().template convert_to_ac_fixed(map_inf); - } - void set_data(const data_t &data) { - Base::set_data(data); - } - const ac_int data_ac_int() const { return Base::data_ac_int(); } - const data_t &data() const { return Base::d; } - template - ac_ieee_float(const T &f, typename ac_private::template ac_ieee_float_constructor::type d = 0) : Base(ac_std_float_t(f)) {} - template - explicit ac_ieee_float(const T &f, typename ac_private::template ac_ieee_float_constructor::type_explicit d = 0) : Base(ac_std_float_t(f)) {} - explicit ac_ieee_float(int x) { - *this = ac_ieee_float(ac_fixed<32,32,true>(x)); - } - explicit ac_ieee_float(long long x) { - *this = ac_ieee_float(ac_fixed<64,64,true>(x)); - } - int fpclassify() const { return Base::to_helper_t().fpclassify(); } - bool isfinite() const { return Base::to_helper_t().isfinite(); } - bool isnormal() const { return Base::to_helper_t().isnormal(); } - bool isinf() const { return Base::to_helper_t().isinf(); } - bool isnan() const { return Base::to_helper_t().isnan(); } - - template - ac_ieee_float add(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t().template add(op2.Base::to_helper_t()))); - } - template - ac_ieee_float sub(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t().template sub(op2.Base::to_helper_t()))); - } - template - ac_ieee_float mult(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t().template mult(op2.Base::to_helper_t()))); - } - template - ac_ieee_float div(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t().template div(op2.Base::to_helper_t()))); - } - template - ac_ieee_float fma(const ac_ieee_float &op2, const ac_ieee_float &op3) const { - return ac_ieee_float(Base(Base::to_helper_t().template fma(op2.Base::to_helper_t(), op3.Base::to_helper_t()))); - } - template - ac_ieee_float sqrt() const { - return ac_ieee_float(Base(Base::to_helper_t().template sqrt())); - } - - ac_ieee_float operator +(const ac_ieee_float 
&op2) const { - return ac_ieee_float(Base(Base::to_helper_t() + op2.Base::to_helper_t())); - } - ac_ieee_float operator -(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t() - op2.Base::to_helper_t())); - } - ac_ieee_float operator *(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t() * op2.Base::to_helper_t())); - } - ac_ieee_float operator /(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t() / op2.Base::to_helper_t())); - } - - ac_ieee_float &operator +=(const ac_ieee_float &op2) { - return *this = operator +(op2); - } - ac_ieee_float &operator -=(const ac_ieee_float &op2) { - return *this = operator -(op2); - } - ac_ieee_float &operator *=(const ac_ieee_float &op2) { - return *this = operator *(op2); - } - ac_ieee_float &operator /=(const ac_ieee_float &op2) { - return *this = operator /(op2); - } - - bool operator ==(const ac_ieee_float &op2) const { - return Base::to_helper_t() == op2.Base::to_helper_t(); - } - bool operator !=(const ac_ieee_float &op2) const { - return Base::to_helper_t() != op2.Base::to_helper_t(); - } - bool operator <(const ac_ieee_float &op2) const { - return Base::to_helper_t() < op2.Base::to_helper_t(); - } - bool operator >=(const ac_ieee_float &op2) const { - return Base::to_helper_t() >= op2.Base::to_helper_t(); - } - bool operator >(const ac_ieee_float &op2) const { - return Base::to_helper_t() > op2.Base::to_helper_t(); - } - bool operator <=(const ac_ieee_float &op2) const { - return Base::to_helper_t() <= op2.Base::to_helper_t(); - } - - ac_ieee_float operator -() const { - ac_ieee_float r(*this); - r.set_signbit(!this->signbit()); - return r; - } - ac_ieee_float operator +() const { - return ac_ieee_float(*this); - } - ac_ieee_float abs() const { - ac_ieee_float r(*this); - r.set_signbit(false); - return r; - } - ac_ieee_float copysign(const ac_ieee_float &op2) const { - ac_ieee_float r(*this); - r.set_signbit(this->signbit()); - return r; - } - bool signbit() const { return Base::signbit(); } - ac_ieee_float add(const ac_ieee_float &op1, const ac_ieee_float &op2) { - return *this = op1 + op2; - } - ac_ieee_float ceil() const { - return ac_ieee_float(Base(Base::to_helper_t().ceil())); - } - ac_ieee_float floor() const { - return ac_ieee_float(Base(Base::to_helper_t().floor())); - } - ac_ieee_float trunc() const { - return ac_ieee_float(Base(Base::to_helper_t().trunc())); - } - ac_ieee_float round() const { - return ac_ieee_float(Base(Base::to_helper_t().round())); - } - ac_ieee_float sub(const ac_ieee_float &op1, const ac_ieee_float &op2) { - return *this = op1 - op2; - } - ac_ieee_float mult(const ac_ieee_float &op1, const ac_ieee_float &op2) { - return *this = op1 * op2; - } - ac_ieee_float div(const ac_ieee_float &op1, const ac_ieee_float &op2) { - return *this = op1 / op2; - } -}; - -template -inline std::ostream& operator << (std::ostream &os, const ac_ieee_float &x) { - os << (const ac_ieee_float_base&) x; - return os; -} - -namespace ac { -class bfloat16 { -public: - template - struct rt_T { - typedef typename ac_private::rt_closed_T::type mult; - typedef typename ac_private::rt_closed_T::type plus; - typedef typename ac_private::rt_closed_T::type minus; - typedef typename ac_private::rt_closed_T::type minus2; - typedef typename ac_private::rt_closed_T::type logic; - typedef typename ac_private::rt_closed_T::type div; - typedef typename ac_private::rt_closed_T::type div2; - }; - struct rt_unary { - typedef bfloat16 neg; - typedef bfloat16 mag_sqr; - typedef 
bfloat16 mag; - }; - static const int width = 16; - static const int e_width = 8; - static bfloat16 nan() { return bfloat16(helper_t::nan()); } - static bfloat16 inf() { return bfloat16(helper_t::inf()); } - static bfloat16 denorm_min() { return bfloat16(helper_t::denorm_min()); } - static bfloat16 min() { return bfloat16(helper_t::min()); } - static bfloat16 max() { return bfloat16(helper_t::max()); } - static bfloat16 epsilon() { return bfloat16(helper_t::epsilon()); } - static bfloat16 zero() { return bfloat16(ac_std_float_t::zero()); } - static bfloat16 one() { return bfloat16(ac_std_float_t::one()); } - typedef ac_std_float helper_t; - typedef short data_t; - typedef ac_float ac_float_t; - typedef ac_std_float ac_std_float_t; - data_t d; - bfloat16() {} - bfloat16(const bfloat16 &f) : d(f.d) {} - bfloat16(const ac_std_float_t &op) : d(op.data()) {} - bfloat16(float f) { int x; ac::copy_bits(f, &x); d = (short) (x >> 16); } - template - explicit bfloat16(const ac_std_float &f) { - *this = f.template convert(); - } - template - explicit bfloat16(const ac_std_float &f) { - *this = f.template convert(); - } - template - explicit bfloat16(const ac_ieee_float &f) { - *this = f.to_ac_std_float().template convert(); - } - template - explicit bfloat16(const ac_fixed &fx) { - ac_std_float_t x; - x.assign_from(fx); - *this = x; - } -private: - const helper_t to_helper_t() const { - helper_t x; - x.set_data(d); - return x; - } -public: - const ac_std_float_t to_ac_std_float() const { - ac_std_float_t x; - x.set_data(d); - return x; - } - const ac_float_t to_ac_float() const { - return ac_std_float_t().to_ac_float(); - } - template - ac_fixed convert_to_ac_fixed(bool map_inf=false) const { - return to_ac_std_float().template convert_to_ac_fixed(map_inf); - } - float to_float() const { - return to_ac_std_float().to_float(); - } - double to_double() const { - return to_ac_std_float().to_double(); - } - // operator is efficient since E is identical and mantissa is longer -#if __cplusplus > 199711L - explicit operator float() const { return to_float(); } -#endif - int fpclassify() const { return to_helper_t().fpclassify(); } - bool isfinite() const { return to_helper_t().isfinite(); } - bool isnormal() const { return to_helper_t().isnormal(); } - bool isinf() const { return to_helper_t().isinf(); } - bool isnan() const { return to_helper_t().isnan(); } - void set_data(short op) { ac::copy_bits(op, &d); } - void set_data(const ac_int &op) { ac::copy_bits(op, &d); } - const data_t &data() const { return d; } - ac_int<16,true> data_ac_int() const { return ac_int<16,true>(d); } - - // mirroed most constructors in tensorflow implementation (except template version) - // tensorflow uses static_cast - // this implementation goes through ac_std_float so there is no dependency on rounding mode -// template -// explicit bfloat16(const T& val) { *this = bfloat16(static_cast(val)); } - explicit bfloat16(unsigned short val) { - ac_std_float_t t; - t.assign_from( ac_int<16,false>(val) ); - *this = t; - } - explicit bfloat16(int val) { - ac_std_float_t t; - t.assign_from( ac_int<32,true>(val) ); - *this = t; - } - explicit bfloat16(unsigned int val) { - ac_std_float_t t; - t.assign_from( ac_int<32,false>(val) ); - *this = t; - } - explicit bfloat16(long val) { - const int long_w = ac_private::long_w; - ac_std_float_t t; - t.assign_from( ac_int(val) ); - *this = t; - } - explicit bfloat16(long long val) { - ac_std_float_t t; - t.assign_from( ac_int<64,false>(val) ); - *this = t; - } - explicit bfloat16(double val) { 
*this = bfloat16(ac_ieee_float(val)); } - - template - bfloat16 add(const bfloat16 &op2) const { - return bfloat16(to_helper_t().add(op2.to_helper_t())); - } - template - bfloat16 sub(const bfloat16 &op2) const { - return bfloat16(to_helper_t().sub(op2.to_helper_t())); - } - template - bfloat16 mult(const bfloat16 &op2) const { - return bfloat16(to_helper_t().mult(op2.to_helper_t())); - } - template - bfloat16 div(const bfloat16 &op2) const { - return bfloat16(to_helper_t().div(op2.to_helper_t())); - } - template - bfloat16 fma(const bfloat16 &op2, const bfloat16 &op3) const { - return bfloat16(to_helper_t().fma(op2.to_helper_t(), op3.to_helper_t())); - } - template - bfloat16 sqrt() const { - return bfloat16(to_helper_t().sqrt()); - } - - bfloat16 operator +(const bfloat16 &op2) const { - return bfloat16(to_helper_t().add(op2.to_helper_t())); - } - bfloat16 operator -(const bfloat16 &op2) const { - return bfloat16(to_helper_t().sub(op2.to_helper_t())); - } - bfloat16 operator *(const bfloat16 &op2) const { - return bfloat16(to_helper_t().mult(op2.to_helper_t())); - } - bfloat16 operator /(const bfloat16 &op2) const { - return bfloat16(to_helper_t().div(op2.to_helper_t())); - } - bfloat16 &operator +=(const bfloat16 &op2) { - return *this = operator +(op2); - } - bfloat16 &operator -=(const bfloat16 &op2) { - return *this = operator -(op2); - } - bfloat16 &operator *=(const bfloat16 &op2) { - return *this = operator *(op2); - } - bfloat16 &operator /=(const bfloat16 &op2) { - return *this = operator /(op2); - } - - bool operator ==(const bfloat16 &op2) const { - return to_helper_t() == op2.to_helper_t(); - } - bool operator !=(const bfloat16 &op2) const { - return to_helper_t() != op2.to_helper_t(); - } - bool operator <(const bfloat16 &op2) const { - return to_helper_t() < op2.to_helper_t(); - } - bool operator >=(const bfloat16 &op2) const { - return to_helper_t() >= op2.to_helper_t(); - } - bool operator >(const bfloat16 &op2) const { - return to_helper_t() > op2.to_helper_t(); - } - bool operator <=(const bfloat16 &op2) const { - return to_helper_t() <= op2.to_helper_t(); - } - - bfloat16 operator -() const { - bfloat16 r(*this); - r.set_signbit(!this->signbit()); - return r; - } - bfloat16 operator +() const { - return bfloat16(*this); - } - bfloat16 abs() const { - bfloat16 r(*this); - r.set_signbit(false); - return r; - } - bfloat16 copysign(const bfloat16 &op2) const { - bfloat16 r(*this); - r.set_signbit(this->signbit()); - return r; - } - bool signbit() const { return d < 0; } - void set_signbit(bool s) { - ac_int t(d); - t[width-1] = s; - d = t; - } - bfloat16 ceil() const { return to_helper_t().ceil(); } - bfloat16 floor() const { return to_helper_t().floor(); } - bfloat16 trunc() const { return to_helper_t().trunc(); } - bfloat16 round() const { return to_helper_t().round(); } -}; - -inline std::ostream& operator << (std::ostream &os, const ac::bfloat16 &x) { - os << x.to_float(); - return os; -} - -} - -template -template -inline ac_std_float::ac_std_float(const ac_ieee_float &f) { - *this = ac_std_float(f.to_ac_std_float()); -} - -template -inline ac_std_float::ac_std_float(const ac::bfloat16 &f) { - *this = ac_std_float(f.to_ac_std_float()); -} - -template -inline ac_ieee_float::ac_ieee_float(const ac::bfloat16 &f) { - *this = ac_ieee_float(f.to_ac_std_float()); -} - -typedef ac_ieee_float ac_ieee_float16; -typedef ac_ieee_float ac_ieee_float32; -typedef ac_ieee_float ac_ieee_float64; -typedef ac_ieee_float ac_ieee_float128; -typedef ac_ieee_float ac_ieee_float256; - - 
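To show how the typedefs above are meant to be used, a brief hedged sketch (again assuming <ac_std_float.h> is available): the arithmetic operators use the default round-to-nearest-even behavior (AC_RND_CONV in this header), while the templated add/sub/mult/div methods let a caller pick any supported rounding per operation:

    #include <ac_std_float.h>
    #include <cstdio>

    int main() {
      ac_ieee_float32 a(1.5f);
      ac_ieee_float32 b(2.25f);
      ac_ieee_float32 sum = a + b;                  // default rounding (AC_RND_CONV)
      ac_ieee_float32 tz  = a.add<AC_TRN_ZERO>(b);  // truncate toward zero instead
      std::printf("%g %g\n", sum.to_float(), tz.to_float());
      return 0;
    }

With these exact inputs both results are 3.75; the rounding mode only matters when the exact result needs more significand bits than the format provides.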
-#ifdef __AC_NAMESPACE -} -#endif - -// Global functions for ac_ieee_float -namespace std { -#ifdef __AC_NAMESPACE -using namespace __AC_NAMESPACE; -#endif -template -inline ac_ieee_float abs(const ac_ieee_float &x) { return x.abs(); } -template -inline ac_ieee_float fabs(const ac_ieee_float &x) { return x.abs(); } - -template -inline ac_ieee_float copysign(const ac_ieee_float &x, const ac_ieee_float &y) { return x.copysign(y); } - -template -inline int fpclassify(const ac_ieee_float &x) { return x.fpclassify(); } -template -inline bool isfinite(const ac_ieee_float &x) { return x.isfinite(); } -template -inline bool isnormal(const ac_ieee_float &x) { return x.isnormal(); } -template -inline bool isinf(const ac_ieee_float &x) { return x.isinf(); } -template -inline bool isnan(const ac_ieee_float &x) { return x.isnan(); } - -// Don't do "long double" versions since they are 80-bits, it is an extended presicion -// TODO: fmod, fmodf, fmodl -// TODO: fmod, remainder, remquo, fma, fmax, fmin, fdim -// remainder(x,y), x - n*y, where n = x/y rounded to the nearest integer (RND_CONV) -// remquo(x,y, int *quo), returns same as remainder, unclear what quo is, also Nan, inf etc -// fmax, fmin: if one number is Nan, the other is returned -// fdim(x,y) returns max(x-y,0), if x or y is NaN, a NaN is returned, if result overflows, HUGE_VAL is returned -// TODO: ceil, floor, trunc, round, lround, nearbyint, rint, lrint, llround, llrint -// if x is +0, -0, NaN or Inf, x is returned -// ceil(x), floor(x), trunc(x) -// round(x) : RND_INF -// nearbyint: depends on rounding mode -// rint, same as nearbyint, but may raise inexaxt exception (FE_INEXACT) -// TODO: frexp, ldexp, modf, nextafter, nexttoward, copysign -// modf(x, *iptr), modff break into integral (*iptr) and fractional (returned) values, -// Don't cause exception: isgreater, isgreaterequal, isless, islessequal, islessgreater, isunordered -// isunordered: x or y is NaN -template -inline bool signbit(const ac_ieee_float &x) { return x.signbit(); } - -// Global functions for bfloat16 -inline bool signbit(const ac::bfloat16 &x) { return x.signbit(); } - -inline int fpclassify(const ac::bfloat16 &x) { return x.fpclassify(); } -inline bool isfinite(const ac::bfloat16 &x) { return x.isfinite(); } -inline bool isnormal(const ac::bfloat16 &x) { return x.isnormal(); } -inline bool isinf(const ac::bfloat16 &x) { return x.isinf(); } -inline bool isnan(const ac::bfloat16 &x) { return x.isnan(); } -} - -#undef __AC_DATA_PRIVATE -#undef AC_STD_FLOAT_FX_DIV_OVERRIDE - -#endif +/************************************************************************** + * * + * Algorithmic C (tm) Datatypes * + * * + * Software Version: 4.0 * + * * + * Release Date : Sat Jun 13 12:35:18 PDT 2020 * + * Release Type : Production Release * + * Release Build : 4.0.0 * + * * + * Copyright 2018-2020, Mentor Graphics Corporation, * + * * + * All Rights Reserved. * + * * + ************************************************************************** + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * + * implied. 
*
+ * See the License for the specific language governing permissions and *
+ * limitations under the License. *
+ **************************************************************************
+ * *
+ * The most recent version of this package is available at github. *
+ * *
+ *************************************************************************/
+
+/* Source: ac_std_float.h
+ * Description: class for floating point operation handling in C++
+ * Author: Andres Takach, Ph.D.
+
+Overview: this header defines three classes
+
+ ac_ieee_float
+ Meant to store floats in IEEE standard binary format
+ Format indicates width:
+ binary16: (half float) uses short
+ binary32: (float) uses int
+ binary64: (double) uses array of long long with one element
+ binary128: (long double in some platforms) uses array of long long with two elements
+ binary256: uses array of long long with four elements
+
+ ac::bfloat16
+ Implements Google's tensorflow::bfloat16
+ Stores data as "short"
+
+ ac_std_float
+ Superset of ac_ieee_float in that any bit width and exponent width is
+ allowed
+ This is used by ac_ieee_float and ac::bfloat16
+
+ Uses an ac_int that holds the bit pattern for a standard (IEEE) style binary
+ float:
+ 1) sign-magnitude representation, sign is MSB
+ 2) mantissa (significand) with implied bit for normal numbers
+ 3) E is not restricted to IEEE widths, another class ac_ieee_float does that
+
+ Provides an easy way to convert to/from the closest covering ac_float:
+ Constructor from ac_float
+ The two most negative exponents of ac_float are not representable: shift
+ significand further to the right (for now no attempt to round)
+ Most negative mantissa of ac_float (in two's complement) when converted
+ to sign-magnitude requires a right shift (add +1 to exponent)
+ If exponent is already max, two alternatives:
+ - "saturate" (store most negative number)
+ - Store as -Inf (currently this option not available)
+ Exponent is offset
+ Mantissa implied bit is removed from normal numbers
+
+ Explicit conversion to_ac_float
+ Ignores exceptions (Inf, NaN)
+ Does the inverse of the above to obtain the ac_float
+*/
+
+#ifndef __AC_STD_FLOAT_H
+#define __AC_STD_FLOAT_H
+#include <ac_int.h>
+#include <ac_float.h>
+// Inclusion of cmath undefs all macros such as signbit etc that some parsers may define for C
+#include <cmath>
+
+#ifdef __SYNTHESIS__
+#ifdef AC_IEEE_FLOAT_USE_BUILTIN
+#undef AC_IEEE_FLOAT_USE_BUILTIN
+#endif
+#endif
+
+#ifdef __AC_NAMESPACE
+namespace __AC_NAMESPACE {
+#endif
+
+// For now make data members public since SCVerify needs it
+//#ifdef __AC_MAKE_PRIVATE_DATA_PUBLIC
+#if 1
+#define __AC_DATA_PRIVATE public:
+#else
+#define __AC_DATA_PRIVATE private:
+#endif
+
+namespace ac_private {
+  template
+  struct check_rounding { enum {Only_symmetrical_roundings_or_truncations_supported}; };
+  template<> struct check_rounding {};
+
+  template
+  void check_supported() {
+    // only symmetrical roundings supported
+    const bool supported = Q==AC_RND_CONV || Q==AC_TRN_ZERO || Q==AC_RND_INF || Q == AC_RND_CONV_ODD;
+#if __cplusplus > 199711L
+    static_assert(supported, "Only symmetrical roundings/truncations supported");
+#else
+    (void) check_rounding::Only_symmetrical_roundings_or_truncations_supported;
+#endif
+  }
+
+  template
+  struct check_rounding2 { enum {Only_round_to_even_supported_when_using_BUILTIN}; };
+  template<> struct check_rounding2 {};
+
+  template
+  void check_supported2() {
+#ifdef AC_IEEE_FLOAT_USE_BUILTIN
+    const bool supported = Q==AC_RND_CONV;
+#if __cplusplus > 199711L
+    static_assert(supported, "Only round to even supported");
+#else
+    (void) check_rounding2::Only_round_to_even_supported_when_using_BUILTIN;
+#endif
+#endif
+  }
+
+  template
+  struct rt_closed_T {
+  };
+  template
+  struct rt_closed_T {
+    typedef T type;
+  };
+}
+
+namespace ac {
+  #pragma hls_design ccore
+  #pragma hls_ccore_type sequential
+  template
+  void fx_div(ac_int op1, ac_int op2, ac_int &quotient, bool &exact) {
+    ac_int R = op1;
+    bool R_neg = false;
+    ac_int D = op2;
+    ac_int neg_D = -D;
+    ac_int Q = 0;
+    for(int i=0; i < W+2; i++) {
+      // take MSB of N, shift it in from right to R
+      R += ( R_neg ? (ac_int) D : neg_D );
+      Q = (Q << 1) | ((R >= 0) & 1);
+      R_neg = R[W];
+      R <<= 1;
+    }
+    quotient = Q;
+    exact = !R | R_neg & (R >> 1) == neg_D;
+  }
+
+  template
+  void fx_div_sim(ac_int op1, ac_int op2, ac_int &quotient, bool &exact) {
+    // need to compute extra rnd bit,
+    // +2 because we may need to shift left by 1 (mant divisor > mant dividend)
+    ac_int<2*W+1,false> op1_mi = op1;
+    op1_mi <<= W+1;
+    // +1 bit to compute rnd bit
+    quotient = (op1_mi / op2);
+    exact = !(op1_mi % op2);
+  }
+
+  #pragma hls_design ccore
+  #pragma hls_ccore_type sequential
+  template
+  bool fx_sqrt( ac_int x, ac_int &sqrt) {
+    // x is ac_fixed, sqrt is ac_fixed
+    const bool W_odd = W&1;
+    const int ZW = W + W_odd; // make it even
+    ac_int z = x;
+    z <<= W_odd;
+    // masks used only to hint synthesis on precision
+    ac_int mask_d = 0;
+    ac_int d = 0;
+    ac_int r = 0;
+    unsigned int z_shift = ZW-2;
+    for(int i = WR-1; i >= 0; i--) {
+      r <<= 1;
+      mask_d = (mask_d << 2) | 0x3;
+      d = (mask_d & (d << 2)) | ((z >> z_shift) & 0x3 );
+      ac_int t = d - (( ((ac_int)r) << 1) | 0x1);
+      if( !t[WR+1] ) { // since t is unsigned, look at MSB
+        r |= 0x1;
+        d = mask_d & t;
+      }
+      z <<= 2;
+    }
+
+    bool rem = (d != 0) || ((z >> 2*W) != 0);
+    sqrt = r;
+    return rem;
+  }
+}
+
+#ifndef AC_STD_FLOAT_FX_DIV_OVERRIDE
+#ifdef __SYNTHESIS__
+#define AC_STD_FLOAT_FX_DIV_OVERRIDE ac::fx_div
+#else
+#define AC_STD_FLOAT_FX_DIV_OVERRIDE ac::fx_div_sim
+#endif
+#endif
+
+template class ac_std_float;
+
+#ifdef __AC_NAMESPACE
+}
+#endif
+
+#ifdef AC_STD_FLOAT_OVERRIDE_NAMESPACE
+#define AC_STD_FLOAT_OVERRIDE_NS ::AC_STD_FLOAT_OVERRIDE_NAMESPACE::
+namespace AC_STD_FLOAT_OVERRIDE_NAMESPACE {
+#ifdef __AC_NAMESPACE
+  using __AC_NAMESPACE::ac_q_mode;
+  using __AC_NAMESPACE::ac_std_float;
+#endif
+#else
+#define AC_STD_FLOAT_OVERRIDE_NS
+#endif
+
+#ifdef AC_STD_FLOAT_ADD_OVERRIDE
+template
+ac_std_float AC_STD_FLOAT_ADD_OVERRIDE(const ac_std_float &op, const ac_std_float &op2);
+#endif
+
+#ifdef AC_STD_FLOAT_MULT_OVERRIDE
+template
+ac_std_float AC_STD_FLOAT_MULT_OVERRIDE(const ac_std_float &op, const ac_std_float &op2);
+#endif
+
+#ifdef AC_STD_FLOAT_DIV_OVERRIDE
+template
+ac_std_float AC_STD_FLOAT_DIV_OVERRIDE(const ac_std_float &op, const ac_std_float &op2);
+#endif
+
+#ifdef AC_STD_FLOAT_FMA_OVERRIDE
+template
+ac_std_float AC_STD_FLOAT_FMA_OVERRIDE(const ac_std_float &op, const ac_std_float &op2, const ac_std_float &op3);
+#endif
+
+#ifdef AC_STD_FLOAT_SQRT_OVERRIDE
+template
+ac_std_float AC_STD_FLOAT_SQRT_OVERRIDE(const ac_std_float &op);
+#endif
+
+#ifdef AC_STD_FLOAT_OVERRIDE_NAMESPACE
+}
+#endif
+
+#ifdef __AC_NAMESPACE
+namespace __AC_NAMESPACE {
+#endif
+
+namespace ac {
+  inline void copy_bits(float a, float *b) { *b = a; }
+  inline void copy_bits(double a, double *b) { *b = a; }
+
+  inline void copy_bits(short a, short *b) { *b = a; }
+  inline void copy_bits(const ac_int<16,true> &a, short *b) { *b = (short) a.to_int(); }
+  inline void copy_bits(short a, ac_int<16,true> *b) { *b = 
ac_int<16,true>(a); } + inline void copy_bits(int a, int *b) { *b = a; } + inline void copy_bits(const ac_int<32,true> &a, int *b) { *b = a.to_int(); } + inline void copy_bits(int a, ac_int<32,true> *b) { *b = ac_int<32,true>(a); } + inline void copy_bits(long long a, long long *b) { *b = a; } + inline void copy_bits(const ac_int<64,true> &a, long long *b) { *b = a.to_int64(); } + inline void copy_bits(long long a, ac_int<64,true> *b) { *b = ac_int<64,true>(a); } + inline void copy_bits(const long long a[2], long long (*b)[2]) { + (*b)[0] = a[0]; + (*b)[1] = a[1]; + } + inline void copy_bits(const ac_int<128,true> &a, long long (*b)[2]) { + (*b)[0] = a.to_int64(); + (*b)[1] = a.slc<64>(64).to_int64(); + } + inline void copy_bits(const long long a[2], ac_int<128,true> *b) { + *b = 0; + b->set_slc(0,ac_int<64,true>(a[0])); + b->set_slc(64,ac_int<64,true>(a[1])); + } + inline void copy_bits(const long long a[4], long long (*b)[4]) { + (*b)[0] = a[0]; + (*b)[1] = a[1]; + (*b)[2] = a[2]; + (*b)[3] = a[3]; + } + inline void copy_bits(const ac_int<256,true> &a, long long (*b)[4]) { + (*b)[0] = a.to_int64(); + (*b)[1] = a.slc<64>(64).to_int64(); + (*b)[2] = a.slc<64>(128).to_int64(); + (*b)[3] = a.slc<64>(192).to_int64(); + } + inline void copy_bits(const long long a[4], ac_int<256,true> *b) { + *b = 0; + b->set_slc(0,ac_int<64,true>(a[0])); + b->set_slc(64,ac_int<64,true>(a[1])); + b->set_slc(128,ac_int<64,true>(a[2])); + b->set_slc(192,ac_int<64,true>(a[3])); + } + inline void copy_bits(float f, int *x); + inline void copy_bits(double f, long long *x); + inline void copy_bits(int x, float *f); + inline void copy_bits(long long x, double *f); + + inline void copy_bits(float f, ac_int<32,true> *x) { + int x_i; + copy_bits(f, &x_i); + *x = x_i; + } + inline void copy_bits(double f, ac_int<64,true> *x) { + long long x_i; + copy_bits(f, &x_i); + *x = x_i; + } + inline void copy_bits(const ac_int<32,true> &x, float *f) { copy_bits(x.to_int(), f); } + inline void copy_bits(const ac_int<64,true> &x, double *f) { copy_bits(x.to_int64(), f); } +} + +enum ac_ieee_float_format { binary16, binary32, binary64, binary128, binary256}; + +// Forward declarations for ac_ieee_float and bfloat16 +template +class ac_ieee_float; +namespace ac { + class bfloat16; +} + +template +class ac_std_float { +__AC_DATA_PRIVATE + ac_int d; +public: + static const int width = W; + static const int e_width = E; + static const int mant_bits = W - E - 1; + static const int exp_bias = (1 << (E-1)) - 1; + static const int min_exp = -exp_bias + 1; + static const int max_exp = exp_bias; + static const int mu_bits = mant_bits + 1; +private: + typedef ac_int mu_t; + typedef ac_int mu1_t; + typedef ac_int mu2_t; + typedef ac_int m_t; // mantissa in two's complement representation +public: + typedef ac_int e_t; + typedef ac_float ac_float_t; + static ac_std_float nan() { + ac_std_float r; + r.d = 0; + r.d.set_slc(mant_bits-1, ac_int(-1)); + return r; + } + static ac_std_float inf() { + ac_std_float r; + r.d = 0; + r.d.set_slc(mant_bits, ac_int(-1)); + return r; + } + static ac_std_float denorm_min() { // smallest positive non zero value (subnorm if supported) + ac_std_float r; + r.d = 1; + return r; + } + static ac_std_float min() { // smallest NORMAL positive non zero value + ac_std_float r; + r.d = 0; + r.d[width-1-e_width] = true; + return r; + } + static ac_std_float max() { // largest pos finite value + ac_std_float r; + r.d = -1; + r.d[width-1] = false; + r.d[width-1-e_width] = false; + return r; + } + static ac_std_float epsilon() { 
+ ac_int exp = -mant_bits + exp_bias; + ac_std_float r; + r.d = 0; + r.d.set_slc(mant_bits, exp); + return r; + } + ac_std_float() {} + ac_std_float(const ac_std_float &f) : d(f.d) {} + template + ac_std_float convert() const { + ac_private::check_supported(); + ac_std_float r; + if(W <= WR) { + r.d = 0; + r.d.set_slc(WR-W, d); + } else { + typedef ac_std_float r_t; + const int r_mant_bits = r_t::mant_bits; + const int r_mu_bits = r_t::mu_bits; + e_t f_e = d.template slc(mant_bits); + bool f_normal = !!f_e; + mu_t mu = d; + mu[r_mant_bits] = f_normal; + ac_fixed r_rnd = mu; + bool rnd_ovf = r_rnd[r_mu_bits]; + ac_int m_r = r_rnd.template slc(0); + e_t e_r = f_e + rnd_ovf; + r.d = m_r; + r.d.set_slc(r_mant_bits, e_r); + r.d[WR-1] = d[W-1]; + } + return r; + } + + template + ac_fixed convert_to_ac_fixed(bool map_inf=false) const { + static const bool rnd = QFX!=AC_TRN && QFX!=AC_TRN_ZERO; + static const bool need_rnd_bit = QFX != AC_TRN; + static const bool need_rem_bits = need_rnd_bit && QFX != AC_RND; + static const bool need_ovf = OFX != AC_WRAP; + static const int t_width = AC_MAX(mu_bits+1, WFX+!SFX) + need_rnd_bit + need_ovf; + + bool f_sign, f_normal, f_zero, f_inf, f_nan; + mu_t f_mu; + e_t f_e; + extract(f_mu, f_e, f_sign, f_normal, f_zero, f_inf, f_nan); + if(map_inf) { + ac_fixed rv; + if(f_sign) + rv.template set_val(); + else + rv.template set_val(); + return rv; + } + AC_ASSERT(!f_inf && !f_nan, "Expects finite float (not Nan or Inf)"); + m_t f_m = f_sign ? m_t(-f_mu) : m_t(f_mu); + typedef ac_int t_t; + typedef ac_int t2_t; + t_t t = f_m; + t <<= need_rnd_bit; + static const int lsb_src = -mant_bits; + static const int lsb_trg = IFX-WFX; + int rshift = lsb_trg - lsb_src - (int)f_e; + + bool sticky_bit_rnd = false; + bool rshift_neg = rshift < 0; + if(need_rem_bits) { + t_t shifted_out_bits = t; + typedef ac_int< ac::template nbits< AC_MAX(lsb_trg - lsb_src - min_exp,1) >::val, false> shift_ut; + shifted_out_bits &= ~(t_t(0).bit_complement() << (shift_ut) rshift); + sticky_bit_rnd = !!shifted_out_bits & !rshift_neg; + } + bool ovf = false; + if(need_ovf) { + t_t shifted_out_bits = t < 0 ? 
t_t(~t) : t; + // shift right by -rshift + 1 + // +1 is OK since added extra MSB + typedef ac_int< ac::template nbits< AC_MAX(-(lsb_trg - lsb_src - max_exp + 1),1) >::val, false> shift_ut; + shifted_out_bits &= ~((t_t(0).bit_complement() >> 2) >> (shift_ut) ~rshift); + ovf = !!shifted_out_bits & rshift_neg; + } + + t >>= rshift; + + t[t_width-1] = t[t_width-1] ^ (ovf & (t[t_width-1] ^ f_sign)); + t[t_width-2] = t[t_width-2] ^ (ovf & (t[t_width-2] ^ !f_sign)); + t2_t t2 = t; + if(need_rem_bits) { + t2 <<= 1; + t2[0] = t2[0] | sticky_bit_rnd; + } + + ac_fixed ri = t2; + ac_fixed r = 0; + r.set_slc(0,ri.template slc(0)); + return r; + } + + template + explicit ac_std_float(const ac_std_float &f) { + *this = f.template convert(); + } + template + ac_std_float convert() const { + ac_private::check_supported(); + typedef ac_std_float r_t; + typedef typename r_t::e_t r_e_t; + int const r_mu_bits = r_t::mu_bits; + int const r_mant_bits = r_t::mant_bits; + int const r_min_exp = r_t::min_exp; + int const r_max_exp = r_t::max_exp; + int const r_exp_bias = r_t::exp_bias; + bool f_sign, f_normal, f_zero, f_inf, f_nan; + mu_t f_mu; + e_t f_e; + r_t r; + extract(f_mu, f_e, f_sign, f_normal, f_zero, f_inf, f_nan); + int exp = f_e; + ac_fixed r_rnd; + if(ER >= E) { + if(ER > E && !f_normal) { + int ls = f_mu.leading_sign(); + int max_shift_left = f_e - r_min_exp + 1; + bool shift_exponent_limited = ls >= max_shift_left; + int shift_l = shift_exponent_limited ? max_shift_left : ls; + f_mu <<= shift_l; + exp -= shift_l; + } + r_rnd = f_mu; + } else { + int shift_r = r_min_exp - f_e; + typedef ac_fixed t_t; + t_t r_t = f_mu; + bool sticky_bit = !!(f_mu & ~((~mu_t(0)) << mant_bits-r_mant_bits-1)); + if(shift_r > 0) { + t_t shifted_out_bits = r_t; + shifted_out_bits &= ~((~t_t(0)) << shift_r); + sticky_bit |= !!shifted_out_bits; + r_t >>= shift_r; + exp += shift_r; + } + ac_fixed r_t2 = r_t; + r_t2[0] = sticky_bit; + r_rnd = r_t2; + } + bool rnd_ovf = r_rnd[r_mu_bits]; + ac_int r_m = r_rnd.template slc(0); + bool r_normal = r_rnd[r_mant_bits] | rnd_ovf; + exp += rnd_ovf; + bool exception = f_inf | f_nan | (exp > r_max_exp); + r_e_t r_e = exception ? -1 : (f_zero | !r_normal) ? 0 : exp + r_exp_bias; + if(exception) { + r_m = 0; + r_m[r_mant_bits-1] = f_nan; + } + r.d = r_m; + r.d.set_slc(r_mant_bits, r_e); + r.d[WR-1] = d[W-1]; + return r; + } + template + explicit ac_std_float(const ac_std_float &f) { + *this = f.template convert(); + } + template + explicit ac_std_float(const ac_ieee_float &f); + + explicit ac_std_float(const ac::bfloat16 &f); + + template + explicit ac_std_float(const ac_float &f) { + bool sign = f.mantissa() < 0; + m_t m_s = f.m.template slc(0); + mu1_t m_u = sign ? 
(mu1_t) -m_s : (mu1_t) m_s; + bool most_neg_m = m_u[mu_bits]; + bool is_max_exp = f.exp() == (1 << (E-1)) - 1; + ac_int e = f.exp() + exp_bias + (most_neg_m & !is_max_exp); + mu_t m = m_u | ac_int<1,true>(most_neg_m & is_max_exp); + m[mant_bits] = m[mant_bits] | most_neg_m; + bool exp_dont_map = !e | e==-1; + m >>= !e; + m >>= 2*(e==-1); + // exp_dont_map guarantees subnornal => e = 0 + e &= ac_int<1,true>(!exp_dont_map & !!m); + d = m.template slc(0); + d.set_slc(mant_bits, e); + d[W-1] = sign; + } + template + void assign_from(const ac_fixed &fx) { + ac_private::check_supported(); + bool sign = fx < 0.0; + ac_fixed x = 0; + x.set_slc(0,fx.template slc(0)); + bool all_sign; + int ls = x.leading_sign(all_sign); + int max_shift_left = IFX-1 - min_exp + 1; + bool shift_exponent_limited = ls >= max_shift_left; + int shift_l = shift_exponent_limited ? max_shift_left : ls; + ac_fixed x_u = sign ? (ac_fixed) -x : (ac_fixed) x; + x_u <<= shift_l; + int exp = IFX-1; + exp -= shift_l; + ac_fixed m_rnd = x_u; + mu1_t m_u = 0; m_u.set_slc(0, m_rnd.template slc(0)); + bool shiftr1 = m_u[mu_bits]; // msb + bool r_normal = m_u[mu_bits] | m_u[mu_bits-1]; + m_u >>= shiftr1; + exp += shiftr1; + bool fx_zero = all_sign & !sign; + bool r_inf = (exp > max_exp) & !fx_zero; + if(Q==AC_TRN_ZERO) { + exp = r_inf ? max_exp + exp_bias : exp; + m_u |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf + r_inf = false; + } + e_t e = r_inf ? -1 : (!r_normal) ? 0 : exp + exp_bias; + m_u &= ac_int<1,true>(!r_inf); + e &= ac_int<1,true>(r_normal); + d = m_u.template slc(0); + d.set_slc(mant_bits, e); + d[W-1] = sign; + } + template + void assign_from(const ac_int &x) { + this->template assign_from(ac_fixed(x)); + } + template + explicit ac_std_float(const ac_fixed &fx) { + assign_from(fx); + } + explicit ac_std_float(float f) { + const int w_bits = sizeof(f)*8; + const int m_bits = std::numeric_limits::digits; + const int e_bits = w_bits - m_bits; + ac_int t_i; + ac::copy_bits(f, &t_i); + ac_std_float t; + t.set_data(t_i); + *this = ac_std_float(t); + } + explicit ac_std_float(double f) { + const int w_bits = sizeof(f)*8; + const int m_bits = std::numeric_limits::digits; + const int e_bits = w_bits - m_bits; + ac_int t_i; + ac::copy_bits(f, &t_i); + ac_std_float t; + t.set_data(t_i); + *this = ac_std_float(t); + } + explicit ac_std_float(int x) { + *this = ac_std_float(ac_fixed<32,32,true>(x)); + } + explicit ac_std_float(long long x) { + *this = ac_std_float(ac_fixed<64,64,true>(x)); + } + const ac_int &data() const { return d; } + void set_data(const ac_int &data, bool assert_on_nan=false, bool assert_on_inf=false) { + d = data; + if(assert_on_nan) + AC_ASSERT(!isnan(), "Float is NaN"); + if(assert_on_inf) + AC_ASSERT(!isinf(), "Float is Inf"); + } + int fpclassify() const { + ac_int e = d.template slc(mant_bits); + if(e) { + if(e == -1) + return !(ac_int)d ? FP_INFINITE : FP_NAN; + else + return FP_NORMAL; + } + else + return !(ac_int)d ? 
FP_ZERO : FP_SUBNORMAL; + } + bool isfinite() const { + ac_int e = d.template slc(mant_bits); + return e != -1; + } + bool isnormal() const { + ac_int e = d.template slc(mant_bits); + return (e || !(ac_int)d)&& e != -1; + } + bool isnan() const { + if(isfinite()) + return false; + ac_int m = d; + return !!m; + } + bool isinf() const { + if(isfinite()) + return false; + ac_int m = d; + return !m; + } + const ac_float to_ac_float() const { + ac_int e = d.template slc(mant_bits); + bool normal = !!e; + bool sign = d[W-1]; + bool inf = e == -1; + ac_int m = d; + ac_int m1 = m; + m1[mant_bits] = normal; + ac_int m_s = sign ? -m1 : (ac_int) m1; + ac_fixed fx = 0; + fx.set_slc(0, m_s); + e -= exp_bias; + // if number is subnormal, e will be MIN_EXP + 1 (10...01), but it needs to be + // MIN_EXP + 2 (10...010) + e[0] = e[0] & normal; + e[1] = e[1] | !normal; + // normalization by at most 2 places + bool shiftl1 = !(fx[mant_bits+1] ^ fx[mant_bits]); + bool shiftl2 = shiftl1 & !(fx[mant_bits+1] ^ fx[mant_bits-1]); + fx <<= shiftl1; + fx <<= shiftl2; + e -= shiftl1 + shiftl2; + e = inf ? value(e) : e; + fx = inf ? (sign ? value(fx) : value(fx)) : fx; + return ac_float(fx, e, false); + } + float to_float() const { + ac_std_float<32,8> t(*this); + float f; + ac::copy_bits(t.d, &f); + return f; + } + double to_double() const { + ac_std_float<64,11> t(*this); + double f; + ac::copy_bits(t.d, &f); + return f; + } +private: + void extract(mu_t &m, e_t &e, bool &sign, bool &normal, bool &zero, bool &inf, bool &nan, bool biased_exp=false, bool no_subnormals=false) const { + e = d.template slc(mant_bits); + bool exception = e == -1; + normal = !!e | no_subnormals; + m = d; + bool m_zero = !m.template slc(0); + zero = (!e) & (no_subnormals | m_zero); + m[mant_bits] = !!e; + if(!biased_exp) { + e -= exp_bias; + e += !normal; + } + sign = d[W-1]; + inf = exception & m_zero; + nan = exception & !m_zero; + } +public: + static ac_std_float zero() { + ac_std_float r; + r.d = 0; + return r; + } + static ac_std_float one() { + ac_std_float r; + r.d = 0; + r.d.set_slc(mant_bits, ac_int(exp_bias)); + return r; + } + template + ac_std_float add_generic(const ac_std_float &op2) const { + ac_private::check_supported(); + // +1 for possible negation, +1 for bit growth due to addition + const int tr_t_iwidth = mu_bits + 1 + 1; + // extra bit for rounding, extra bit for left shift + const int tr_t_width = tr_t_iwidth + 1 + 1; + typedef ac_fixed add_t; + typedef ac_fixed r_un_t; + e_t op1_e, op2_e; + bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero; + bool op1_inf, op1_nan, op2_inf, op2_nan; + mu_t op1_mu, op2_mu; + extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); + m_t op1_m = op1_sign ? m_t(-op1_mu) : m_t(op1_mu); + op1_m &= m_t(No_SubNormals & op1_zero ? 0 : -1); + op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); + m_t op2_m = op2_sign ? m_t(-op2_mu) : m_t(op2_mu); + op2_m &= m_t(No_SubNormals & op2_zero ? 0 : -1); + + unsigned op1_e_b = ac_int(op1_e) + !op1_normal; + unsigned op2_e_b = ac_int(op2_e) + !op2_normal; + int e_dif = op1_e_b - op2_e_b; + bool e1_lt_e2 = e_dif < 0; + e_dif = (op1_zero | op2_zero) ? 0 : e1_lt_e2 ? -e_dif : e_dif; + + add_t op_lshift = e1_lt_e2 ? op1_m : op2_m; + m_t op_no_shift = e1_lt_e2 ? 
op2_m : op1_m; + add_t shifted_out_bits = op_lshift; + shifted_out_bits &= ~((~add_t(0)) << (unsigned) e_dif); + bool sticky_bit = !!shifted_out_bits; + + op_lshift >>= (unsigned) e_dif; + add_t add_r = op_lshift + op_no_shift; + int exp = ( (e1_lt_e2 & !op2_zero) | op1_zero ? op2_e_b : op1_e_b); + bool all_sign; + int ls = add_r.leading_sign(all_sign); + bool r_zero = !add_r[0] & all_sign; + // +1 to account for bit growth of add_r + int max_shift_left = exp + (- min_exp - exp_bias + 1); + bool shift_exponent_limited = ls >= max_shift_left; + int shift_l = shift_exponent_limited ? max_shift_left : ls; + add_r <<= shift_l; + add_r[0] = add_r[0] | sticky_bit; + ac_fixed r_rnd = add_r; + typedef ac_int t_h; + t_h t = add_r.to_ac_int(); + bool rnd_ovf = QR == AC_RND_CONV && t == t_h(-1); + bool r_sign = r_rnd[mu_bits] ^ rnd_ovf; + bool shift_r = rnd_ovf | (r_sign & !r_rnd.template slc(0)); + r_un_t r_un = r_sign ? (r_un_t) -r_rnd : (r_un_t) r_rnd; + // get rid of implied bit, assign to ac_int + bool r_normal = r_un[mant_bits] | shift_r; + r_zero |= No_SubNormals & !r_normal; + ac_int m_r = r_un.template slc(0); + exp = (shift_exponent_limited ? min_exp + exp_bias : exp - ls + 1) + shift_r; + bool r_inf = exp > max_exp + exp_bias; + if(QR==AC_TRN_ZERO) { + exp = r_inf ? max_exp + exp_bias : exp; + m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf + r_inf = false; + } + bool r_nan = op1_nan | op2_nan | ((op1_inf & op2_inf) & (op1_sign ^ op2_sign)); + bool exception = op1_inf | op2_inf | op1_nan | op2_nan | r_inf; + ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; + if(exception | r_zero) { + m_r = 0; + m_r[mant_bits-1] = r_nan; + } + ac_int d_r = m_r; + d_r.set_slc(mant_bits, e_r); + d_r[W-1] = r_sign; + ac_std_float r; + r.set_data(d_r); + return r; + } + template + ac_std_float add(const ac_std_float &op2) const { +#ifndef AC_STD_FLOAT_ADD_OVERRIDE + return add_generic(op2); +#else + return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_ADD_OVERRIDE(*this, op2); +#endif + } + template + ac_std_float sub(const ac_std_float &op2) const { + return add(-op2); + } + template + ac_std_float mult_generic(const ac_std_float &op2) const { + ac_private::check_supported(); + e_t op1_e, op2_e; + bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero; + bool op1_inf, op1_nan, op2_inf, op2_nan; + mu_t op1_mu, op2_mu; + extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); + op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); + bool r_sign = op1_sign ^ op2_sign; + bool r_nan = op1_nan | op2_nan | (op1_inf & op2_zero) | (op1_zero & op2_inf); + bool r_zero = op1_zero | op2_zero; // r_nan takes precedence later on + int exp = ac_int(op1_e) + ac_int(op2_e) + !op1_normal + !op2_normal - exp_bias; + ac_int<2*mu_bits,false> p = op1_mu * op2_mu; + int max_shift_left = exp + (- min_exp - exp_bias + 1); + int shift_l = 0; + bool shift_l_1 = false; + typedef ac_int t_h; + typedef ac_int t_l; + t_h p_h; + t_l p_l = p; + bool r_normal; + bool r_inf; + ac_fixed r_rnd; + ac_int m_r; + if(max_shift_left >= 0) { + r_inf = exp > max_exp + exp_bias; + bool exp_is_max = exp == max_exp + exp_bias; + bool exp_is_max_m1 = exp == max_exp + exp_bias - 1; + unsigned ls = No_SubNormals ? 0 : (unsigned) (op1_normal ? op2_mu : op1_mu).leading_sign(); + bool shift_exponent_limited = ls >= (unsigned) max_shift_left; + shift_l = shift_exponent_limited ? 
(unsigned) max_shift_left : ls; + p <<= (unsigned) shift_l; + exp -= shift_l; + shift_l_1 = !(shift_exponent_limited | p[2*mu_bits-1]); + p = shift_l_1 ? p << 1 : p; + exp += !shift_l_1; + p_h = p >> (mu_bits-1); + p_l &= (t_l(-1) >> shift_l) >> shift_l_1; + ac_int p_bef_rnd = p_h; + p_bef_rnd <<= 1; + p_bef_rnd[0] = !!p_l; + r_rnd = p_bef_rnd; + m_r = r_rnd.template slc(0); + bool rnd_ovf = QR == AC_RND_CONV && p_h == t_h(-1); + exp += rnd_ovf; + r_inf |= (exp_is_max & (!shift_l_1 | rnd_ovf)) | (exp_is_max_m1 & !shift_l_1 & rnd_ovf); + r_normal = r_rnd[mant_bits] | rnd_ovf; + r_zero |= !r_normal & No_SubNormals; + if(QR==AC_TRN_ZERO) { + exp = r_inf ? max_exp + exp_bias : exp; + m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf + r_inf = false; + } + } else { + shift_l = max_shift_left; + exp -= shift_l; + unsigned shift_r_m1 = ~shift_l; + p_h = p >> (mu_bits-1); + t_h shifted_out_bits = p_h; + shifted_out_bits &= ~((~t_h(1)) << shift_r_m1); + p_h >>= shift_r_m1; + p_h >>= 1; + ac_int p_bef_rnd = p_h; + p_bef_rnd <<= 1; + p_bef_rnd[0] = !!p_l | !!shifted_out_bits; + r_rnd = p_bef_rnd; + m_r = r_rnd.template slc(0); + r_normal = false; + r_inf = false; + r_zero |= No_SubNormals; + } + bool exception = op1_inf | op2_inf | op1_nan | op2_nan | r_inf; + ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; + if(exception | r_zero) { + m_r = 0; + m_r[mant_bits-1] = r_nan; + } + ac_int d_r = m_r; + d_r.set_slc(mant_bits, e_r); + d_r[W-1] = r_sign; + ac_std_float r; + r.set_data(d_r); + return r; + } + template + ac_std_float mult(const ac_std_float &op2) const { +#ifndef AC_STD_FLOAT_MULT_OVERRIDE + return mult_generic(op2); +#else + return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_MULT_OVERRIDE(*this, op2); +#endif + } + template + ac_std_float div_generic(const ac_std_float &op2) const { + ac_private::check_supported(); + e_t op1_e, op2_e; + bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero; + bool op1_inf, op1_nan, op2_inf, op2_nan; + mu_t op1_mu, op2_mu; + extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); + op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); + bool r_sign = op1_sign ^ op2_sign; + int ls_op1 = No_SubNormals ? 0 : (unsigned) op1_mu.leading_sign(); + op1_mu <<= ls_op1; + int ls_op2 = No_SubNormals ? 
0 : (unsigned) op2_mu.leading_sign(); + op2_mu <<= ls_op2; + int exp = ac_int(op1_e) - ac_int(op2_e) + !op1_normal - !op2_normal - ls_op1 + ls_op2 + exp_bias; + ac_int q0 = 0; + bool exact = true; + bool div_by_zero = op2_zero; +#ifdef __SYNTHESIS__ + div_by_zero = false; +#endif + if(!div_by_zero) { + AC_STD_FLOAT_FX_DIV_OVERRIDE(op1_mu, op2_mu, q0, exact); + } + ac_int q = q0; + q <<= 1; + int shift_r = min_exp + exp_bias - exp; + bool sticky_bit = !exact; + if(shift_r >= 0) { + typedef ac_int t_t; + t_t shifted_out_bits = q; + shifted_out_bits &= ~((~t_t(0)) << shift_r); + sticky_bit |= !!shifted_out_bits; + q >>= shift_r; + exp += shift_r; + } else { + bool shift_l = !q[mu_bits+2]; + q <<= shift_l; + exp -= shift_l; + } + q[0] = q[0] | sticky_bit; + ac_fixed r_rnd = q; + bool rnd_ovf = r_rnd[mu_bits]; + ac_int m_r = r_rnd.template slc(0); + bool r_normal = r_rnd[mant_bits] | rnd_ovf; + bool r_nan = op1_nan | op2_nan | (op1_zero & op2_zero) | (op1_inf & op2_inf); + bool r_zero = op1_zero | op2_inf; + r_zero |= !r_normal & No_SubNormals; + exp += rnd_ovf; + bool r_inf0 = op1_inf | op2_zero; // this is not affected by rounding + bool r_inf = (!r_zero & (exp > max_exp + exp_bias)) | r_inf0; + if(QR==AC_TRN_ZERO && !r_inf0) { + exp = r_inf ? max_exp + exp_bias : exp; + m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf + r_inf = false; + } + bool exception = r_nan | r_inf; + ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; + if(exception | r_zero) { + m_r = 0; + m_r[mant_bits-1] = r_nan; + } + ac_int d_r = m_r; + d_r.set_slc(mant_bits, e_r); + d_r[W-1] = r_sign; + ac_std_float r; + r.set_data(d_r); + return r; + } + template + ac_std_float div(const ac_std_float &op2) const { +#ifndef AC_STD_FLOAT_DIV_OVERRIDE + return div_generic(op2); +#else + return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_DIV_OVERRIDE(*this, op2); +#endif + } + template + ac_std_float fma_generic(const ac_std_float &op2, const ac_std_float &op3) const { + ac_private::check_supported(); + e_t op1_e, op2_e, op3_e; + bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero, op3_normal, op3_sign, op3_zero; + bool op1_inf, op1_nan, op2_inf, op2_nan, op3_inf, op3_nan; + mu_t op1_mu, op2_mu, op3_mu; + extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); + op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); + op3.extract(op3_mu, op3_e, op3_sign, op3_normal, op3_zero, op3_inf, op3_nan, true, No_SubNormals); + if(No_SubNormals) + op3_mu &= mu_t(op3_zero ? 0 : -1); + bool mult_sign = (op1_sign ^ op2_sign) | (op1_zero & op2_inf) | (op1_inf & op1_zero); + bool mult_nan = op1_nan | op2_nan | (op1_zero & op2_inf) | (op1_inf & op2_zero); + bool mult_zero = op1_zero | op2_zero; // mult_nan has precedence later on + int mult_exp_b = ac_int(op1_e) + ac_int(op2_e) + !op1_normal + !op2_normal - exp_bias; + mult_exp_b |= ac_int( op1_inf | op2_inf ? -1 : 0 ); + ac_int<2*mu_bits,false> p = op1_mu * op2_mu; + if(No_SubNormals) + p &= ac_int<2*mu_bits,false>(mult_zero ? 0 : -1); + bool mult_inf = op1_inf | op2_inf; + + bool diff_signs = mult_sign ^ op3_sign; + bool toggle_r_sign = mult_sign; + m_t op3_m = diff_signs ? m_t(-op3_mu) : m_t(op3_mu); + unsigned op3_e_b = ac_int(op3_e) + !op3_normal; + + int e_dif = mult_exp_b - op3_e_b; + bool emult_lt_e3 = e_dif < 0; + e_dif = (mult_zero | op3_zero) ? 0 : emult_lt_e3 ? 
-e_dif : e_dif; + + typedef ac_int<2*mu_bits+4,true> add_t; + add_t op3_m_s = op3_m; + op3_m_s <<= mu_bits+1; // mult: ii.ffff, op3: i.ff + add_t p_s = p; + p_s <<= 2; + add_t op_lshift = emult_lt_e3 ? p_s : op3_m_s; + add_t op_no_shift = emult_lt_e3 ? op3_m_s : p_s; + + add_t shifted_out_bits = op_lshift; + shifted_out_bits &= ~((~add_t(0)) << (unsigned) e_dif); + bool sticky_bit = !!shifted_out_bits; + + op_lshift >>= (unsigned) e_dif; + add_t add_r = op_lshift + op_no_shift; + int exp = ( (emult_lt_e3 & !op3_zero) | mult_zero ? op3_e_b : mult_exp_b); + + bool all_sign; + int ls = add_r.leading_sign(all_sign); + // no bit growth of add_r + int max_shift_left = exp + (- min_exp - exp_bias + 2); + bool shift_exponent_limited = ls >= max_shift_left; + int shift_l = shift_exponent_limited ? max_shift_left : ls; + add_r <<= shift_l; + add_r[0] = add_r[0] | sticky_bit; + + ac_fixed r_rnd = add_r; + + typedef ac_int t_h; + t_h t = add_r.template slc(mu_bits+2); + bool rnd_ovf = QR == AC_RND_CONV && !add_r[2*mu_bits+3] && t == t_h(-1); + bool r_neg = r_rnd[mu_bits] ^ rnd_ovf; + bool r_sign = op3_inf ? op3_sign : mult_inf ? mult_sign : r_neg ^ toggle_r_sign; + ac_int r_rnd_i = r_rnd.template slc(0); + bool r_zero = !rnd_ovf & !r_rnd_i; + bool shift_r = rnd_ovf | (r_neg & !r_rnd_i.template slc(0)); + typedef ac_int r_un_t; + r_un_t r_un = r_neg ? (r_un_t) -r_rnd_i : (r_un_t) r_rnd_i; + // get rid of implied bit, assign to ac_int + bool r_normal = r_un[mant_bits] | shift_r; + r_zero |= No_SubNormals & !r_normal; + ac_int m_r = r_un.template slc(0); + exp = (shift_exponent_limited ? min_exp + exp_bias : exp - ls + 2) + shift_r; + bool r_inf = mult_inf | op3_inf | (exp > max_exp + exp_bias); + if(QR==AC_TRN_ZERO) { + exp = r_inf ? max_exp + exp_bias : exp; + m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf + r_inf = false; + } + bool r_nan = op3_nan | mult_nan | ((op3_inf & (op1_inf | op2_inf)) & (op3_sign ^ mult_sign)); + bool exception = op3_inf | mult_inf | op3_nan | mult_nan | r_inf; + ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; + if(exception | r_zero) { + m_r = 0; + m_r[mant_bits-1] = r_nan; + } + ac_int d_r = m_r; + d_r.set_slc(mant_bits, e_r); + d_r[W-1] = r_sign; + ac_std_float r; + r.set_data(d_r); + return r; + } + template + ac_std_float fma(const ac_std_float &op2, const ac_std_float &op3) const { +#ifndef AC_STD_FLOAT_FMA_OVERRIDE + return fma_generic(op2,op3); +#else + return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_FMA_OVERRIDE(*this,op2,op3); +#endif + } + template + ac_std_float sqrt_generic() const { + ac_private::check_supported(); + const bool rnd = QR != AC_TRN_ZERO; // need msb(rounded bits) + const bool rbits = QR != AC_TRN_ZERO; // need bits after msb(rounded bits) + e_t op1_e; + bool op1_normal, op1_sign, op1_zero; + bool op1_inf, op1_nan; + mu_t op1_mu; + extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); + int ls_op1 = No_SubNormals ? 
0 : (unsigned) op1_mu.leading_sign(); + op1_mu <<= ls_op1; + op1_mu[mu_bits-1] = true; // Since it is normalized, zero is captured by op1_zero + + bool exp_odd = (op1_e ^ !op1_normal ^ ls_op1 ^ exp_bias) & 1; + + int exp = ac_int(op1_e) + !op1_normal - ls_op1 - exp_bias; + exp >>= 1; // divide by 2, truncate towards -inf + + ac_int op1_mi = op1_mu; + op1_mi <<= exp_odd; + ac_int sq_rt; + bool sticky_bit = ac::fx_sqrt(op1_mi, sq_rt); + bool r_normal = true; // true for most practical cases on W,E + if(mant_bits > -min_exp) { + int exp_over = min_exp - exp; + if(exp_over > 0) { + if(rbits) { + typedef ac_int t_t; + t_t shifted_out_bits = sq_rt; + shifted_out_bits &= ~((~t_t(0)) << exp_over); + sticky_bit |= !!shifted_out_bits; + } + sq_rt >>= exp_over; + exp = min_exp; + r_normal = false; + } + } + // rounding should not trigger overflow (unless truncate towards +inf which is currently not supported) + ac_fixed sq_rt_rnd = 0; + if(rbits) + sq_rt_rnd[0] = sq_rt_rnd[0] | sticky_bit; + sq_rt_rnd.set_slc(rbits, sq_rt); + ac_fixed sq_rt_fx = sq_rt_rnd; + + ac_int m_r = sq_rt_fx.template slc(0); + bool r_nan = op1_nan | (op1_sign & !op1_zero); + bool r_zero = op1_zero; + r_zero |= !r_normal & No_SubNormals; + bool r_inf = op1_inf; + bool exception = r_nan | r_inf; + exp += exp_bias; + ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; + if(exception | r_zero) { + m_r = 0; + m_r[mant_bits-1] = r_nan; + } + ac_int d_r = m_r; + d_r.set_slc(mant_bits, e_r); + ac_std_float r; + r.set_data(d_r); + return r; + } + template + ac_std_float sqrt() const { +#ifndef AC_STD_FLOAT_SQRT_OVERRIDE + return sqrt_generic(); +#else + return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_SQRT_OVERRIDE(*this); +#endif + } + ac_std_float operator +(const ac_std_float &op2) const { + return add(op2); + } + ac_std_float operator -(const ac_std_float &op2) const { + return sub(op2); + } + ac_std_float operator *(const ac_std_float &op2) const { + return mult(op2); + } + ac_std_float operator /(const ac_std_float &op2) const { + return div(op2); + } + ac_std_float &operator +=(const ac_std_float &op2) { + *this = operator +(op2); + return *this; + } + ac_std_float &operator -=(const ac_std_float &op2) { + *this = operator -(op2); + return *this; + } + ac_std_float &operator *=(const ac_std_float &op2) { + *this = operator *(op2); + } + ac_std_float &operator /=(const ac_std_float &op2) { + *this = operator /(op2); + return *this; + } + bool operator ==(const ac_std_float &op2) const { + return ((d == op2.d) && !isnan()) || (operator !() && op2.operator !()); + } + bool operator !=(const ac_std_float &op2) const { + return !operator ==(op2); + } + bool magnitude_lt(const ac_std_float &op2) const { + return ac_int(d) < ac_int(op2.d); + } + bool neg() const { return d[W-1]; } + bool operator <(const ac_std_float &op2) const { + return + operator !=(op2) && ( (neg() && !op2.neg()) || (!(neg() ^ op2.neg()) && neg() ^ magnitude_lt(op2)) ) + && !isnan() && !op2.isnan(); + } + bool operator >=(const ac_std_float &op2) const { + return + (operator ==(op2) || (!neg() && op2.neg()) || (!(neg() ^ op2.neg()) && !neg() ^ magnitude_lt(op2)) ) + && !isnan() && !op2.isnan(); + } + bool operator >(const ac_std_float &op2) const { + return + operator !=(op2) + && ( (!neg() && op2.neg()) || (!(neg() ^ op2.neg()) && !neg() ^ magnitude_lt(op2)) ) + && !isnan() && !op2.isnan(); + } + bool operator <=(const ac_std_float &op2) const { + return + (operator == (op2) || (neg() && !op2.neg()) || (!neg() ^ op2.neg() && neg() ^ magnitude_lt(op2)) ) + 
&& !isnan() && !op2.isnan(); + } + bool operator !() const { return !ac_int(d); } + ac_std_float operator -() const { + ac_std_float r(*this); + r.d[W-1] = !d[W-1]; + return r; + } + ac_std_float operator +() const { + return ac_std_float(*this); + } + ac_std_float abs() const { + ac_std_float r(*this); + r.d[W-1] = false; + return r; + } + ac_std_float copysign(const ac_std_float &op2) const { + ac_std_float r(*this); + r.d[W-1] = op2.d[W-1]; + return r; + } + bool signbit() const { + return d[W-1]; + } + void set_signbit(bool s) { + d[W-1] = s; + } + ac_std_float ceil() const { + ac_int e = d.template slc(mant_bits); + bool sign = d[W-1]; + if(!d.template slc(0)) + return *this; + if(e < exp_bias) { + return sign ? zero() : one(); + } else { + ac_std_float r(*this); + int e_dif = mant_bits + exp_bias - e; + if((e_dif < 0) | (e == ac_int(-1))) + return r; + else { + typedef ac_int mant_t; + mant_t m = d; + mant_t mask = (~mant_t(0)) << e_dif; + bool non_zero_fractional = !!(m & ~mask); + if(!sign) { + m |= ~mask; + mu_t mu = m + mant_t(non_zero_fractional); + e += mu[mant_bits]; + r.d.set_slc(mant_bits, e); + m = mu; + } + m &= mask; // truncate fractional bits + r.d.set_slc(0, m); + return r; + } + } + } + ac_std_float floor() const { + ac_int e = d.template slc(mant_bits); + bool sign = d[W-1]; + if(!d.template slc(0)) + return *this; + if(e < exp_bias) { + return sign ? -one() : zero(); + } else { + ac_std_float r(*this); + int e_dif = mant_bits + exp_bias - e; + if((e_dif < 0) | (e == ac_int(-1))) + return r; + else { + typedef ac_int mant_t; + mant_t m = d; + mant_t mask = (~mant_t(0)) << e_dif; + bool non_zero_fractional = !!(m & ~mask); + if(sign) { + m |= ~mask; + mu_t mu = m + mant_t(non_zero_fractional); + e += mu[mant_bits]; + r.d.set_slc(mant_bits, e); + m = mu; + } + m &= mask; // truncate fractional bits + r.d.set_slc(0, m); + return r; + } + } + } + ac_std_float trunc() const { + ac_int e = d.template slc(mant_bits); + if(e < exp_bias) { + return zero(); + } else { + ac_std_float r(*this); + int e_dif = mant_bits + exp_bias - e; + if((e_dif < 0) | (e == ac_int(-1))) + return r; + else { + typedef ac_int mant_t; + mant_t m = d; + mant_t mask = (~mant_t(0)) << e_dif; + m &= mask; // truncate fractional bits + r.d.set_slc(0, m); + return r; + } + } + } + ac_std_float round() const { + ac_int e = d.template slc(mant_bits); + if(e < exp_bias-1) { + return zero(); + } else { + ac_std_float r(*this); + int e_dif = mant_bits + exp_bias -1 - e; + if((e_dif < 0) | (e == ac_int(-1))) + return r; + else { + typedef ac_int mant_t; + mant_t m = d; + mant_t mask = (~mant_t(0)) << e_dif; + m |= ~mask; + mu_t mu = m + mant_t(1); + e += mu[mant_bits]; + r.d.set_slc(mant_bits, e); + m = mu; + m &= mask << 1; // truncate fractional bits + r.d.set_slc(0, m); + return r; + } + } + } +}; + +template +inline std::ostream& operator << (std::ostream &os, const ac_std_float &x) { + // for now just print the raw ac_int for it + os << x.data().to_string(AC_HEX); + return os; +} + +namespace ac { + // Type punning: using memcpy to avoid strict aliasing + inline void copy_bits(float f, int *x) { + std::memcpy(x, &f, sizeof(int)); + } + inline void copy_bits(double f, long long *x) { + std::memcpy(x, &f, sizeof(long long)); + } + inline void copy_bits(int x, float *f) { + std::memcpy(f, &x, sizeof(float)); + } + inline void copy_bits(long long x, double *f) { + std::memcpy(f, &x, sizeof(double)); + } + + inline void copy_bits(const ac_std_float<32,8> &x, float *f) { + copy_bits(x.data().to_int(), f); + } 
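+// --------------------------------------------------------------------------
+// Editor's note: the function below is an illustrative sketch added for this
+// review, not part of the original vendor header. It exercises the memcpy-
+// based copy_bits overloads above, which give well-defined type punning
+// (casting between float* and int* directly would violate strict aliasing).
+inline float copy_bits_roundtrip_example(float f) {
+  int bits;
+  copy_bits(f, &bits);                 // float bit pattern -> int (e.g. 1.5f -> 0x3FC00000)
+  ac_std_float<32,8> sf;
+  sf.set_data(ac_int<32,true>(bits));  // same 32 bits, now held in the AC container
+  float back;
+  copy_bits(sf, &back);                // bit pattern preserved exactly
+  return back;                         // equals f for any non-NaN input
+}
+// --------------------------------------------------------------------------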
+ inline void copy_bits(const ac_std_float<64,11> &x, double *f) { + copy_bits(x.data().to_int64(), f); + } +} + +template +class ac_ieee_float_base { +public: + static const int width = 1 << ((int)Format + 4); + // exponents are {5,8,11,15,19}, but the first three are specialized elsewhere + static const int e_width = 11 + ((int)Format - binary64)*4; // 11, 15, 19 + static const int lls = width >> 6; + typedef long long (data_t)[lls]; + typedef ac_std_float ac_std_float_t; + typedef ac_std_float helper_t; + typedef ac_float ac_float_t; + data_t d; + ac_ieee_float_base() {} + ac_ieee_float_base(const ac_ieee_float_base &f) { + ac::copy_bits(f.d, &d); + } + explicit ac_ieee_float_base(const helper_t &op) { + ac::copy_bits(op.data(), &d); + } + explicit ac_ieee_float_base(double f); +protected: + helper_t to_helper_t() const { + ac_int dat; + ac::copy_bits(d, &dat); + helper_t x; + x.set_data(dat); + return x; + } +public: + void set_data(const data_t &op) { ac::copy_bits(op, &d); } + void set_data(const ac_int &op) { ac::copy_bits(op, &d); } + const data_t &data() const { return d; } + ac_int data_ac_int() const { + ac_int x; + ac::copy_bits(d, &x); + return x; + } + bool signbit() const { return d[lls-1] < 0; } + void set_signbit(bool s) { + ac_int<64,true> t(d[lls-1]); + t[63] = s; + d[lls-1] = t.to_int64(); + } +}; + +template +inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { + // for now print the 128 and 256 as raw ac_int + os << x.data_ac_int().to_string(AC_HEX); + return os; +} + +template<> class ac_ieee_float_base { +public: + static const int width = 16; + static const int e_width = 5; + typedef ac_std_float ac_std_float_t; + typedef short data_t; + typedef ac_std_float helper_t; + typedef ac_float ac_float_t; + data_t d; + ac_ieee_float_base() {} + ac_ieee_float_base(const ac_ieee_float_base &f) : d(f.d) {} + explicit ac_ieee_float_base(const helper_t &op) : d(op.data()) {} + explicit ac_ieee_float_base(float f) : d((short)ac_std_float(f).data().to_int()) {} +protected: + helper_t to_helper_t() const { + helper_t x; + x.set_data(d); + return x; + } +public: + float to_float() const { + ac_std_float_t t; + t.set_data(this->data_ac_int()); + return t.to_float(); + } +#if __cplusplus > 199711L + explicit operator float() const { return to_float(); } +#endif + void set_data(short op) { ac::copy_bits(op, &d); } + void set_data(const ac_int &op) { ac::copy_bits(op, &d); } + const data_t &data() const { return d; } + ac_int data_ac_int() const { + ac_int x; + ac::copy_bits(d, &x); + return x; + } + bool signbit() const { return d < 0; } + void set_signbit(bool s) { + ac_int t(d); + t[width-1] = s; + d = t; + } +}; + +inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { + os << x.to_float(); + return os; +} + +struct float_helper { + float d; + float_helper() {} + float_helper(float f) { d = f; } + float_helper(const float_helper &f) { d = f.d; } + float_helper(const float_helper &f, bool no_subnormals) { + d = no_subnormals && f.fpclassify() == FP_SUBNORMAL ? std::signbit(f.d) ? 
-0.0 : 0.0 : f.d; + } + float_helper(const ac_std_float<32,8> &f) { set_data(f.data().to_int()); } + template + float_helper(const ac_float<25,2,8,Q> &f) : d(f.to_float()) {} + const float &data() const { return d; } + void set_data(int data) { ac::copy_bits(data, &d); } + void set_data(float data) { d = data; } + operator float() const { return d; } + float to_float() const { return d; } + int fpclassify() const { return std::fpclassify(d); } + bool isfinite() const { return std::isfinite(d); } + bool isnormal() const { return std::isnormal(d); } + bool isinf() const { return std::isinf(d); } + bool isnan() const { return std::isnan(d); } + static float nan() { return ac_std_float<32,8>::nan().to_float(); } + static float inf() { return ac_std_float<32,8>::inf().to_float(); } + static float denorm_min() { return ac_std_float<32,8>::denorm_min().to_float(); } + static float min() { return ac_std_float<32,8>::min().to_float(); } + static float max() { return ac_std_float<32,8>::max().to_float(); } + static float epsilon() { return ac_std_float<32,8>::epsilon().to_float(); } + template + float_helper add(const float_helper &op2) const { + ac_private::check_supported2(); + return float_helper( float_helper(*this, No_SubNormals) + float_helper(op2, No_SubNormals), No_SubNormals); + } + template + float_helper sub(const float_helper &op2) const { + ac_private::check_supported2(); + return float_helper( float_helper(*this, No_SubNormals) - float_helper(op2, No_SubNormals), No_SubNormals); + } + template + float_helper mult(const float_helper &op2) const { + ac_private::check_supported2(); + return float_helper( float_helper(*this, No_SubNormals) * float_helper(op2, No_SubNormals), No_SubNormals); + } + template + float_helper div(const float_helper &op2) const { + ac_private::check_supported2(); + return float_helper( float_helper(*this, No_SubNormals) / float_helper(op2, No_SubNormals), No_SubNormals); + } + template + float_helper fma(const float_helper &op2, const float_helper &op3) const { + ac_private::check_supported2(); + return float_helper( ::fmaf(float_helper(*this, No_SubNormals), float_helper(op2, No_SubNormals), float_helper(op3, No_SubNormals)), No_SubNormals); + } + template + float_helper sqrt() const { + ac_private::check_supported2(); + return float_helper( ::sqrtf(float_helper(*this, No_SubNormals)), No_SubNormals); + } + float_helper ceil() const { return float_helper(std::ceil(d)); } + float_helper floor() const { return float_helper(std::floor(d)); } + float_helper trunc() const { return float_helper(::truncf(d)); } + float_helper round() const { return float_helper(::roundf(d)); } +}; + +template<> class ac_ieee_float_base { +public: + static const int width = 32; + static const int e_width = 8; + typedef ac_std_float ac_std_float_t; +#ifdef AC_IEEE_FLOAT_USE_BUILTIN + typedef float data_t; + typedef float_helper helper_t; +#else + typedef int data_t; + typedef ac_std_float helper_t; +#endif + typedef ac_float ac_float_t; + data_t d; + ac_ieee_float_base() {} + ac_ieee_float_base(const ac_ieee_float_base &f) : d(f.d) {} + explicit ac_ieee_float_base(const helper_t &op) : d(op.data()) {} + explicit ac_ieee_float_base(float f) { ac::copy_bits(f, &d); } +protected: + helper_t to_helper_t() const { + helper_t x; + x.set_data(d); + return x; + } +public: +#if __cplusplus > 199711L + explicit operator float() const { + float f; + ac::copy_bits(d, &f); + return f; + } +#endif + float to_float() const { + float f; + ac::copy_bits(d, &f); + return f; + } + void set_data(int op) { 
ac::copy_bits(op, &d); } + void set_data(float op) { ac::copy_bits(op, &d); } + void set_data(const ac_int &op) { ac::copy_bits(op, &d); } + const data_t &data() const { return d; } + ac_int data_ac_int() const { + ac_int x; + ac::copy_bits(d, &x); + return x; + } + bool signbit() const { + int x; ac::copy_bits(d, &x); + return x < 0; + } + void set_signbit(bool s) { + ac_int t; + ac::copy_bits(d, &t); + t[width-1] = s; + ac::copy_bits(t, &d); + } +}; + +inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { + os << x.to_float(); + return os; +} + +struct double_helper { + double d; + double_helper() {} + double_helper(double f) { d = f; } + double_helper(const float_helper &f) { d = f.d; } + double_helper(const double_helper &f, bool no_subnormals) { + d = no_subnormals && f.fpclassify() == FP_SUBNORMAL ? std::signbit(f.d) ? -0.0 : 0.0 : f.d; + } + double_helper(const ac_std_float<64,11> &f) { set_data(f.data().to_int64()); } + template + double_helper(const ac_float<54,2,11,Q> &f) : d(f.to_double()) {} + const double &data() const { return d; } + void set_data(long long data) { + ac::copy_bits(data, &d); + } + void set_data(double data) { d = data; } + operator double() const { return d; } + double to_double() const { return d; } + int fpclassify() const { return std::fpclassify(d); } + bool isfinite() const { return std::isfinite(d); } + bool isnormal() const { return std::isnormal(d); } + bool isinf() const { return std::isinf(d); } + bool isnan() const { return std::isnan(d); } + static double nan() { return ac_std_float<64,11>::nan().to_double(); } + static double inf() { return ac_std_float<64,11>::inf().to_double(); } + static double denorm_min() { return ac_std_float<64,11>::denorm_min().to_double(); } + static double min() { return ac_std_float<64,11>::min().to_double(); } + static double max() { return ac_std_float<64,11>::max().to_double(); } + static double epsilon() { return ac_std_float<64,11>::epsilon().to_double(); } + template + double_helper add(const double_helper &op2) const { + ac_private::check_supported2(); + return double_helper( double_helper(*this, No_SubNormals) + double_helper(op2, No_SubNormals), No_SubNormals); + } + template + double_helper sub(const double_helper &op2) const { + ac_private::check_supported2(); + return double_helper( double_helper(*this, No_SubNormals) - double_helper(op2, No_SubNormals), No_SubNormals); + } + template + double_helper mult(const double_helper &op2) const { + ac_private::check_supported2(); + return double_helper( double_helper(*this, No_SubNormals) * double_helper(op2, No_SubNormals), No_SubNormals); + } + template + double_helper div(const double_helper &op2) const { + ac_private::check_supported2(); + return double_helper( double_helper(*this, No_SubNormals) / double_helper(op2, No_SubNormals), No_SubNormals); + } + template + double_helper fma(const double_helper &op2, const double_helper &op3) const { + ac_private::check_supported2(); + return double_helper( ::fma((double) double_helper(*this, No_SubNormals), (double) double_helper(op2, No_SubNormals), (double) double_helper(op3, No_SubNormals)), No_SubNormals); + } + template + double_helper sqrt() const { + ac_private::check_supported2(); + return double_helper( ::sqrt((double) double_helper(*this, No_SubNormals)), No_SubNormals); + } + double_helper ceil() const { return double_helper(std::ceil(d)); } + double_helper floor() const { return double_helper(std::floor(d)); } + double_helper trunc() const { return double_helper(::trunc(d)); } + 
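+  // Editor's note (illustrative comment added for this review, not part of
+  // the original vendor header): the (value, no_subnormals) constructor above
+  // flushes subnormal inputs to a signed zero, so the template<bool
+  // No_SubNormals> operations can model hardware without denormal support.
+  // For example, binary64's smallest normal is about 2.2e-308, so:
+  //   double_helper flushed(double_helper(1e-310), true);
+  //   // flushed.to_double() == 0.0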
double_helper round() const { return double_helper(::round(d)); } +}; + +template<> class ac_ieee_float_base { +public: + static const int width = 64; + static const int e_width = 11; + typedef ac_std_float ac_std_float_t; +#ifdef AC_IEEE_FLOAT_USE_BUILTIN + typedef double data_t; + typedef double_helper helper_t; +#else + typedef long long data_t; + typedef ac_std_float helper_t; +#endif + typedef ac_float ac_float_t; + data_t d; + ac_ieee_float_base() {} + ac_ieee_float_base(const ac_ieee_float_base &f) : d(f.d) {} + explicit ac_ieee_float_base(const helper_t &op) : d(op.data()) {} + explicit ac_ieee_float_base(double f) { ac::copy_bits(f, &d); } +protected: + helper_t to_helper_t() const { + helper_t x; + x.set_data(d); + return x; + } +public: +#if __cplusplus > 199711L + explicit operator double() const { + double f; + ac::copy_bits(d, &f); + return f; + } +#endif + double to_double() const { + double f; + ac::copy_bits(d, &f); + return f; + } + void set_data(long long op) { ac::copy_bits(op, &d); } + void set_data(double op) { ac::copy_bits(op, &d); } + void set_data(const ac_int &op) { ac::copy_bits(op, &d); } + const data_t &data() const { return d; } + ac_int data_ac_int() const { + ac_int x; + ac::copy_bits(d, &x); + return x; + } + bool signbit() const { + long long x; ac::copy_bits(d, &x); + return x < 0; + } + void set_signbit(bool s) { + ac_int t; + ac::copy_bits(d, &t); + t[width-1] = s; + ac::copy_bits(t, &d); + } +}; + +inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { + os << x.to_double(); + return os; +} + +namespace ac_private { + template + struct ac_ieee_float_constructor {}; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; +} + +template +class ac_ieee_float : public ac_ieee_float_base { +public: + typedef ac_ieee_float_base Base; + template + struct rt_T { + typedef typename ac_private::rt_closed_T::type mult; + typedef typename ac_private::rt_closed_T::type plus; + typedef typename ac_private::rt_closed_T::type minus; + typedef typename ac_private::rt_closed_T::type minus2; + typedef typename ac_private::rt_closed_T::type logic; + typedef typename ac_private::rt_closed_T::type div; + typedef typename ac_private::rt_closed_T::type div2; + }; + struct rt_unary { + typedef ac_ieee_float neg; + typedef ac_ieee_float mag_sqr; + typedef ac_ieee_float mag; + }; + static const int width = Base::width; + static const int e_width = Base::e_width; + static const int lls = width >> 6; + typedef typename Base::data_t data_t; + typedef typename Base::helper_t helper_t; + typedef typename Base::ac_float_t ac_float_t; + typedef ac_std_float ac_std_float_t; +public: + static ac_ieee_float nan() { return ac_ieee_float(helper_t::nan()); } + static ac_ieee_float inf() { return ac_ieee_float(helper_t::inf()); } + 
static ac_ieee_float denorm_min() { return ac_ieee_float(helper_t::denorm_min()); } + static ac_ieee_float min() { return ac_ieee_float(helper_t::min()); } + static ac_ieee_float max() { return ac_ieee_float(helper_t::max()); } + static ac_ieee_float epsilon() { return ac_ieee_float(helper_t::epsilon()); } + static ac_ieee_float zero() { return ac_ieee_float(ac_std_float_t::zero()); } + static ac_ieee_float one() { return ac_ieee_float(ac_std_float_t::one()); } + ac_ieee_float() {} +private: + ac_ieee_float(const Base &f) : Base(f) {} +public: + ac_ieee_float(const ac_std_float &f) : Base(f) {} + ac_ieee_float(const ac_ieee_float &f) : Base(f) {} + template + explicit ac_ieee_float(const ac_ieee_float &f) : Base(ac_std_float_t(f.to_ac_std_float())) {} + template + explicit ac_ieee_float(const ac_std_float &f) : Base(ac_std_float_t(f)) {} + explicit ac_ieee_float(const ac::bfloat16 &f); + explicit ac_ieee_float(const ac_float_t &f) : Base(ac_std_float_t(f)) {} + template + explicit ac_ieee_float(const ac_fixed &fx) : Base(ac_std_float_t(fx)) {} + template + explicit ac_ieee_float(const ac_float &f) : Base(ac_std_float_t(f)) {} + template + ac_ieee_float to_ac_ieee_float() const { return ac_ieee_float(*this); } + const ac_float_t to_ac_float() const { + return to_ac_std_float().to_ac_float(); + } + const ac_std_float to_ac_std_float() const { + ac_std_float r; + r.set_data(data_ac_int()); + return r; + } + template + ac_fixed convert_to_ac_fixed(bool map_inf=false) const { + return to_ac_std_float().template convert_to_ac_fixed(map_inf); + } + void set_data(const data_t &data) { + Base::set_data(data); + } + const ac_int data_ac_int() const { return Base::data_ac_int(); } + const data_t &data() const { return Base::d; } + template + ac_ieee_float(const T &f, typename ac_private::template ac_ieee_float_constructor::type d = 0) : Base(ac_std_float_t(f)) {} + template + explicit ac_ieee_float(const T &f, typename ac_private::template ac_ieee_float_constructor::type_explicit d = 0) : Base(ac_std_float_t(f)) {} + explicit ac_ieee_float(int x) { + *this = ac_ieee_float(ac_fixed<32,32,true>(x)); + } + explicit ac_ieee_float(long long x) { + *this = ac_ieee_float(ac_fixed<64,64,true>(x)); + } + int fpclassify() const { return Base::to_helper_t().fpclassify(); } + bool isfinite() const { return Base::to_helper_t().isfinite(); } + bool isnormal() const { return Base::to_helper_t().isnormal(); } + bool isinf() const { return Base::to_helper_t().isinf(); } + bool isnan() const { return Base::to_helper_t().isnan(); } + + template + ac_ieee_float add(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t().template add(op2.Base::to_helper_t()))); + } + template + ac_ieee_float sub(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t().template sub(op2.Base::to_helper_t()))); + } + template + ac_ieee_float mult(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t().template mult(op2.Base::to_helper_t()))); + } + template + ac_ieee_float div(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t().template div(op2.Base::to_helper_t()))); + } + template + ac_ieee_float fma(const ac_ieee_float &op2, const ac_ieee_float &op3) const { + return ac_ieee_float(Base(Base::to_helper_t().template fma(op2.Base::to_helper_t(), op3.Base::to_helper_t()))); + } + template + ac_ieee_float sqrt() const { + return ac_ieee_float(Base(Base::to_helper_t().template sqrt())); + } + + ac_ieee_float operator +(const ac_ieee_float 
&op2) const { + return ac_ieee_float(Base(Base::to_helper_t() + op2.Base::to_helper_t())); + } + ac_ieee_float operator -(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t() - op2.Base::to_helper_t())); + } + ac_ieee_float operator *(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t() * op2.Base::to_helper_t())); + } + ac_ieee_float operator /(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t() / op2.Base::to_helper_t())); + } + + ac_ieee_float &operator +=(const ac_ieee_float &op2) { + return *this = operator +(op2); + } + ac_ieee_float &operator -=(const ac_ieee_float &op2) { + return *this = operator -(op2); + } + ac_ieee_float &operator *=(const ac_ieee_float &op2) { + return *this = operator *(op2); + } + ac_ieee_float &operator /=(const ac_ieee_float &op2) { + return *this = operator /(op2); + } + + bool operator ==(const ac_ieee_float &op2) const { + return Base::to_helper_t() == op2.Base::to_helper_t(); + } + bool operator !=(const ac_ieee_float &op2) const { + return Base::to_helper_t() != op2.Base::to_helper_t(); + } + bool operator <(const ac_ieee_float &op2) const { + return Base::to_helper_t() < op2.Base::to_helper_t(); + } + bool operator >=(const ac_ieee_float &op2) const { + return Base::to_helper_t() >= op2.Base::to_helper_t(); + } + bool operator >(const ac_ieee_float &op2) const { + return Base::to_helper_t() > op2.Base::to_helper_t(); + } + bool operator <=(const ac_ieee_float &op2) const { + return Base::to_helper_t() <= op2.Base::to_helper_t(); + } + + ac_ieee_float operator -() const { + ac_ieee_float r(*this); + r.set_signbit(!this->signbit()); + return r; + } + ac_ieee_float operator +() const { + return ac_ieee_float(*this); + } + ac_ieee_float abs() const { + ac_ieee_float r(*this); + r.set_signbit(false); + return r; + } + ac_ieee_float copysign(const ac_ieee_float &op2) const { + ac_ieee_float r(*this); + r.set_signbit(this->signbit()); + return r; + } + bool signbit() const { return Base::signbit(); } + ac_ieee_float add(const ac_ieee_float &op1, const ac_ieee_float &op2) { + return *this = op1 + op2; + } + ac_ieee_float ceil() const { + return ac_ieee_float(Base(Base::to_helper_t().ceil())); + } + ac_ieee_float floor() const { + return ac_ieee_float(Base(Base::to_helper_t().floor())); + } + ac_ieee_float trunc() const { + return ac_ieee_float(Base(Base::to_helper_t().trunc())); + } + ac_ieee_float round() const { + return ac_ieee_float(Base(Base::to_helper_t().round())); + } + ac_ieee_float sub(const ac_ieee_float &op1, const ac_ieee_float &op2) { + return *this = op1 - op2; + } + ac_ieee_float mult(const ac_ieee_float &op1, const ac_ieee_float &op2) { + return *this = op1 * op2; + } + ac_ieee_float div(const ac_ieee_float &op1, const ac_ieee_float &op2) { + return *this = op1 / op2; + } +}; + +template +inline std::ostream& operator << (std::ostream &os, const ac_ieee_float &x) { + os << (const ac_ieee_float_base&) x; + return os; +} + +namespace ac { +class bfloat16 { +public: + template + struct rt_T { + typedef typename ac_private::rt_closed_T::type mult; + typedef typename ac_private::rt_closed_T::type plus; + typedef typename ac_private::rt_closed_T::type minus; + typedef typename ac_private::rt_closed_T::type minus2; + typedef typename ac_private::rt_closed_T::type logic; + typedef typename ac_private::rt_closed_T::type div; + typedef typename ac_private::rt_closed_T::type div2; + }; + struct rt_unary { + typedef bfloat16 neg; + typedef bfloat16 mag_sqr; + typedef 
bfloat16 mag; + }; + static const int width = 16; + static const int e_width = 8; + static bfloat16 nan() { return bfloat16(helper_t::nan()); } + static bfloat16 inf() { return bfloat16(helper_t::inf()); } + static bfloat16 denorm_min() { return bfloat16(helper_t::denorm_min()); } + static bfloat16 min() { return bfloat16(helper_t::min()); } + static bfloat16 max() { return bfloat16(helper_t::max()); } + static bfloat16 epsilon() { return bfloat16(helper_t::epsilon()); } + static bfloat16 zero() { return bfloat16(ac_std_float_t::zero()); } + static bfloat16 one() { return bfloat16(ac_std_float_t::one()); } + typedef ac_std_float helper_t; + typedef short data_t; + typedef ac_float ac_float_t; + typedef ac_std_float ac_std_float_t; + data_t d; + bfloat16() {} + bfloat16(const bfloat16 &f) : d(f.d) {} + bfloat16(const ac_std_float_t &op) : d(op.data()) {} + bfloat16(float f) { int x; ac::copy_bits(f, &x); d = (short) (x >> 16); } + template + explicit bfloat16(const ac_std_float &f) { + *this = f.template convert(); + } + template + explicit bfloat16(const ac_std_float &f) { + *this = f.template convert(); + } + template + explicit bfloat16(const ac_ieee_float &f) { + *this = f.to_ac_std_float().template convert(); + } + template + explicit bfloat16(const ac_fixed &fx) { + ac_std_float_t x; + x.assign_from(fx); + *this = x; + } +private: + const helper_t to_helper_t() const { + helper_t x; + x.set_data(d); + return x; + } +public: + const ac_std_float_t to_ac_std_float() const { + ac_std_float_t x; + x.set_data(d); + return x; + } + const ac_float_t to_ac_float() const { + return ac_std_float_t().to_ac_float(); + } + template + ac_fixed convert_to_ac_fixed(bool map_inf=false) const { + return to_ac_std_float().template convert_to_ac_fixed(map_inf); + } + float to_float() const { + return to_ac_std_float().to_float(); + } + double to_double() const { + return to_ac_std_float().to_double(); + } + // operator is efficient since E is identical and mantissa is longer +#if __cplusplus > 199711L + explicit operator float() const { return to_float(); } +#endif + int fpclassify() const { return to_helper_t().fpclassify(); } + bool isfinite() const { return to_helper_t().isfinite(); } + bool isnormal() const { return to_helper_t().isnormal(); } + bool isinf() const { return to_helper_t().isinf(); } + bool isnan() const { return to_helper_t().isnan(); } + void set_data(short op) { ac::copy_bits(op, &d); } + void set_data(const ac_int &op) { ac::copy_bits(op, &d); } + const data_t &data() const { return d; } + ac_int<16,true> data_ac_int() const { return ac_int<16,true>(d); } + + // mirroed most constructors in tensorflow implementation (except template version) + // tensorflow uses static_cast + // this implementation goes through ac_std_float so there is no dependency on rounding mode +// template +// explicit bfloat16(const T& val) { *this = bfloat16(static_cast(val)); } + explicit bfloat16(unsigned short val) { + ac_std_float_t t; + t.assign_from( ac_int<16,false>(val) ); + *this = t; + } + explicit bfloat16(int val) { + ac_std_float_t t; + t.assign_from( ac_int<32,true>(val) ); + *this = t; + } + explicit bfloat16(unsigned int val) { + ac_std_float_t t; + t.assign_from( ac_int<32,false>(val) ); + *this = t; + } + explicit bfloat16(long val) { + const int long_w = ac_private::long_w; + ac_std_float_t t; + t.assign_from( ac_int(val) ); + *this = t; + } + explicit bfloat16(long long val) { + ac_std_float_t t; + t.assign_from( ac_int<64,false>(val) ); + *this = t; + } + explicit bfloat16(double val) { 
*this = bfloat16(ac_ieee_float(val)); } + + template + bfloat16 add(const bfloat16 &op2) const { + return bfloat16(to_helper_t().add(op2.to_helper_t())); + } + template + bfloat16 sub(const bfloat16 &op2) const { + return bfloat16(to_helper_t().sub(op2.to_helper_t())); + } + template + bfloat16 mult(const bfloat16 &op2) const { + return bfloat16(to_helper_t().mult(op2.to_helper_t())); + } + template + bfloat16 div(const bfloat16 &op2) const { + return bfloat16(to_helper_t().div(op2.to_helper_t())); + } + template + bfloat16 fma(const bfloat16 &op2, const bfloat16 &op3) const { + return bfloat16(to_helper_t().fma(op2.to_helper_t(), op3.to_helper_t())); + } + template + bfloat16 sqrt() const { + return bfloat16(to_helper_t().sqrt()); + } + + bfloat16 operator +(const bfloat16 &op2) const { + return bfloat16(to_helper_t().add(op2.to_helper_t())); + } + bfloat16 operator -(const bfloat16 &op2) const { + return bfloat16(to_helper_t().sub(op2.to_helper_t())); + } + bfloat16 operator *(const bfloat16 &op2) const { + return bfloat16(to_helper_t().mult(op2.to_helper_t())); + } + bfloat16 operator /(const bfloat16 &op2) const { + return bfloat16(to_helper_t().div(op2.to_helper_t())); + } + bfloat16 &operator +=(const bfloat16 &op2) { + return *this = operator +(op2); + } + bfloat16 &operator -=(const bfloat16 &op2) { + return *this = operator -(op2); + } + bfloat16 &operator *=(const bfloat16 &op2) { + return *this = operator *(op2); + } + bfloat16 &operator /=(const bfloat16 &op2) { + return *this = operator /(op2); + } + + bool operator ==(const bfloat16 &op2) const { + return to_helper_t() == op2.to_helper_t(); + } + bool operator !=(const bfloat16 &op2) const { + return to_helper_t() != op2.to_helper_t(); + } + bool operator <(const bfloat16 &op2) const { + return to_helper_t() < op2.to_helper_t(); + } + bool operator >=(const bfloat16 &op2) const { + return to_helper_t() >= op2.to_helper_t(); + } + bool operator >(const bfloat16 &op2) const { + return to_helper_t() > op2.to_helper_t(); + } + bool operator <=(const bfloat16 &op2) const { + return to_helper_t() <= op2.to_helper_t(); + } + + bfloat16 operator -() const { + bfloat16 r(*this); + r.set_signbit(!this->signbit()); + return r; + } + bfloat16 operator +() const { + return bfloat16(*this); + } + bfloat16 abs() const { + bfloat16 r(*this); + r.set_signbit(false); + return r; + } + bfloat16 copysign(const bfloat16 &op2) const { + bfloat16 r(*this); + r.set_signbit(this->signbit()); + return r; + } + bool signbit() const { return d < 0; } + void set_signbit(bool s) { + ac_int t(d); + t[width-1] = s; + d = t; + } + bfloat16 ceil() const { return to_helper_t().ceil(); } + bfloat16 floor() const { return to_helper_t().floor(); } + bfloat16 trunc() const { return to_helper_t().trunc(); } + bfloat16 round() const { return to_helper_t().round(); } +}; + +inline std::ostream& operator << (std::ostream &os, const ac::bfloat16 &x) { + os << x.to_float(); + return os; +} + +} + +template +template +inline ac_std_float::ac_std_float(const ac_ieee_float &f) { + *this = ac_std_float(f.to_ac_std_float()); +} + +template +inline ac_std_float::ac_std_float(const ac::bfloat16 &f) { + *this = ac_std_float(f.to_ac_std_float()); +} + +template +inline ac_ieee_float::ac_ieee_float(const ac::bfloat16 &f) { + *this = ac_ieee_float(f.to_ac_std_float()); +} + +typedef ac_ieee_float ac_ieee_float16; +typedef ac_ieee_float ac_ieee_float32; +typedef ac_ieee_float ac_ieee_float64; +typedef ac_ieee_float ac_ieee_float128; +typedef ac_ieee_float ac_ieee_float256; + + 
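+// ---------------------------------------------------------------------------
+// Editor's note: the function below is an illustrative sketch added for this
+// review, not part of the original vendor header. It shows the fixed-format
+// typedefs above in use; the operators round with the header's default mode
+// (AC_RND_CONV, round to nearest even, in the upstream AC sources).
+inline float ac_ieee_float_usage_example() {
+  ac_ieee_float32 a(1.5f), b(0.25f);
+  ac_ieee_float32 c = a * b + a;  // 0.375 + 1.5 == 1.875, exact in binary32
+  ac_ieee_float16 h(c);           // explicit narrowing conversion to binary16
+  ac::bfloat16 bf(c.to_float());  // float -> bfloat16 truncates the mantissa
+  return c.to_float() + h.to_float() + bf.to_float();  // 1.875f * 3
+}
+// ---------------------------------------------------------------------------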
+#ifdef __AC_NAMESPACE
+}
+#endif
+
+// Global functions for ac_ieee_float
+namespace std {
+#ifdef __AC_NAMESPACE
+using namespace __AC_NAMESPACE;
+#endif
+template <ac_ieee_float_format Format>
+inline ac_ieee_float<Format> abs(const ac_ieee_float<Format> &x) { return x.abs(); }
+template <ac_ieee_float_format Format>
+inline ac_ieee_float<Format> fabs(const ac_ieee_float<Format> &x) { return x.abs(); }
+
+template <ac_ieee_float_format Format>
+inline ac_ieee_float<Format> copysign(const ac_ieee_float<Format> &x, const ac_ieee_float<Format> &y) { return x.copysign(y); }
+
+template <ac_ieee_float_format Format>
+inline int fpclassify(const ac_ieee_float<Format> &x) { return x.fpclassify(); }
+template <ac_ieee_float_format Format>
+inline bool isfinite(const ac_ieee_float<Format> &x) { return x.isfinite(); }
+template <ac_ieee_float_format Format>
+inline bool isnormal(const ac_ieee_float<Format> &x) { return x.isnormal(); }
+template <ac_ieee_float_format Format>
+inline bool isinf(const ac_ieee_float<Format> &x) { return x.isinf(); }
+template <ac_ieee_float_format Format>
+inline bool isnan(const ac_ieee_float<Format> &x) { return x.isnan(); }
+
+// Don't do "long double" versions since "long double" is 80-bit extended precision
+// TODO: fmod, fmodf, fmodl
+// TODO: fmod, remainder, remquo, fma, fmax, fmin, fdim
+// remainder(x,y), x - n*y, where n = x/y rounded to the nearest integer (RND_CONV)
+// remquo(x,y, int *quo), returns the same as remainder; unclear what quo is, also NaN, inf etc
+// fmax, fmin: if one number is NaN, the other is returned
+// fdim(x,y) returns max(x-y,0); if x or y is NaN, a NaN is returned; if the result overflows, HUGE_VAL is returned
+// TODO: ceil, floor, trunc, round, lround, nearbyint, rint, lrint, llround, llrint
+// if x is +0, -0, NaN or Inf, x is returned
+// ceil(x), floor(x), trunc(x)
+// round(x) : RND_INF
+// nearbyint: depends on rounding mode
+// rint: same as nearbyint, but may raise the inexact exception (FE_INEXACT)
+// TODO: frexp, ldexp, modf, nextafter, nexttoward, copysign
+// modf(x, *iptr), modff: break into integral (*iptr) and fractional (returned) values
+// Don't cause exception: isgreater, isgreaterequal, isless, islessequal, islessgreater, isunordered
+// isunordered: x or y is NaN
+template <ac_ieee_float_format Format>
+inline bool signbit(const ac_ieee_float<Format> &x) { return x.signbit(); }
+
+// Global functions for bfloat16
+inline bool signbit(const ac::bfloat16 &x) { return x.signbit(); }
+
+inline int fpclassify(const ac::bfloat16 &x) { return x.fpclassify(); }
+inline bool isfinite(const ac::bfloat16 &x) { return x.isfinite(); }
+inline bool isnormal(const ac::bfloat16 &x) { return x.isnormal(); }
+inline bool isinf(const ac::bfloat16 &x) { return x.isinf(); }
+inline bool isnan(const ac::bfloat16 &x) { return x.isnan(); }
+}
+
+#undef __AC_DATA_PRIVATE
+#undef AC_STD_FLOAT_FX_DIV_OVERRIDE
+
+#endif
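
A quick sanity sketch of the bfloat16 wrapper added above (illustrative only, not part of the patch; it assumes the header is available as ac_std_float.h on the include path). Every conversion and arithmetic operation routes through ac_std_float<16,8>, so results do not depend on the host's floating-point rounding mode:

    // Illustrative usage sketch (not part of the patch).
    #include "ac_std_float.h"
    #include <cassert>

    int main() {
        ac::bfloat16 a(1.5f);           // float ctor keeps the top 16 bits (truncation)
        ac::bfloat16 b(0.25f);
        ac::bfloat16 c = a + b;         // arithmetic goes through ac_std_float<16,8>
        assert(c.to_float() == 1.75f);  // exactly representable in 8 mantissa bits
        assert(!c.signbit());
        return 0;
    }
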
diff --git a/hls4ml/templates/quartus/ac_types/stream.h b/hls4ml/templates/quartus/ac_types/stream.h
index b19ad74d66..7084644994 100644
--- a/hls4ml/templates/quartus/ac_types/stream.h
+++ b/hls4ml/templates/quartus/ac_types/stream.h
@@ -1,36 +1,36 @@
-#ifndef NNET_STREAM_H
-#define NNET_STREAM_H
-
-#include <deque>
-
-namespace nnet {
-
-/*
-* A struct with the same high-level functionality as Intel's HLS ihc::stream
-* This struct is used during GCC compilation / hls4ml model.predict(...)
-* This is because GCC does not have access to HLS source files (ihc::stream)
-* Software-wise, this struct behaves like a first-in, first-out (FIFO) buffer
-* However, it cannot be used for HLS synthesis, since it uses dynamic memory allocation (deque)
-*/
-template <typename T>
-struct stream {
-  private:
-    std::deque<T> _data;
-
-  public:
-    stream() {}
-
-    T read() {
-        T element = _data.front();
-        _data.pop_front();
-        return element;
-    }
-
-    void write(const T& element) {
-        _data.push_back(element);
-    }
-};
-
-}
-
+#ifndef NNET_STREAM_H
+#define NNET_STREAM_H
+
+#include <deque>
+
+namespace nnet {
+
+/*
+* A struct with the same high-level functionality as Intel's HLS ihc::stream
+* This struct is used during GCC compilation / hls4ml model.predict(...)
+* This is because GCC does not have access to HLS source files (ihc::stream)
+* Software-wise, this struct behaves like a first-in, first-out (FIFO) buffer
+* However, it cannot be used for HLS synthesis, since it uses dynamic memory allocation (deque)
+*/
+template <typename T>
+struct stream {
+  private:
+    std::deque<T> _data;
+
+  public:
+    stream() {}
+
+    T read() {
+        T element = _data.front();
+        _data.pop_front();
+        return element;
+    }
+
+    void write(const T& element) {
+        _data.push_back(element);
+    }
+};
+
+}
+
 #endif
\ No newline at end of file
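
Since the struct above only models ihc::stream in software, its behaviour is easy to pin down with a plain GCC test (a sketch, assuming __INTELFPGA_COMPILER__ is undefined so the nnet path is used). Note that, unlike a hardware stream, read() on an empty FIFO is undefined behaviour rather than a blocking wait:

    // FIFO-ordering sketch for the software-only nnet::stream (not synthesizable).
    #include "stream.h"
    #include <cassert>

    int main() {
        nnet::stream<int> s;
        s.write(1);
        s.write(2);
        assert(s.read() == 1); // first in, first out
        assert(s.read() == 2);
        return 0;
    }
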
diff --git a/hls4ml/templates/quartus/firmware/defines.h b/hls4ml/templates/quartus/firmware/defines.h
index c3fe4ec402..49781dc963 100644
--- a/hls4ml/templates/quartus/firmware/defines.h
+++ b/hls4ml/templates/quartus/firmware/defines.h
@@ -1,47 +1,47 @@
-#ifndef DEFINES_H_
-#define DEFINES_H_
-
-/*
- * Intel HLS makes use of three streaming interfaces:
- * (1) stream_in - used as the main input to a component
- * (2) stream_out - used as the main output of a component
- * (3) stream - allows both reading and writing; used for inter-component connections
- * ihc::stream has an implicitly deleted constructor and therefore cannot be used as the output of a function/component
- * Therefore, variables of type 'stream' are always passed by reference
- */
-
-#ifndef __INTELFPGA_COMPILER__
-
-#include "ac_fixed.h"
-#include "ac_int.h"
-#define hls_register
-
-#include "stream.h"
-template <typename T> using stream = nnet::stream<T>;
-template <typename T> using stream_in = nnet::stream<T>;
-template <typename T> using stream_out = nnet::stream<T>;
-
-#else
-
-#include "HLS/ac_fixed.h"
-#include "HLS/ac_int.h"
-#include "HLS/hls.h"
-
-template <typename T> using stream = ihc::stream<T>;
-template <typename T> using stream_in = ihc::stream_in<T>;
-template <typename T> using stream_out = ihc::stream_out<T>;
-
-#endif
-
-// Include nnet::array - a custom array-like struct, mainly used with io_stream
-#include "nnet_utils/nnet_types.h"
-
-// hls-fpga-machine-learning insert numbers
-
-// hls-fpga-machine-learning insert layer-precision
-
-#define DIV_ROUNDUP(n, d) ((n + d - 1) / d)
-#define MIN(n, d) (n > d ? d : n)
-#define MAX(n, d) (n < d ? d : n)
-
-#endif
+#ifndef DEFINES_H_
+#define DEFINES_H_
+
+/*
+ * Intel HLS makes use of three streaming interfaces:
+ * (1) stream_in - used as the main input to a component
+ * (2) stream_out - used as the main output of a component
+ * (3) stream - allows both reading and writing; used for inter-component connections
+ * ihc::stream has an implicitly deleted constructor and therefore cannot be used as the output of a function/component
+ * Therefore, variables of type 'stream' are always passed by reference
+ */
+
+#ifndef __INTELFPGA_COMPILER__
+
+#include "ac_fixed.h"
+#include "ac_int.h"
+#define hls_register
+
+#include "stream.h"
+template <typename T> using stream = nnet::stream<T>;
+template <typename T> using stream_in = nnet::stream<T>;
+template <typename T> using stream_out = nnet::stream<T>;
+
+#else
+
+#include "HLS/ac_fixed.h"
+#include "HLS/ac_int.h"
+#include "HLS/hls.h"
+
+template <typename T> using stream = ihc::stream<T>;
+template <typename T> using stream_in = ihc::stream_in<T>;
+template <typename T> using stream_out = ihc::stream_out<T>;
+
+#endif
+
+// Include nnet::array - a custom array-like struct, mainly used with io_stream
+#include "nnet_utils/nnet_types.h"
+
+// hls-fpga-machine-learning insert numbers
+
+// hls-fpga-machine-learning insert layer-precision
+
+#define DIV_ROUNDUP(n, d) ((n + d - 1) / d)
+#define MIN(n, d) (n > d ? d : n)
+#define MAX(n, d) (n < d ? d : n)
+
+#endif
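
The macros at the bottom of defines.h are ordinary integer helpers: DIV_ROUNDUP(n, d) is ceiling division, e.g. DIV_ROUNDUP(10, 4) = (10 + 4 - 1) / 4 = 3, which the nnet functions use to split work across reuse factors. A small sketch of the aliases and macros together (layer_t here is a placeholder precision, not something this patch generates):

    // Sketch only; generated projects define their own types via defines.h.
    #include "defines.h"

    typedef ac_fixed<16, 6, true> layer_t;               // placeholder precision
    static_assert(DIV_ROUNDUP(10, 4) == 3, "ceil(10/4) == 3");

    // Streams cannot be copied, so they are always passed by reference.
    void forward_one(stream_in<layer_t> &in, stream_out<layer_t> &out) {
        out.write(in.read());
    }
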
diff --git a/hls4ml/templates/quartus/firmware/myproject.cpp b/hls4ml/templates/quartus/firmware/myproject.cpp
index acdde092ff..3f5749d611 100644
--- a/hls4ml/templates/quartus/firmware/myproject.cpp
+++ b/hls4ml/templates/quartus/firmware/myproject.cpp
@@ -1,48 +1,48 @@
-#include "myproject.h"
-#include "parameters.h"
-
-// hls-fpga-machine-learning insert weights
-
-/*
- * Intel HLS requires that all 'stream' types are:
- * (1) Passed by reference to the top-level entity or
- * (2) Declared as global variables, outside of the main function
- * Therefore, layer inputs/output (connections between individual layers) are declared here
- */
-// hls-fpga-machine-learning insert inter-task streams
-
-#ifndef __INTELFPGA_COMPILER__
-/*
-* The top-level function used during GCC compilation / hls4ml.predict(...) goes here
-* An important distinction is made between io_stream and io_parallel:
-* (1) io_parallel:
-    - Top-level function takes a struct containing an array as function argument
-    - Returns a struct containing an array - the prediction
-  (2) io_stream:
-    - Top-level function is 'void' - no return value
-    - Instead, both the input and output are passed by reference
-    - This is due to the HLS Streaming Interfaces; a stream cannot be copied (implicitly deleted copy constructor)
-* This distinction is handled in quartus_writer.py
-*/
-// hls-fpga-machine-learning instantiate GCC top-level
-#else
-// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here
-// hls-fpga-machine-learning insert cpragmas
-
-/*
- * The top-level function used during HLS Synthesis goes here
- * In a similar manner to GCC, there is a distinction between io_stream & io_parallel
- */
-// hls-fpga-machine-learning instantiate HLS top-level
-#endif
-// If using io_parallel, the output needs to be initialised and returned at the end of this function
-// If using io_stream, no output is initialised, as it is passed by reference to the top-level function
-// hls-fpga-machine-learning initialize input/output
-
-// ****************************************
-// NETWORK INSTANTIATION
-// ****************************************
-
-// hls-fpga-machine-learning insert layers
-
-// hls-fpga-machine-learning return
+#include "myproject.h"
+#include "parameters.h"
+
+// hls-fpga-machine-learning insert weights
+
+/*
+ * Intel HLS requires that all 'stream' types are:
+ * (1) Passed by reference to the top-level entity or
+ * (2) Declared as global variables, outside of the main function
+ * Therefore, layer inputs/output (connections between individual layers) are declared here
+ */
+// hls-fpga-machine-learning insert inter-task streams
+
+#ifndef __INTELFPGA_COMPILER__
+/*
+* The top-level function used during GCC compilation / hls4ml.predict(...) goes here
+* An important distinction is made between io_stream and io_parallel:
+* (1) io_parallel:
+    - Top-level function takes a struct containing an array as function argument
+    - Returns a struct containing an array - the prediction
+  (2) io_stream:
+    - Top-level function is 'void' - no return value
+    - Instead, both the input and output are passed by reference
+    - This is due to the HLS Streaming Interfaces; a stream cannot be copied (implicitly deleted copy constructor)
+* This distinction is handled in quartus_writer.py
+*/
+// hls-fpga-machine-learning instantiate GCC top-level
+#else
+// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here
+// hls-fpga-machine-learning insert cpragmas
+
+/*
+ * The top-level function used during HLS Synthesis goes here
+ * In a similar manner to GCC, there is a distinction between io_stream & io_parallel
+ */
+// hls-fpga-machine-learning instantiate HLS top-level
+#endif
+// If using io_parallel, the output needs to be initialised and returned at the end of this function
+// If using io_stream, no output is initialised, as it is passed by reference to the top-level function
+// hls-fpga-machine-learning initialize input/output
+
+// ****************************************
+// NETWORK INSTANTIATION
+// ****************************************
+
+// hls-fpga-machine-learning insert layers
+
+// hls-fpga-machine-learning return
diff --git a/hls4ml/templates/quartus/firmware/myproject.h b/hls4ml/templates/quartus/firmware/myproject.h
index d0f577d14d..afb7020671 100644
--- a/hls4ml/templates/quartus/firmware/myproject.h
+++ b/hls4ml/templates/quartus/firmware/myproject.h
@@ -1,48 +1,48 @@
-#ifndef MYPROJECT_H_
-#define MYPROJECT_H_
-
-#ifndef __INTELFPGA_COMPILER__
-#include "ac_fixed.h"
-#include "ac_int.h"
-#define hls_register
-#else
-#include "HLS/ac_fixed.h"
-#include "HLS/ac_int.h"
-#include "HLS/hls.h"
-#endif
-
-// Streams are explicitly defined in defines.h, which is included by parameters.h
-// Defining them again in this file will cause compile-time errors
-#include "defines.h"
-
-// If using io_parallel, inputs and output need to be initialised before calling the top-level function
-// If using io_stream, no inputs/outputs are initialised, as they are passed by reference to the top-level function
-// hls-fpga-machine-learning insert inputs
-// hls-fpga-machine-learning insert outputs
-
-#ifndef __INTELFPGA_COMPILER__
-/*
-* The top-level function used during GCC compilation / hls4ml.predict(...) goes here
-* An important distinction is made between io_stream and io_parallel:
-* (1) io_parallel:
-    - Top-level function takes a struct containing an array as function argument
-    - Returns a struct containing an array - the prediction
-  (2) io_stream:
-    - Top-level function is 'void' - no return value
-    - Instead, both the input and output are passed by reference
-    - This is due to the HLS Streaming Interfaces; a stream cannot be copied (implicitly deleted copy constructor)
-* This distinction is handled in quartus_writer.py
-*/
-// hls-fpga-machine-learning instantiate GCC top-level
-#else
-// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here
-// hls-fpga-machine-learning insert cpragmas
-
-/*
- * The top-level function used during HLS Synthesis goes here
- * In a similar manner to GCC, there is a distinction between io_stream & io_parallel
- */
-// hls-fpga-machine-learning instantiate HLS top-level
-#endif
-
-#endif
+#ifndef MYPROJECT_H_
+#define MYPROJECT_H_
+
+#ifndef __INTELFPGA_COMPILER__
+#include "ac_fixed.h"
+#include "ac_int.h"
+#define hls_register
+#else
+#include "HLS/ac_fixed.h"
+#include "HLS/ac_int.h"
+#include "HLS/hls.h"
+#endif
+
+// Streams are explicitly defined in defines.h, which is included by parameters.h
+// Defining them again in this file will cause compile-time errors
+#include "defines.h"
+
+// If using io_parallel, inputs and output need to be initialised before calling the top-level function
+// If using io_stream, no inputs/outputs are initialised, as they are passed by reference to the top-level function
+// hls-fpga-machine-learning insert inputs
+// hls-fpga-machine-learning insert outputs
+
+#ifndef __INTELFPGA_COMPILER__
+/*
+* The top-level function used during GCC compilation / hls4ml.predict(...) goes here
+* An important distinction is made between io_stream and io_parallel:
+* (1) io_parallel:
+    - Top-level function takes a struct containing an array as function argument
+    - Returns a struct containing an array - the prediction
+  (2) io_stream:
+    - Top-level function is 'void' - no return value
+    - Instead, both the input and output are passed by reference
+    - This is due to the HLS Streaming Interfaces; a stream cannot be copied (implicitly deleted copy constructor)
+* This distinction is handled in quartus_writer.py
+*/
+// hls-fpga-machine-learning instantiate GCC top-level
+#else
+// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here
+// hls-fpga-machine-learning insert cpragmas
+
+/*
+ * The top-level function used during HLS Synthesis goes here
+ * In a similar manner to GCC, there is a distinction between io_stream & io_parallel
+ */
+// hls-fpga-machine-learning instantiate HLS top-level
+#endif
+
+#endif
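
Concretely, the two conventions described in the comments above produce top-level signatures of roughly the following shape (illustrative only: the real names, types and array sizes are emitted per model by quartus_writer.py):

    // Illustrative shapes only; actual code is generated per model.
    // io_parallel: struct-wrapped arrays in and out, prediction returned by value.
    struct input_data  { float data[16]; };   // placeholder size
    struct output_data { float data[4];  };   // placeholder size
    output_data myproject(input_data inputs);

    // io_stream: 'void' top level; both ends passed by reference, since
    // ihc::stream has a deleted copy constructor.
    void myproject_stream(stream_in<float> &inputs, stream_out<float> &outputs);
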
diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h
index 7b84a9c0f2..cda8e748a1 100644
--- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h
+++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h
@@ -1,104 +1,104 @@
-#ifndef NNET_BATCHNORM_H_
-#define NNET_BATCHNORM_H_
-
-#include "nnet_common.h"
-#include "nnet_helpers.h"
-#include "nnet_mult.h"
-
-namespace nnet {
-
-struct batchnorm_config {
-    // Internal data type definitions
-    typedef float bias_t;
-    typedef float scale_t;
-
-    // Layer Sizes
-    static const unsigned n_in = 10;
-    static const unsigned n_filt = -1;
-    static const unsigned n_scale_bias = 10;
-
-    // Resource reuse info
-    static const unsigned io_type = io_parallel;
-    static const unsigned reuse_factor = 1;
-    static const bool store_weights_in_bram = false;
-    static const unsigned n_zeros = 0;
-    // partitioning arrays cyclically to go with roll factors?
-
-    // Default multiplication
-    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
-};
-
-template <class data_T, class res_T, typename CONFIG_T>
-void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
-               const typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
-               const typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) {
-// Calculate result
-Result:
-    #pragma unroll
-    for (int ires = 0; ires < CONFIG_T::n_in; ires++) {
-        if (CONFIG_T::n_filt == -1) {
-            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[ires]) +
-                        bias[ires];
-        } else {
-            int norm_index = ires % CONFIG_T::n_filt;
-            res[ires] =
-                CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[norm_index]) +
-                bias[norm_index];
-        }
-    }
-}
-
-// ****************************************************
-// Merged Batch Normalization and Quantized Tanh
-// ****************************************************
-struct batchnorm_quantized_tanh_config {
-    // Layer Sizes
-    static const unsigned n_in = 10;
-    static const unsigned n_filt = -1;
-    static const unsigned n_scale_bias = 10;
-
-    // Resource reuse info
-    static const unsigned io_type = io_parallel;
-    static const unsigned reuse_factor = 1;
-    static const unsigned n_zeros = 0;
-};
-
-template <class data_T, typename CONFIG_T>
-void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in],
-                           const data_T threshold[CONFIG_T::n_scale_bias]) {
-    #pragma unroll
-    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
-        ac_int<1, false> cache;
-        data_T datareg = data[ii];
-        int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt;
-        if (datareg >= threshold[norm_index])
-            cache = 1;
-        else
-            cache = 0;
-
-        res[ii] = cache;
-    }
-}
-
-template <class data_T, typename CONFIG_T>
-void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ac_int<2, true> res[CONFIG_T::n_in],
-                            const data_T threshold_hi[CONFIG_T::n_scale_bias],
-                            const data_T threshold_lo[CONFIG_T::n_scale_bias]) {
-    #pragma unroll
-    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
-        ac_int<2, true> cache;
-        data_T datareg = data[ii];
-        int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt;
-        if (datareg > threshold_hi[norm_index])
-            cache = 1;
-        else if (datareg <= threshold_lo[norm_index])
-            cache = -1;
-        else
-            cache = 0;
-        res[ii] = cache;
-    }
-}
-
-} // namespace nnet
-
-#endif
+#ifndef NNET_BATCHNORM_H_
+#define NNET_BATCHNORM_H_
+
+#include "nnet_common.h"
+#include "nnet_helpers.h"
+#include "nnet_mult.h"
+
+namespace nnet {
+
+struct batchnorm_config {
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float scale_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0;
+    // partitioning arrays cyclically to go with roll factors?
+
+    // Default multiplication
+    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
+               const typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
+               const typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) {
+// Calculate result
+Result:
+    #pragma unroll
+    for (int ires = 0; ires < CONFIG_T::n_in; ires++) {
+        if (CONFIG_T::n_filt == -1) {
+            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[ires]) +
+                        bias[ires];
+        } else {
+            int norm_index = ires % CONFIG_T::n_filt;
+            res[ires] =
+                CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[norm_index]) +
+                bias[norm_index];
+        }
+    }
+}
+
+// ****************************************************
+// Merged Batch Normalization and Quantized Tanh
+// ****************************************************
+struct batchnorm_quantized_tanh_config {
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const unsigned n_zeros = 0;
+};
+
+template <class data_T, typename CONFIG_T>
+void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in],
+                           const data_T threshold[CONFIG_T::n_scale_bias]) {
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        ac_int<1, false> cache;
+        data_T datareg = data[ii];
+        int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt;
+        if (datareg >= threshold[norm_index])
+            cache = 1;
+        else
+            cache = 0;
+
+        res[ii] = cache;
+    }
+}
+
+template <class data_T, typename CONFIG_T>
+void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ac_int<2, true> res[CONFIG_T::n_in],
+                            const data_T threshold_hi[CONFIG_T::n_scale_bias],
+                            const data_T threshold_lo[CONFIG_T::n_scale_bias]) {
+    #pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        ac_int<2, true> cache;
+        data_T datareg = data[ii];
+        int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt;
+        if (datareg > threshold_hi[norm_index])
+            cache = 1;
+        else if (datareg <= threshold_lo[norm_index])
+            cache = -1;
+        else
+            cache = 0;
+        res[ii] = cache;
+    }
+}
+
+} // namespace nnet
+
+#endif
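
To show how a CONFIG_T drives nnet::normalize above, here is a hypothetical specialization (all values invented for illustration; generated projects emit equivalent structs into parameters.h):

    // Hypothetical configuration; values are for illustration only.
    #include "nnet_utils/nnet_batchnorm.h"

    struct bn_config : nnet::batchnorm_config {
        static const unsigned n_in = 4;         // elements per call
        static const unsigned n_scale_bias = 4; // per-element scale/bias (n_filt == -1)
        typedef ac_fixed<16, 6, true> scale_t;
        typedef ac_fixed<16, 6, true> bias_t;
    };

    // For n_filt == -1 this computes res[i] = data[i] * scale[i] + bias[i]:
    //   nnet::normalize<ac_fixed<16, 6, true>, ac_fixed<16, 6, true>, bn_config>(data, res, scale, bias);
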
diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h
index 1af60ab0c5..5e5c1fa24d 100644
--- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h
+++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h
@@ -1,71 +1,71 @@
-#ifndef NNET_COMMON_H_
-#define NNET_COMMON_H_
-
-#ifndef __INTELFPGA_COMPILER__
-#include "ac_fixed.h"
-#include "ac_int.h"
-#include "math.h"
-#else
-#include "HLS/ac_fixed.h"
-#include "HLS/ac_int.h"
-#include "HLS/math.h"
-#endif
-
-#include "nnet_helpers.h"
-
-typedef ac_fixed<16, 6> table_default_t;
-
-namespace nnet {
-
-// Common type definitions
-enum io_type { io_parallel = 0, io_stream };
-
-// Default data types (??) TODO: Deprecate
-typedef ac_fixed<16, 4> weight_t_def;
-typedef ac_fixed<16, 4> bias_t_def;
-typedef ac_fixed<32, 10> accum_t_def;
-
-template <class data_T, int NIN1, int NIN2> void merge(data_T data1[NIN1], data_T data2[NIN2], data_T res[NIN1 + NIN2]) {
-    #pragma unroll
-    for (int ii = 0; ii < NIN1; ii++) {
-        res[ii] = data1[ii];
-    }
-    #pragma unroll
-    for (int ii = 0; ii < NIN2; ii++) {
-        res[NIN1 + ii] = data2[ii];
-    }
-}
-
-/* ---
- * Balanced tree reduce implementation.
- * For use in scenarios where Quartus cannot apply expression balancing.
- * Reduces an array of inputs to a single value using the template binary operator 'Op',
- * for example summing all elements with Op_add, or finding the maximum with Op_max.
- * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section
- * before applying and accumulate the result over the rolled dimension.
- * --- */
-template <class T, int N, class Op> T reduce(const T *x, Op op) {
-    static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0;
-    static constexpr int rightN = N - leftN > 0 ? N - leftN : 0;
-    if (N == 1) {
-        return x[0];
-    }
-    if (N == 2) {
-        return op(x[0], x[1]);
-    }
-    return op(reduce<T, leftN, Op>(x, op), reduce<T, rightN, Op>(x + leftN, op));
-}
-
-template <class T> class Op_add {
-  public:
-    T operator()(T a, T b) { return a + b; }
-};
-
-template <class T> class Op_max {
-  public:
-    T operator()(T a, T b) { return a >= b ? a : b; }
-};
-
-} // namespace nnet
-
-#endif
+#ifndef NNET_COMMON_H_
+#define NNET_COMMON_H_
+
+#ifndef __INTELFPGA_COMPILER__
+#include "ac_fixed.h"
+#include "ac_int.h"
+#include "math.h"
+#else
+#include "HLS/ac_fixed.h"
+#include "HLS/ac_int.h"
+#include "HLS/math.h"
+#endif
+
+#include "nnet_helpers.h"
+
+typedef ac_fixed<16, 6> table_default_t;
+
+namespace nnet {
+
+// Common type definitions
+enum io_type { io_parallel = 0, io_stream };
+
+// Default data types (??) TODO: Deprecate
+typedef ac_fixed<16, 4> weight_t_def;
+typedef ac_fixed<16, 4> bias_t_def;
+typedef ac_fixed<32, 10> accum_t_def;
+
+template <class data_T, int NIN1, int NIN2> void merge(data_T data1[NIN1], data_T data2[NIN2], data_T res[NIN1 + NIN2]) {
+    #pragma unroll
+    for (int ii = 0; ii < NIN1; ii++) {
+        res[ii] = data1[ii];
+    }
+    #pragma unroll
+    for (int ii = 0; ii < NIN2; ii++) {
+        res[NIN1 + ii] = data2[ii];
+    }
+}
+
+/* ---
+ * Balanced tree reduce implementation.
+ * For use in scenarios where Quartus cannot apply expression balancing.
+ * Reduces an array of inputs to a single value using the template binary operator 'Op',
+ * for example summing all elements with Op_add, or finding the maximum with Op_max.
+ * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section
+ * before applying and accumulate the result over the rolled dimension.
+ * --- */
+template <class T, int N, class Op> T reduce(const T *x, Op op) {
+    static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0;
+    static constexpr int rightN = N - leftN > 0 ? N - leftN : 0;
+    if (N == 1) {
+        return x[0];
+    }
+    if (N == 2) {
+        return op(x[0], x[1]);
+    }
+    return op(reduce<T, leftN, Op>(x, op), reduce<T, rightN, Op>(x + leftN, op));
+}
+
+template <class T> class Op_add {
+  public:
+    T operator()(T a, T b) { return a + b; }
+};
+
+template <class T> class Op_max {
+  public:
+    T operator()(T a, T b) { return a >= b ? a : b; }
+};
+
+} // namespace nnet
+
+#endif
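
To make the recursion in reduce concrete: for N = 8 the split is leftN = 4 and rightN = 4, then 2 + 2, then pairs, so the resulting adder tree is log2(8) = 3 levels deep instead of a 7-long accumulation chain. A software sketch using the Op_add functor from this header:

    // Balanced-tree sum of 8 values (fully unrolled when used in HLS).
    #include "nnet_utils/nnet_common.h"

    float sum8(const float x[8]) {
        return nnet::reduce<float, 8, nnet::Op_add<float>>(x, nnet::Op_add<float>());
    }
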
diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h
index 8897e13150..579606519f 100644
--- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h
@@ -1,64 +1,64 @@
-#ifndef NNET_CONV1D_H_
-#define NNET_CONV1D_H_
-
-#include "nnet_common.h"
-#include "nnet_conv1d_resource.h"
-
-namespace nnet {
-
-struct conv1d_config {
-    // I/O sizes
-    static const unsigned in_width = 10;
-    static const unsigned out_width = 10;
-
-    // Number of channels, filters
-    static const unsigned n_chan = 1;
-    static const unsigned n_filt = 1;
-
-    // Original filter size
-    static const unsigned filt_width = 1;
-    static const unsigned kernel_size = filt_width;
-
-    // Modified filter size (post-Winograd transformation, if applied)
-    static const unsigned impl_filt_height = 1;
-    static const unsigned impl_filt_width = 1;
-
-    // Padding, stride, dilation
-    static const unsigned pad_left = 0;
-    static const unsigned pad_right = 0;
-    static const unsigned stride_width = 1;
-    static const unsigned dilation = 1;
-
-    // Run-time Configuration
-    static const unsigned n_zeros = 0;
-    static const unsigned reuse_factor = 1;
-    static const unsigned parallelisation_factor = 1;
-
-    // TODO: BRAM Storage on Quartus
-    static const bool store_weights_in_bram = false;
-
-    // Internal data type definitions
-    typedef float bias_t;
-    typedef float weight_t;
-    typedef float accum_t;
-};
-
-template <class data_T, class res_T, typename CONFIG_T>
-void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
-                const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
-                const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-    conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
-                          res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
-                          const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
-                          const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-    assert(CONFIG_T::filt_width == 1);
-    pointwise_conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-}
-
-} // namespace nnet
-
-#endif
+#ifndef NNET_CONV1D_H_
+#define NNET_CONV1D_H_
+
+#include "nnet_common.h"
+#include "nnet_conv1d_resource.h"
+
+namespace nnet {
+
+struct conv1d_config {
+    // I/O sizes
+    static const unsigned in_width = 10;
+    static const unsigned out_width = 10;
+
+    // Number of channels, filters
+    static const unsigned n_chan = 1;
+    static const unsigned n_filt = 1;
+
+    // Original filter size
+    static const unsigned filt_width = 1;
+    static const unsigned kernel_size = filt_width;
+
+    // Modified filter size (post-Winograd transformation, if applied)
+    static const unsigned impl_filt_height = 1;
+    static const
unsigned impl_filt_width = 1; + + // Padding, stride, dilation + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + + // Run-time Configuration + static const unsigned n_zeros = 0; + static const unsigned reuse_factor = 1; + static const unsigned parallelisation_factor = 1; + + // TODO: BRAM Storage on Quartus + static const bool store_weights_in_bram = false; + + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; +}; + +template +void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + conv_1d_resource_cl(data, res, weights, biases); +} + +template +void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + pointwise_conv_1d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h index aba0803989..d969403c3e 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h @@ -1,169 +1,169 @@ -#ifndef NNET_DENSE_LARGE_H_ -#define NNET_DENSE_LARGE_H_ - -#include "nnet_common.h" -#include "nnet_helpers.h" -#include "nnet_mult.h" - -namespace nnet { - -struct dense_config { - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; - - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_out = 10; - - static const unsigned reuse_factor = 1; - static const unsigned block_factor = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); - static const unsigned multiplier_limit = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor) - static const unsigned multiplier_factor = 1; // min n_in, rf - static const unsigned multiplier_scale = 1; // M_LIMIT/CONFIG_T::n_out; - static const unsigned reciprocal = 1; // 2^35 / 25 - static const unsigned rf_pad = 0; - static const unsigned bf_pad = 0; - // Resource reuse info - static const unsigned io_type = io_parallel; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; - // partitioning arrays cyclically to go with roll factors? 
- - // Default multiplication - template using product = nnet::product::mult; -}; - -template -void dense_rf_gt(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && - "The current Reuse Factor is not allowed"); - assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); - //#pragma ii CONFIG_T::reuse_factor - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; -Load: - #pragma unroll - for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; - hls_register int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; - - #pragma unroll - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - #pragma unroll - for (int im = 0; im < CONFIG_T::block_factor; im++) { - uint32 w_index = ir + CONFIG_T::reuse_factor * im; - out_index[ir][im] = (w_index / CONFIG_T::multiplier_factor).to_int(); - d_index[ir][im] = w_index % CONFIG_T::n_in; - } - } -Product1: - #pragma nofusion - #pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; - Product2: - #pragma unroll - for (int im = 0; im < CONFIG_T::block_factor; im++) { - uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; - if (w_index >= CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded) - continue; - int data_index = d_index[ir][im]; - // Modified this - tmp_acc[im] = - CONFIG_T::template product::product(data[data_index], weights[w_index]); - } - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; - ResetMult: - #pragma unroll - for (int imult = 0; imult < CONFIG_T::multiplier_limit; imult++) { - mult[imult] = 0; - } - AccumLoop1: - #pragma unroll - for (int im = 0; im < CONFIG_T::block_factor; im++) { - int o_index = out_index[ir][im]; - if (o_index >= CONFIG_T::n_out) - continue; // check out of bounds - mult[o_index] += tmp_acc[im]; - } - AccumLoop2: - #pragma unroll - for (int im = 0; im < CONFIG_T::multiplier_limit; im++) { - acc[im] += mult[im]; - } - } -Store: - #pragma unroll - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - res[ires] = cast(acc[ires]); // acc[jj]; - } -} -template -void dense_rf_lt(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && - "The current Reuse Factor is not allowed"); - assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN"); - - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; -InitAccum: - #pragma unroll - for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } -ReuseLoop: - #pragma nofusion - #pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; - MultLoop: - #pragma unroll - for (int im = 
0, in_index = ir; im < CONFIG_T::block_factor; im++) { - uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; - if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in * CONFIG_T::n_out) - continue; - // Modified this - mult[im] = - CONFIG_T::template product::product(data[in_index], weights[w_index]); - in_index += CONFIG_T::reuse_factor; - if (in_index >= CONFIG_T::n_in) - in_index = ir; - } - AccumLoop: - #pragma unroll - for (int im = 0, out_index = 0, acc_step = 0; im < CONFIG_T::block_factor; im++) { - acc[out_index] += mult[im]; - if (acc_step + 1 >= CONFIG_T::multiplier_scale) { - acc_step = 0; - out_index++; - } else { - acc_step++; - } - } - } -// Cast to "res_t" type -Result: - #pragma unroll - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - res[ires] = cast(acc[ires]); - } -} -template -void dense_resource( - data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { - dense_rf_lt(data, res, weights, biases); - } else { - dense_rf_gt(data, res, weights, biases); - } -} -} // namespace nnet -#endif +#ifndef NNET_DENSE_LARGE_H_ +#define NNET_DENSE_LARGE_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + static const unsigned reuse_factor = 1; + static const unsigned block_factor = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); + static const unsigned multiplier_limit = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor) + static const unsigned multiplier_factor = 1; // min n_in, rf + static const unsigned multiplier_scale = 1; // M_LIMIT/CONFIG_T::n_out; + static const unsigned reciprocal = 1; // 2^35 / 25 + static const unsigned rf_pad = 0; + static const unsigned bf_pad = 0; + // Resource reuse info + static const unsigned io_type = io_parallel; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? 
+ + // Default multiplication + template using product = nnet::product::mult; +}; + +template +void dense_rf_gt(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); + //#pragma ii CONFIG_T::reuse_factor + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +Load: + #pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + hls_register int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + + #pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32 w_index = ir + CONFIG_T::reuse_factor * im; + out_index[ir][im] = (w_index / CONFIG_T::multiplier_factor).to_int(); + d_index[ir][im] = w_index % CONFIG_T::n_in; + } + } +Product1: + #pragma nofusion + #pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; + Product2: + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (w_index >= CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded) + continue; + int data_index = d_index[ir][im]; + // Modified this + tmp_acc[im] = + CONFIG_T::template product::product(data[data_index], weights[w_index]); + } + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; + ResetMult: + #pragma unroll + for (int imult = 0; imult < CONFIG_T::multiplier_limit; imult++) { + mult[imult] = 0; + } + AccumLoop1: + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + int o_index = out_index[ir][im]; + if (o_index >= CONFIG_T::n_out) + continue; // check out of bounds + mult[o_index] += tmp_acc[im]; + } + AccumLoop2: + #pragma unroll + for (int im = 0; im < CONFIG_T::multiplier_limit; im++) { + acc[im] += mult[im]; + } + } +Store: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); // acc[jj]; + } +} +template +void dense_rf_lt(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN"); + + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +InitAccum: + #pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } +ReuseLoop: + #pragma nofusion + #pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; + MultLoop: + #pragma unroll + for (int im = 
0, in_index = ir; im < CONFIG_T::block_factor; im++) { + uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; + // Modified this + mult[im] = + CONFIG_T::template product::product(data[in_index], weights[w_index]); + in_index += CONFIG_T::reuse_factor; + if (in_index >= CONFIG_T::n_in) + in_index = ir; + } + AccumLoop: + #pragma unroll + for (int im = 0, out_index = 0, acc_step = 0; im < CONFIG_T::block_factor; im++) { + acc[out_index] += mult[im]; + if (acc_step + 1 >= CONFIG_T::multiplier_scale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } +// Cast to "res_t" type +Result: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); + } +} +template +void dense_resource( + data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_rf_lt(data, res, weights, biases); + } else { + dense_rf_gt(data, res, weights, biases); + } +} +} // namespace nnet +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h index 5619e299fb..ff261482ba 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h @@ -1,80 +1,80 @@ -#ifndef NNET_COMPRESSED_LAYER_H_ -#define NNET_COMPRESSED_LAYER_H_ - -#include "nnet_common.h" -#include "nnet_dense.h" - -namespace nnet { - -template -void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - const typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - -InitAccum: - #pragma unroll - for (int i = 0; i < CONFIG_T::n_out; i++) { - acc[i] = (typename CONFIG_T::accum_t)(biases[i]); - } - - hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; - hls_register data_T inputs[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; - - #pragma unroll - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - #pragma unroll - for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { - uint32 w = ir + CONFIG_T::reuse_factor * im; - inputs[ir][im] = data[weights[w].row_index]; - out_index[ir][im] = weights[w].col_index; - } - } -ReuseLoop: - #pragma nofusion - #pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; - CompressedMultLoop: - #pragma unroll - for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { - uint32 w = ir + CONFIG_T::reuse_factor * im; - // if (w >= CONFIG_T::reuse_factor*CONFIG_T::compressed_block_factor) continue; - typename CONFIG_T::accum_t prod = mult[im] = - CONFIG_T::template product::product(inputs[0][im], weights[w].weight); - #pragma unroll - for (int is = 0; is < CONFIG_T::reuse_factor - 1; is++) { - inputs[is][im] = inputs[is + 1][im]; - } - } - hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::n_out]; - ResetMult: - #pragma unroll - for (int tacc = 0; tacc < CONFIG_T::n_out; tacc++) { - tmp_acc[tacc] = 0; - } - AccumLoop1: - #pragma unroll - 
for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { - int col = out_index[ir][im]; - tmp_acc[col] += mult[im]; - } - AccumLoop2: - #pragma unroll - for (int im = 0; im < CONFIG_T::n_out; im++) { - acc[im] += tmp_acc[im]; - } - } - -// Cast to "res_t" type -ResultLoop: - #pragma unroll - for (unsigned i = 0; i < CONFIG_T::n_out; i++) { - res[i] = cast(acc[i]); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + +InitAccum: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_out; i++) { + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + hls_register data_T inputs[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + + #pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + uint32 w = ir + CONFIG_T::reuse_factor * im; + inputs[ir][im] = data[weights[w].row_index]; + out_index[ir][im] = weights[w].col_index; + } + } +ReuseLoop: + #pragma nofusion + #pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; + CompressedMultLoop: + #pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + uint32 w = ir + CONFIG_T::reuse_factor * im; + // if (w >= CONFIG_T::reuse_factor*CONFIG_T::compressed_block_factor) continue; + typename CONFIG_T::accum_t prod = mult[im] = + CONFIG_T::template product::product(inputs[0][im], weights[w].weight); + #pragma unroll + for (int is = 0; is < CONFIG_T::reuse_factor - 1; is++) { + inputs[is][im] = inputs[is + 1][im]; + } + } + hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::n_out]; + ResetMult: + #pragma unroll + for (int tacc = 0; tacc < CONFIG_T::n_out; tacc++) { + tmp_acc[tacc] = 0; + } + AccumLoop1: + #pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + int col = out_index[ir][im]; + tmp_acc[col] += mult[im]; + } + AccumLoop2: + #pragma unroll + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += tmp_acc[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h index 3bd78c7a84..775303e267 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h @@ -1,140 +1,140 @@ -#ifndef NNET_HELPERS_H -#define NNET_HELPERS_H - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace nnet { - -template void convert_data(srcType *src, dstType *dst) { - for (size_t i = 0; i < SIZE; i++) { - dst[i] = dstType(src[i]); - } -} - -template void convert_data_back(srcType *src, dstType *dst) { - for (size_t i = 0; i < SIZE; i++) { - dst[i] = static_cast(src[i].to_double()); - } -} - -template void 
convert_data(srcType *src, stream_in &dst) { - for (size_t i = 0; i < SIZE / dstType::size; i++) { - dstType ctype; - for (size_t j = 0; j < dstType::size; j++) { - ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); - } - dst.write(ctype); - } -} - -template void convert_data_back(stream_out &src, dstType *dst) { - for (size_t i = 0; i < SIZE / srcType::size; i++) { - srcType ctype = src.read(); - for (size_t j = 0; j < srcType::size; j++) { - dst[i * srcType::size + j] = dstType(ctype[j].to_double()); - } - } -} - -extern bool trace_enabled; -extern std::map *trace_outputs; -extern size_t trace_type_size; - -constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } - -constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } - -constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } - -template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { - for (int i = 0; i < layer_size; i++) { - ptr[i] = static_cast(data[i].to_double()); - } -} - -template void save_output_array(stream &data, save_T *ptr, size_t layer_size) { - for (size_t i = 0; i < layer_size / data_T::size; i++) { - data_T ctype = data.read(); - for (size_t j = 0; j < data_T::size; j++) { - ptr[i * data_T::size + j] = static_cast(ctype[j].to_double()); - } - data.write(ctype); - } -} - -// We don't want to include save_T in this function because it will be inserted into myproject.cpp -// so a workaround with element size is used -template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { - if (!trace_enabled) - return; - - if (trace_outputs) { - if (trace_outputs->count(layer_name) > 0) { - if (trace_type_size == 4) { - save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); - } else if (trace_type_size == 8) { - save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); - } else { - std::cout << "Unknown trace type!" << std::endl; - } - } else { - std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; - } - } else { - std::ostringstream filename; - filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data - std::fstream out; - out.open(filename.str(), std::ios::app); - assert(out.is_open()); - for (int i = 0; i < layer_size; i++) { - out << data[i] << " "; // We don't care about precision in text files - } - out << std::endl; - out.close(); - } -} - -template void save_layer_output(stream &data, const char *layer_name, size_t layer_size) { - if (!trace_enabled) - return; - - if (trace_outputs) { - if (trace_outputs->count(layer_name) > 0) { - if (trace_type_size == 4) { - save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); - } else if (trace_type_size == 8) { - save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); - } else { - std::cout << "Unknown trace type!" << std::endl; - } - } else { - std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
<< std::endl; - } - } else { - std::ostringstream filename; - filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data - std::fstream out; - out.open(filename.str(), std::ios::app); - assert(out.is_open()); - for (size_t i = 0; i < layer_size / data_T::size; i++) { - data_T ctype = data.read(); - for (size_t j = 0; j < data_T::size; j++) { - out << ctype[j] << " "; - } - data.write(ctype); - } - out << std::endl; - out.close(); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = dstType(src[i]); + } +} + +template void convert_data_back(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = static_cast(src[i].to_double()); + } +} + +template void convert_data(srcType *src, stream_in &dst) { + for (size_t i = 0; i < SIZE / dstType::size; i++) { + dstType ctype; + for (size_t j = 0; j < dstType::size; j++) { + ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); + } + dst.write(ctype); + } +} + +template void convert_data_back(stream_out &src, dstType *dst) { + for (size_t i = 0; i < SIZE / srcType::size; i++) { + srcType ctype = src.read(); + for (size_t j = 0; j < srcType::size; j++) { + dst[i * srcType::size + j] = dstType(ctype[j].to_double()); + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } + +constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } + +constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = static_cast(data[i].to_double()); + } +} + +template void save_output_array(stream &data, save_T *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = static_cast(ctype[j].to_double()); + } + data.write(ctype); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
<< std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << data[i] << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +template void save_layer_output(stream &data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + out << ctype[j] << " "; + } + data.write(ctype); + } + out << std::endl; + out.close(); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h index 766ef2e208..dc27de99ff 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h @@ -1,249 +1,249 @@ -#ifndef NNET_MERGE_H_ -#define NNET_MERGE_H_ - -#include "nnet_mult.h" - -namespace nnet { - -struct merge_config { - static const unsigned n_elem = 10; -}; - -struct dot_config { - static const unsigned n_in = 10; - static const unsigned n_out = 1; - - static const unsigned reuse_factor = 1; - - typedef float accum_t; - - template using product = nnet::product::mult; -}; - -struct concat_config { - static const unsigned n_elem1_0 = 10; - static const unsigned n_elem1_1 = 10; - static const unsigned n_elem1_2 = 10; - static const unsigned n_elem2_0 = 10; - static const unsigned n_elem2_1 = 10; - static const unsigned n_elem2_2 = 10; - - static const unsigned axis = -1; -}; - -template -void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = static_cast(data1[i] + data2[i]); - } -} - -template -void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = static_cast(data1[i] - data2[i]); - } -} - -template -void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = static_cast(data1[i] * data2[i]); - } -} - -template -void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = static_cast((data1[i] + data2[i]) / (res_T)2); - } -} - -template -void maximum(input1_T 
data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = (data1[i] > data2[i]) ? static_cast(data1[i]) : static_cast(data2[i]); - } -} - -template -void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = (data1[i] < data2[i]) ? static_cast(data1[i]) : static_cast(data2[i]); - } -} - -template -void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); - - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; -Product: - #pragma unroll multiplier_limit - for (int i = 0; i < CONFIG_T::n_in; i++) { - mult[i] = CONFIG_T::template product::product(data1[i], data2[i]); - } - - hls_register typename CONFIG_T::accum_t acc = 0; -Accum: - #pragma unroll - for (int i = 0; i < CONFIG_T::n_in; i++) { - acc += mult[i]; - } - - res[0] = static_cast(acc); -} - -template -void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], - res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - res[i] = static_cast(data1[i]); - } - - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { - res[CONFIG_T::n_elem1_0 + i] = static_cast(data2[i]); - } -} - -template -void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; i++) { - res[i] = static_cast(data1[i]); - } - - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; i++) { - res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + i] = static_cast(data2[i]); - } -} - -template -void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - #pragma unroll - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + j] = - static_cast(data1[i * CONFIG_T::n_elem1_1 + j]); - } - - #pragma unroll - for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { - res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + j] = - static_cast(data2[i * CONFIG_T::n_elem2_1 + j]); - } - } -} - -template -void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { - concatenate2d_1(data1, data2, res); - } else { - concatenate2d_0(data1, data2, res); - } -} - -template -void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - 
#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; i++) { - res[i] = static_cast(data1[i]); - } - - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; i++) { - res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + i] = static_cast(data2[i]); - } -} - -template -void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { - int res_idx = - i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; - int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; - res[res_idx] = static_cast(data1[data_idx]); - } - } - - for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_elem2_2; k++) { - int res_idx = i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + - (j + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + k; - int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; - res[res_idx] = static_cast(data2[data_idx]); - } - } - } -} - -template -void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - - #pragma unroll - for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { - int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + - j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k; - int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; - res[res_idx] = static_cast(data1[data_idx]); - } - - #pragma unroll - for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { - int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + - j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k + CONFIG_T::n_elem1_2; - int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; - res[res_idx] = static_cast(data2[data_idx]); - } - } - } -} - -template -void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { - concatenate3d_2(data1, data2, res); - } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { - concatenate3d_1(data1, data2, res); - } else { - concatenate3d_0(data1, data2, res); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_MERGE_H_ +#define NNET_MERGE_H_ + +#include "nnet_mult.h" + +namespace nnet { + +struct merge_config { + static const unsigned n_elem = 10; +}; + +struct 
dot_config { + static const unsigned n_in = 10; + static const unsigned n_out = 1; + + static const unsigned reuse_factor = 1; + + typedef float accum_t; + + template using product = nnet::product::mult; +}; + +struct concat_config { + static const unsigned n_elem1_0 = 10; + static const unsigned n_elem1_1 = 10; + static const unsigned n_elem1_2 = 10; + static const unsigned n_elem2_0 = 10; + static const unsigned n_elem2_1 = 10; + static const unsigned n_elem2_2 = 10; + + static const unsigned axis = -1; +}; + +template +void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] + data2[i]); + } +} + +template +void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] - data2[i]); + } +} + +template +void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] * data2[i]); + } +} + +template +void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] + data2[i]) / (res_T)2); + } +} + +template +void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = (data1[i] > data2[i]) ? static_cast(data1[i]) : static_cast(data2[i]); + } +} + +template +void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = (data1[i] < data2[i]) ? 
static_cast(data1[i]) : static_cast(data2[i]); + } +} + +template +void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; +Product: + #pragma unroll multiplier_limit + for (int i = 0; i < CONFIG_T::n_in; i++) { + mult[i] = CONFIG_T::template product::product(data1[i], data2[i]); + } + + hls_register typename CONFIG_T::accum_t acc = 0; +Accum: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_in; i++) { + acc += mult[i]; + } + + res[0] = static_cast(acc); +} + +template +void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], + res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + res[CONFIG_T::n_elem1_0 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + j] = + static_cast(data1[i * CONFIG_T::n_elem1_1 + j]); + } + + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + j] = + static_cast(data2[i * CONFIG_T::n_elem2_1 + j]); + } + } +} + +template +void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + i] = static_cast(data2[i]); + } +} + +template +void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * 
CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = + i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + res[res_idx] = static_cast(data1[data_idx]); + } + } + + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem2_2; k++) { + int res_idx = i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + + (j + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + k; + int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; + res[res_idx] = static_cast(data2[data_idx]); + } + } + } +} + +template +void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k; + int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + res[res_idx] = static_cast(data1[data_idx]); + } + + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k + CONFIG_T::n_elem1_2; + int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; + res[res_idx] = static_cast(data2[data_idx]); + } + } + } +} + +template +void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h index 5be7728323..6819684f2a 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h @@ -1,113 +1,113 @@ -#ifndef NNET_MULT_H_ -#define NNET_MULT_H_ - -#include "nnet_common.h" -#include "nnet_helpers.h" -#include - -namespace nnet { - -// Different methods to perform the product of input and weight, depending on their types. 
-namespace product { - -class Product { - public: - static void limit(unsigned multiplier_limit) {} -}; - -template class both_binary : public Product { - public: - inline static x_T product(x_T a, w_T w) { - // specialisation for 1-bit weights and incoming data - return a == w; - } -}; - -template class weight_binary : public Product { - public: - inline static auto product(x_T a, w_T w) -> decltype(-a) { - // Specialisation for 1-bit weights, arbitrary data - if (w == 0) - return -a; - else - return a; - } -}; - -template class data_binary : public Product { - public: - inline static auto product(x_T a, w_T w) -> decltype(-w) { - // Specialisation for 1-bit data, arbitrary weight - if (a == 0) - return -w; - else - return w; - } -}; - -template class weight_ternary : public Product { - public: - inline static auto product(x_T a, w_T w) -> decltype(-a) { - // Specialisation for 2-bit weights, arbitrary data - if (w == 0) - return 0; - else if (w == -1) - return -a; - else - return a; // if(w == 1) - } -}; - -template class mult : public Product { - public: - inline static auto product(x_T a, w_T w) -> decltype(a * w) { - // 'Normal' product - return a * w; - } - static void limit(unsigned multiplier_limit) { - // TODO: Implement for Quartus - // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation > Vivado-only, replace with Intel HLS - // pragma - } -}; - -template class weight_exponential : public Product { - public: - using r_T = ac_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width), true>; - inline static r_T product(x_T a, w_T w) { - // Shift product for exponential weights - // Shift by the exponent. Negative weights shift right - r_T y = static_cast(a) << w.weight; - - // Negate or not depending on weight sign - return w.sign == 1 ? y : static_cast(-y); - } -}; -} // namespace product - -// TO-DO: These may need extra variants if ac_int types are used in more places -template -inline typename std::enable_if>::value && - std::is_same>::value, - ac_int>::type -cast(typename CONFIG_T::accum_t x) { - return static_cast>(((x - CONFIG_T::n_in / 2) * 2).to_ac_int()); -} - -template -inline typename std::enable_if>::value && - !std::is_same>::value, - res_T>::type -cast(typename CONFIG_T::accum_t x) { - return static_cast(x); -} - -template -inline typename std::enable_if<(!std::is_same>::value), res_T>::type -cast(typename CONFIG_T::accum_t x) { - return static_cast(x); -} - -} // namespace nnet - -#endif +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include + +namespace nnet { + +// Different methods to perform the product of input and weight, depending on their types. 
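+// Illustrative sketch only (not emitted by hls4ml): a layer config selects one of
+// the specialisations below through a template alias, as dot_config in nnet_merge.h does:
+//
+//   struct my_dense_config {
+//       typedef ac_fixed<18, 8, true> accum_t;
+//       template <class x_T, class w_T>
+//       using product = nnet::product::mult<x_T, w_T>; // or weight_binary, weight_ternary, ...
+//   };
+//
+// Kernels then call CONFIG_T::template product<x_T, w_T>::product(a, w), so the
+// appropriate multiply (full product, sign flip, or shift) is resolved at compile time.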
+namespace product { + +class Product { + public: + static void limit(unsigned multiplier_limit) {} +}; + +template class both_binary : public Product { + public: + inline static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + return a == w; + } +}; + +template class weight_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + return a * w; + } + static void limit(unsigned multiplier_limit) { + // TODO: Implement for Quartus + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation > Vivado-only, replace with Intel HLS + // pragma + } +}; + +template class weight_exponential : public Product { + public: + using r_T = ac_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width), true>; + inline static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + + // Negate or not depending on weight sign + return w.sign == 1 ? 
y : static_cast(-y); + } +}; +} // namespace product + +// TO-DO: These may need extra variants if ac_int types are used in more places +template +inline typename std::enable_if>::value && + std::is_same>::value, + ac_int>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast>(((x - CONFIG_T::n_in / 2) * 2).to_ac_int()); +} + +template +inline typename std::enable_if>::value && + !std::is_same>::value, + res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h index a95f9ab003..498cebf520 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h @@ -1,99 +1,99 @@ -#ifndef NNET_PADDING_H_ -#define NNET_PADDING_H_ - -namespace nnet { - -struct padding1d_config { - static const unsigned in_width = 10; - static const unsigned out_width = 10; - static const unsigned n_chan = 10; - - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; -}; - -template -void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { - for (int i = 0; i < CONFIG_T::pad_left; i++) { - #pragma unroll - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = 0; - } - } - - for (int i = 0; i < CONFIG_T::in_width; i++) { - #pragma unroll - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = (res_T) * (data++); - } - } - - for (int i = 0; i < CONFIG_T::pad_right; i++) { - #pragma unroll - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = 0; - } - } -} - -struct padding2d_config { - static const unsigned in_height = 10; - static const unsigned in_width = 10; - - static const unsigned out_height = 10; - static const unsigned out_width = 10; - - static const unsigned n_chan = 10; - - static const unsigned pad_top = 0; - static const unsigned pad_bottom = 0; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; -}; - -template -void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], - res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { - for (int i = 0; i < CONFIG_T::pad_top; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } - - for (int i = 0; i < CONFIG_T::in_height; i++) { - for (int j = 0; j < CONFIG_T::pad_left; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - for (int j = 0; j < CONFIG_T::in_width; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = (res_T) * (data++); - } - } - for (int j = 0; j < CONFIG_T::pad_right; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } - - for (int i = 0; i < CONFIG_T::pad_bottom; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } -} - -} // namespace nnet - -#endif +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +namespace nnet { + +struct padding1d_config { + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned 
n_chan = 10; + + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + for (int i = 0; i < CONFIG_T::pad_left; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = (res_T) * (data++); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + static const unsigned n_chan = 10; + + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = (res_T) * (data++); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/myproject_test_parallel.cpp b/hls4ml/templates/quartus/myproject_test_parallel.cpp index 5e3dd96c12..4de819eb49 100644 --- a/hls4ml/templates/quartus/myproject_test_parallel.cpp +++ b/hls4ml/templates/quartus/myproject_test_parallel.cpp @@ -1,112 +1,112 @@ -#include -#include -#include -#include -#include -#include - -#include "firmware/myproject.h" -#include "firmware/parameters.h" - -// hls-fpga-machine-learning insert bram - -#define CHECKPOINT 5000 - -// This function is written to avoid stringstream, which is -// not supported in cosim 20.1, and because strtok -// requires a const_cast or allocation to use with std::strings. -// This function returns the next float (by argument) at position pos, -// updating pos. True is returned if conversion done, false if the string -// has ended, and std::invalid_argument exception if the sting was bad. 
-bool nextToken(const std::string &str, std::size_t &pos, float &val) { - while (pos < str.size() && std::isspace(static_cast(str[pos]))) { - pos++; - } - if (pos >= str.size()) { - return false; - } - std::size_t offset = 0; - val = std::stof(str.substr(pos), &offset); - pos += offset; - return true; -} - -int main(int argc, char **argv) { - // load input data from text file - std::ifstream fin("tb_data/tb_input_features.dat"); - // load predictions from text file - std::ifstream fpr("tb_data/tb_output_predictions.dat"); - - std::string RESULTS_LOG = "tb_data/results.log"; - std::ofstream fout(RESULTS_LOG); - - std::string iline; - std::string pline; - - std::vector inputs; - std::vector outputs; - - if (fin.is_open() && fpr.is_open()) { - std::vector> predictions; - unsigned int num_iterations = 0; - for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) { - if (num_iterations % CHECKPOINT == 0) { - std::cout << "Processing input " << num_iterations << std::endl; - } - - std::vector in; - std::vector pr; - float current; - - std::size_t pos = 0; - while (nextToken(iline, pos, current)) { - in.push_back(current); - } - - pos = 0; - while (nextToken(pline, pos, current)) { - pr.push_back(current); - } - - // hls-fpga-machine-learning insert data - predictions.push_back(std::move(pr)); - } - - // Do this separately to avoid vector reallocation - // hls-fpga-machine-learning insert top-level-function - - // hls-fpga-machine-learning insert run - - for (int j = 0; j < num_iterations; j++) { - // hls-fpga-machine-learning insert tb-output - if (j % CHECKPOINT == 0) { - std::cout << "Predictions" << std::endl; - // hls-fpga-machine-learning insert predictions - std::cout << "Quantized predictions" << std::endl; - // hls-fpga-machine-learning insert quantized - } - } - fin.close(); - fpr.close(); - } else { - const unsigned int num_iterations = 10; - std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations - << " invocations." << std::endl; - // hls-fpga-machine-learning insert zero - - // hls-fpga-machine-learning insert top-level-function - - // hls-fpga-machine-learning insert run - - for (int j = 0; j < num_iterations; j++) { - // hls-fpga-machine-learning insert output - - // hls-fpga-machine-learning insert tb-output - } - } - - fout.close(); - std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; - - return 0; -} +#include +#include +#include +#include +#include +#include + +#include "firmware/myproject.h" +#include "firmware/parameters.h" + +// hls-fpga-machine-learning insert bram + +#define CHECKPOINT 5000 + +// This function is written to avoid stringstream, which is +// not supported in cosim 20.1, and because strtok +// requires a const_cast or allocation to use with std::strings. +// This function returns the next float (by argument) at position pos, +// updating pos. True is returned if conversion done, false if the string +// has ended, and std::invalid_argument exception if the sting was bad. 
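+// Illustrative use, mirroring the read loops in main() below:
+//   std::size_t pos = 0;
+//   float v;
+//   while (nextToken("1.5 2 -3e-1", pos, v)) { /* ... */ } // yields 1.5, 2.0, -0.3 in turn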
+bool nextToken(const std::string &str, std::size_t &pos, float &val) { + while (pos < str.size() && std::isspace(static_cast(str[pos]))) { + pos++; + } + if (pos >= str.size()) { + return false; + } + std::size_t offset = 0; + val = std::stof(str.substr(pos), &offset); + pos += offset; + return true; +} + +int main(int argc, char **argv) { + // load input data from text file + std::ifstream fin("tb_data/tb_input_features.dat"); + // load predictions from text file + std::ifstream fpr("tb_data/tb_output_predictions.dat"); + + std::string RESULTS_LOG = "tb_data/results.log"; + std::ofstream fout(RESULTS_LOG); + + std::string iline; + std::string pline; + + std::vector inputs; + std::vector outputs; + + if (fin.is_open() && fpr.is_open()) { + std::vector> predictions; + unsigned int num_iterations = 0; + for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) { + if (num_iterations % CHECKPOINT == 0) { + std::cout << "Processing input " << num_iterations << std::endl; + } + + std::vector in; + std::vector pr; + float current; + + std::size_t pos = 0; + while (nextToken(iline, pos, current)) { + in.push_back(current); + } + + pos = 0; + while (nextToken(pline, pos, current)) { + pr.push_back(current); + } + + // hls-fpga-machine-learning insert data + predictions.push_back(std::move(pr)); + } + + // Do this separately to avoid vector reallocation + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert run + + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert tb-output + if (j % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + } + } + fin.close(); + fpr.close(); + } else { + const unsigned int num_iterations = 10; + std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations + << " invocations." << std::endl; + // hls-fpga-machine-learning insert zero + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert run + + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output + } + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} diff --git a/hls4ml/templates/vivado/#vivado_synth.tcl# b/hls4ml/templates/vivado/#vivado_synth.tcl# index 96bd21c672..fba1387c5a 100644 --- a/hls4ml/templates/vivado/#vivado_synth.tcl# +++ b/hls4ml/templates/vivado/#vivado_synth.tcl# @@ -1,6 +1,6 @@ -set tcldir [file dirname [info script]] -source [file join $tcldir project.tcl] - -add_files ${project_name}_prj/solution1/syn/verilog -synth_design -top ${project_name} -part $part -report_utilization -file vivado_synth.rpt \ No newline at end of file +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +add_files ${project_name}_prj/solution1/syn/verilog +synth_design -top ${project_name} -part $part +report_utilization -file vivado_synth.rpt diff --git a/hls4ml/templates/vivado/ap_types/ap_common.h b/hls4ml/templates/vivado/ap_types/ap_common.h index 4d2886cbde..02575e87c1 100644 --- a/hls4ml/templates/vivado/ap_types/ap_common.h +++ b/hls4ml/templates/vivado/ap_types/ap_common.h @@ -1,376 +1,376 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_COMMON_H__ -#define __AP_COMMON_H__ - -// ---------------------------------------------------------------------- - -// Forward declaration of all AP types. -#include - - -#ifdef __SYNTHESIS__ -#error "The open-source version of AP types does not support synthesis." -#endif // ifdef __SYNTHESIS__ -#define _AP_ENABLE_HALF_ 0 - - -#if _AP_ENABLE_HALF_ == 1 -// Before ap_private definition. -#ifdef __SYNTHESIS__ -#define _HLS_HALF_DEFINED_ -typedef __fp16 half; -#else -class half; -#endif // __SYNTHESIS__ -#endif // _AP_ENABLE_HALF_ - -// ---------------------------------------------------------------------- - -// Macro functions -#define AP_MAX(a, b) ((a) > (b) ? (a) : (b)) -#define AP_MIN(a, b) ((a) < (b) ? (a) : (b)) -#define AP_ABS(a) ((a) >= 0 ? (a) : -(a)) - -#ifndef AP_ASSERT -#ifndef __SYNTHESIS__ -#include -#define AP_ASSERT(cond, msg) assert((cond) && (msg)) -#else -#define AP_ASSERT(cond, msg) -#endif // ifndef __SYNTHESIS__ -#endif // ifndef AP_ASSERT - -#ifndef __SYNTHESIS__ -// for fprintf messages. -#include -// for exit on error. -#include -#endif - -// same disable condition as assert. -#if !defined(__SYNTHESIS__) && !defined(NDEBUG) - -#define _AP_DEBUG(cond, ...) \ - do { \ - if ((cond)) { \ - fprintf(stderr, "DEBUG: " __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - } \ - } while (0) -#define _AP_WARNING(cond, ...) \ - do { \ - if ((cond)) { \ - fprintf(stderr, "WARNING: " __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - } \ - } while (0) -#define _AP_ERROR(cond, ...) \ - do { \ - if ((cond)) { \ - fprintf(stderr, "ERROR: " __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - abort(); \ - } \ - } while (0) - -#else // if !defined(__SYNTHESIS__) && !defined(NDEBUG) - -#define __AP_VOID_CAST static_cast -#define _AP_DEBUG(cond, ...) (__AP_VOID_CAST(0)) -#define _AP_WARNING(cond, ...) (__AP_VOID_CAST(0)) -#define _AP_ERROR(cond, ...) (__AP_VOID_CAST(0)) - -#endif // if !defined(__SYNTHESIS__) && !defined(NDEBUG) else - -// ---------------------------------------------------------------------- - -// Attribute only for synthesis -#ifdef __SYNTHESIS__ -#define INLINE inline __attribute__((always_inline)) -//#define INLINE inline __attribute__((noinline)) -#else -#define INLINE inline -#endif - -#define AP_WEAK -// __attribute__((weak)) - -#ifndef AP_INT_MAX_W -#define AP_INT_MAX_W 1024 -#endif - -#define BIT_WIDTH_UPPER_LIMIT (1 << 15) -#if AP_INT_MAX_W > BIT_WIDTH_UPPER_LIMIT -#error "Bitwidth exceeds 32768 (1 << 15), the maximum allowed value" -#endif - -#define MAX_MODE(BITS) ((BITS + 1023) / 1024) - -// ---------------------------------------------------------------------- - -// XXX apcc cannot handle global std::ios_base::Init() brought in by -#ifndef AP_AUTOCC -#ifndef __SYNTHESIS__ -// for overload operator<< -#include -#endif -#endif // ifndef AP_AUTOCC - -#ifndef __SYNTHESIS__ -// for string format. -#include -// for string. -#include -#endif - -// for detecting if char is signed. 
-enum { CHAR_IS_SIGNED = (char)-1 < 0 }; - -// TODO we have similar traits in x_hls_utils.h, should consider unify. -namespace _ap_type { -template -struct is_signed { - static const bool value = _Tp(-1) < _Tp(1); -}; - -template -struct is_integral { - static const bool value = false; -}; -#define DEF_IS_INTEGRAL(CTYPE) \ - template <> \ - struct is_integral { \ - static const bool value = true; \ - }; -DEF_IS_INTEGRAL(bool) -DEF_IS_INTEGRAL(char) -DEF_IS_INTEGRAL(signed char) -DEF_IS_INTEGRAL(unsigned char) -DEF_IS_INTEGRAL(short) -DEF_IS_INTEGRAL(unsigned short) -DEF_IS_INTEGRAL(int) -DEF_IS_INTEGRAL(unsigned int) -DEF_IS_INTEGRAL(long) -DEF_IS_INTEGRAL(unsigned long) -DEF_IS_INTEGRAL(ap_slong) -DEF_IS_INTEGRAL(ap_ulong) -#undef DEF_IS_INTEGRAL - -template -struct enable_if {}; -// partial specialization for true -template -struct enable_if { - typedef _Tp type; -}; - -template -struct remove_const { - typedef _Tp type; -}; - -template -struct remove_const<_Tp const> { - typedef _Tp type; -}; -} // namespace _ap_type - -// ---------------------------------------------------------------------- - -// Define ssdm_int and _ssdm_op. -// XXX deleted in open-source version - -#ifndef NON_C99STRING -#define _AP_C99 true -#else -#define _AP_C99 false -#endif - -static inline unsigned char guess_radix(const char* s) { - unsigned char rd = 10; ///< default radix - const char* p = s; - // skip neg sign if it exists - if (p[0] == '-' || p[0] == '+') ++p; - // guess based on following two bits. - if (p[0] == '0') { - if (p[1] == 'b' || p[1] == 'B') { - rd = 2; - } else if (p[1] == 'o' || p[1] == 'O') { - rd = 8; - } else if (p[1] == 'x' || p[1] == 'X') { - rd = 16; - } else if (p[1] == 'd' || p[1] == 'D') { - rd = 10; - } - } - return rd; -} - -// ---------------------------------------------------------------------- - -// Basic integral struct upon which ap_int and ap_fixed are defined. -#ifdef __SYNTHESIS__ -// Use ssdm_int, a compiler dependent, attribute constrained integeral type as -// basic data type. -#define _AP_ROOT_TYPE ssdm_int -// Basic ops. -#define _AP_ROOT_op_concat(Ret, X, Y) _ssdm_op_concat(Ret, X, Y) -#define _AP_ROOT_op_get_bit(Val, Bit) _ssdm_op_get_bit(Val, Bit) -#define _AP_ROOT_op_set_bit(Val, Bit, Repl) _ssdm_op_set_bit(Val, Bit, Repl) -#define _AP_ROOT_op_get_range(Val, Lo, Hi) _ssdm_op_get_range(Val, Lo, Hi) -#define _AP_ROOT_op_set_range(Val, Lo, Hi, Repl) \ - _ssdm_op_set_range(Val, Lo, Hi, Repl) -#define _AP_ROOT_op_reduce(Op, Val) _ssdm_op_reduce(Op, Val) -#else // ifdef __SYNTHESIS__ -// Use ap_private for compiler-independent basic data type -template -class ap_private; -/// model ssdm_int in standard C++ for simulation. -template -struct ssdm_int_sim { - /// integral type with template-specified width and signedness. - ap_private<_AP_W, _AP_S> V; - ssdm_int_sim() {} -}; -#define _AP_ROOT_TYPE ssdm_int_sim -// private's ref uses _AP_ROOT_TYPE. -#include -// XXX The C-sim model cannot use GCC-extension -// Basic ops. Ret and Val are ap_private. -template -inline _Tp1 _AP_ROOT_op_concat(const _Tp1& Ret, const _Tp2& X, const _Tp3& Y) { - _Tp1 r = (X).operator,(Y); - return r; -} -#define _AP_ROOT_op_get_bit(Val, Bit) (Val).get_bit((Bit)) -template -inline _Tp1& _AP_ROOT_op_set_bit(_Tp1& Val, const _Tp2& Bit, const _Tp3& Repl) { - (Val).set_bit((Bit), (Repl)); - return Val; -} -// notice the order of high and low index is different in ssdm call and -// ap_private.range()... 
-#define _AP_ROOT_op_get_range(Val, Lo, Hi) (Val).range((Hi), (Lo)) -template -inline _Tp1& _AP_ROOT_op_set_range(_Tp1& Val, const _Tp2& Lo, const _Tp3& Hi, - const _Tp4& Repl) { - (Val).range((Hi), (Lo)) = Repl; - return (Val); -} -#define _AP_ROOT_op_and_reduce(Val) (Val).and_reduce() -#define _AP_ROOT_op_nand_reduce(Val) (Val).nand_reduce() -#define _AP_ROOT_op_or_reduce(Val) (Val).or_reduce() -#define _AP_ROOT_op_xor_reduce(Val) (Val).xor_reduce() -// ## is the concatenation in preprocessor: -#define _AP_ROOT_op_reduce(Op, Val) _AP_ROOT_op_##Op##_reduce(Val) -#endif // ifdef __SYNTHESIS__ else - -// ---------------------------------------------------------------------- - -// Constants for half, single, double pricision floating points -#define HALF_MAN 10 -#define FLOAT_MAN 23 -#define DOUBLE_MAN 52 - -#define HALF_EXP 5 -#define FLOAT_EXP 8 -#define DOUBLE_EXP 11 - -#define BIAS(e) ((1L << (e - 1L)) - 1L) -#define HALF_BIAS BIAS(HALF_EXP) -#define FLOAT_BIAS BIAS(FLOAT_EXP) -#define DOUBLE_BIAS BIAS(DOUBLE_EXP) - -#define APFX_IEEE_DOUBLE_E_MAX DOUBLE_BIAS -#define APFX_IEEE_DOUBLE_E_MIN (-DOUBLE_BIAS + 1) - -INLINE ap_ulong doubleToRawBits(double pf) { - union { - ap_ulong __L; - double __D; - } LD; - LD.__D = pf; - return LD.__L; -} - -INLINE unsigned int floatToRawBits(float pf) { - union { - unsigned int __L; - float __D; - } LD; - LD.__D = pf; - return LD.__L; -} - -#if _AP_ENABLE_HALF_ == 1 -INLINE unsigned short halfToRawBits(half pf) { -#ifdef __SYNTHESIS__ - union { - unsigned short __L; - half __D; - } LD; - LD.__D = pf; - return LD.__L; -#else - return pf.get_bits(); -#endif -} -#endif - -// usigned long long is at least 64-bit -INLINE double rawBitsToDouble(ap_ulong pi) { - union { - ap_ulong __L; - double __D; - } LD; - LD.__L = pi; - return LD.__D; -} - -// long is at least 32-bit -INLINE float rawBitsToFloat(unsigned long pi) { - union { - unsigned int __L; - float __D; - } LD; - LD.__L = pi; - return LD.__D; -} - -#if _AP_ENABLE_HALF_ == 1 -// short is at least 16-bit -INLINE half rawBitsToHalf(unsigned short pi) { -#ifdef __SYNTHESIS__ - union { - unsigned short __L; - half __D; - } LD; - LD.__L = pi; - return LD.__D; -#else - // sim model of half has a non-trivial constructor - half __D; - __D.set_bits(pi); - return __D; -#endif -} -#endif - -#endif // ifndef __AP_COMMON_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_COMMON_H__ +#define __AP_COMMON_H__ + +// ---------------------------------------------------------------------- + +// Forward declaration of all AP types. +#include + + +#ifdef __SYNTHESIS__ +#error "The open-source version of AP types does not support synthesis." +#endif // ifdef __SYNTHESIS__ +#define _AP_ENABLE_HALF_ 0 + + +#if _AP_ENABLE_HALF_ == 1 +// Before ap_private definition. 
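+// (Under __SYNTHESIS__ half maps onto the native __fp16; in simulation only a
+// forward declaration is made here and the full class is expected from a separate header.)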
+#ifdef __SYNTHESIS__ +#define _HLS_HALF_DEFINED_ +typedef __fp16 half; +#else +class half; +#endif // __SYNTHESIS__ +#endif // _AP_ENABLE_HALF_ + +// ---------------------------------------------------------------------- + +// Macro functions +#define AP_MAX(a, b) ((a) > (b) ? (a) : (b)) +#define AP_MIN(a, b) ((a) < (b) ? (a) : (b)) +#define AP_ABS(a) ((a) >= 0 ? (a) : -(a)) + +#ifndef AP_ASSERT +#ifndef __SYNTHESIS__ +#include +#define AP_ASSERT(cond, msg) assert((cond) && (msg)) +#else +#define AP_ASSERT(cond, msg) +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_ASSERT + +#ifndef __SYNTHESIS__ +// for fprintf messages. +#include +// for exit on error. +#include +#endif + +// same disable condition as assert. +#if !defined(__SYNTHESIS__) && !defined(NDEBUG) + +#define _AP_DEBUG(cond, ...) \ + do { \ + if ((cond)) { \ + fprintf(stderr, "DEBUG: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ + } while (0) +#define _AP_WARNING(cond, ...) \ + do { \ + if ((cond)) { \ + fprintf(stderr, "WARNING: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ + } while (0) +#define _AP_ERROR(cond, ...) \ + do { \ + if ((cond)) { \ + fprintf(stderr, "ERROR: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + abort(); \ + } \ + } while (0) + +#else // if !defined(__SYNTHESIS__) && !defined(NDEBUG) + +#define __AP_VOID_CAST static_cast +#define _AP_DEBUG(cond, ...) (__AP_VOID_CAST(0)) +#define _AP_WARNING(cond, ...) (__AP_VOID_CAST(0)) +#define _AP_ERROR(cond, ...) (__AP_VOID_CAST(0)) + +#endif // if !defined(__SYNTHESIS__) && !defined(NDEBUG) else + +// ---------------------------------------------------------------------- + +// Attribute only for synthesis +#ifdef __SYNTHESIS__ +#define INLINE inline __attribute__((always_inline)) +//#define INLINE inline __attribute__((noinline)) +#else +#define INLINE inline +#endif + +#define AP_WEAK +// __attribute__((weak)) + +#ifndef AP_INT_MAX_W +#define AP_INT_MAX_W 1024 +#endif + +#define BIT_WIDTH_UPPER_LIMIT (1 << 15) +#if AP_INT_MAX_W > BIT_WIDTH_UPPER_LIMIT +#error "Bitwidth exceeds 32768 (1 << 15), the maximum allowed value" +#endif + +#define MAX_MODE(BITS) ((BITS + 1023) / 1024) + +// ---------------------------------------------------------------------- + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +// for overload operator<< +#include +#endif +#endif // ifndef AP_AUTOCC + +#ifndef __SYNTHESIS__ +// for string format. +#include +// for string. +#include +#endif + +// for detecting if char is signed. +enum { CHAR_IS_SIGNED = (char)-1 < 0 }; + +// TODO we have similar traits in x_hls_utils.h, should consider unify. 
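+// Illustrative only: these minimal traits stand in for std::is_signed,
+// std::is_integral and std::enable_if where <type_traits> cannot be assumed, e.g.
+//   _ap_type::is_signed<char>::value        // consistent with CHAR_IS_SIGNED above
+//   _ap_type::is_integral<ap_ulong>::value  // true, via DEF_IS_INTEGRAL below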
+namespace _ap_type { +template +struct is_signed { + static const bool value = _Tp(-1) < _Tp(1); +}; + +template +struct is_integral { + static const bool value = false; +}; +#define DEF_IS_INTEGRAL(CTYPE) \ + template <> \ + struct is_integral { \ + static const bool value = true; \ + }; +DEF_IS_INTEGRAL(bool) +DEF_IS_INTEGRAL(char) +DEF_IS_INTEGRAL(signed char) +DEF_IS_INTEGRAL(unsigned char) +DEF_IS_INTEGRAL(short) +DEF_IS_INTEGRAL(unsigned short) +DEF_IS_INTEGRAL(int) +DEF_IS_INTEGRAL(unsigned int) +DEF_IS_INTEGRAL(long) +DEF_IS_INTEGRAL(unsigned long) +DEF_IS_INTEGRAL(ap_slong) +DEF_IS_INTEGRAL(ap_ulong) +#undef DEF_IS_INTEGRAL + +template +struct enable_if {}; +// partial specialization for true +template +struct enable_if { + typedef _Tp type; +}; + +template +struct remove_const { + typedef _Tp type; +}; + +template +struct remove_const<_Tp const> { + typedef _Tp type; +}; +} // namespace _ap_type + +// ---------------------------------------------------------------------- + +// Define ssdm_int and _ssdm_op. +// XXX deleted in open-source version + +#ifndef NON_C99STRING +#define _AP_C99 true +#else +#define _AP_C99 false +#endif + +static inline unsigned char guess_radix(const char* s) { + unsigned char rd = 10; ///< default radix + const char* p = s; + // skip neg sign if it exists + if (p[0] == '-' || p[0] == '+') ++p; + // guess based on following two bits. + if (p[0] == '0') { + if (p[1] == 'b' || p[1] == 'B') { + rd = 2; + } else if (p[1] == 'o' || p[1] == 'O') { + rd = 8; + } else if (p[1] == 'x' || p[1] == 'X') { + rd = 16; + } else if (p[1] == 'd' || p[1] == 'D') { + rd = 10; + } + } + return rd; +} + +// ---------------------------------------------------------------------- + +// Basic integral struct upon which ap_int and ap_fixed are defined. +#ifdef __SYNTHESIS__ +// Use ssdm_int, a compiler dependent, attribute constrained integeral type as +// basic data type. +#define _AP_ROOT_TYPE ssdm_int +// Basic ops. +#define _AP_ROOT_op_concat(Ret, X, Y) _ssdm_op_concat(Ret, X, Y) +#define _AP_ROOT_op_get_bit(Val, Bit) _ssdm_op_get_bit(Val, Bit) +#define _AP_ROOT_op_set_bit(Val, Bit, Repl) _ssdm_op_set_bit(Val, Bit, Repl) +#define _AP_ROOT_op_get_range(Val, Lo, Hi) _ssdm_op_get_range(Val, Lo, Hi) +#define _AP_ROOT_op_set_range(Val, Lo, Hi, Repl) \ + _ssdm_op_set_range(Val, Lo, Hi, Repl) +#define _AP_ROOT_op_reduce(Op, Val) _ssdm_op_reduce(Op, Val) +#else // ifdef __SYNTHESIS__ +// Use ap_private for compiler-independent basic data type +template +class ap_private; +/// model ssdm_int in standard C++ for simulation. +template +struct ssdm_int_sim { + /// integral type with template-specified width and signedness. + ap_private<_AP_W, _AP_S> V; + ssdm_int_sim() {} +}; +#define _AP_ROOT_TYPE ssdm_int_sim +// private's ref uses _AP_ROOT_TYPE. +#include +// XXX The C-sim model cannot use GCC-extension +// Basic ops. Ret and Val are ap_private. +template +inline _Tp1 _AP_ROOT_op_concat(const _Tp1& Ret, const _Tp2& X, const _Tp3& Y) { + _Tp1 r = (X).operator,(Y); + return r; +} +#define _AP_ROOT_op_get_bit(Val, Bit) (Val).get_bit((Bit)) +template +inline _Tp1& _AP_ROOT_op_set_bit(_Tp1& Val, const _Tp2& Bit, const _Tp3& Repl) { + (Val).set_bit((Bit), (Repl)); + return Val; +} +// notice the order of high and low index is different in ssdm call and +// ap_private.range()... 
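+// e.g. _AP_ROOT_op_get_range(v, 0, 7) expands to v.range(7, 0): the macro takes
+// (low, high) while ap_private::range() takes (high, low).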
+#define _AP_ROOT_op_get_range(Val, Lo, Hi) (Val).range((Hi), (Lo)) +template +inline _Tp1& _AP_ROOT_op_set_range(_Tp1& Val, const _Tp2& Lo, const _Tp3& Hi, + const _Tp4& Repl) { + (Val).range((Hi), (Lo)) = Repl; + return (Val); +} +#define _AP_ROOT_op_and_reduce(Val) (Val).and_reduce() +#define _AP_ROOT_op_nand_reduce(Val) (Val).nand_reduce() +#define _AP_ROOT_op_or_reduce(Val) (Val).or_reduce() +#define _AP_ROOT_op_xor_reduce(Val) (Val).xor_reduce() +// ## is the concatenation in preprocessor: +#define _AP_ROOT_op_reduce(Op, Val) _AP_ROOT_op_##Op##_reduce(Val) +#endif // ifdef __SYNTHESIS__ else + +// ---------------------------------------------------------------------- + +// Constants for half, single, double pricision floating points +#define HALF_MAN 10 +#define FLOAT_MAN 23 +#define DOUBLE_MAN 52 + +#define HALF_EXP 5 +#define FLOAT_EXP 8 +#define DOUBLE_EXP 11 + +#define BIAS(e) ((1L << (e - 1L)) - 1L) +#define HALF_BIAS BIAS(HALF_EXP) +#define FLOAT_BIAS BIAS(FLOAT_EXP) +#define DOUBLE_BIAS BIAS(DOUBLE_EXP) + +#define APFX_IEEE_DOUBLE_E_MAX DOUBLE_BIAS +#define APFX_IEEE_DOUBLE_E_MIN (-DOUBLE_BIAS + 1) + +INLINE ap_ulong doubleToRawBits(double pf) { + union { + ap_ulong __L; + double __D; + } LD; + LD.__D = pf; + return LD.__L; +} + +INLINE unsigned int floatToRawBits(float pf) { + union { + unsigned int __L; + float __D; + } LD; + LD.__D = pf; + return LD.__L; +} + +#if _AP_ENABLE_HALF_ == 1 +INLINE unsigned short halfToRawBits(half pf) { +#ifdef __SYNTHESIS__ + union { + unsigned short __L; + half __D; + } LD; + LD.__D = pf; + return LD.__L; +#else + return pf.get_bits(); +#endif +} +#endif + +// usigned long long is at least 64-bit +INLINE double rawBitsToDouble(ap_ulong pi) { + union { + ap_ulong __L; + double __D; + } LD; + LD.__L = pi; + return LD.__D; +} + +// long is at least 32-bit +INLINE float rawBitsToFloat(unsigned long pi) { + union { + unsigned int __L; + float __D; + } LD; + LD.__L = pi; + return LD.__D; +} + +#if _AP_ENABLE_HALF_ == 1 +// short is at least 16-bit +INLINE half rawBitsToHalf(unsigned short pi) { +#ifdef __SYNTHESIS__ + union { + unsigned short __L; + half __D; + } LD; + LD.__L = pi; + return LD.__D; +#else + // sim model of half has a non-trivial constructor + half __D; + __D.set_bits(pi); + return __D; +#endif +} +#endif + +#endif // ifndef __AP_COMMON_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_decl.h b/hls4ml/templates/vivado/ap_types/ap_decl.h index ddd00f1c7f..ddb8dd4a76 100644 --- a/hls4ml/templates/vivado/ap_types/ap_decl.h +++ b/hls4ml/templates/vivado/ap_types/ap_decl.h @@ -1,212 +1,212 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_DECL_H__ -#define __AP_DECL_H__ - -// ---------------------------------------------------------------------- - -#if !defined(__AP_FIXED_H__) && !defined(__AP_INT_H__) && !defined(__AUTOPILOT_CBE_H__) && !defined(__HLS_HALF_H__) -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." 
-#endif - -// Test __SYNTHESIS__ only for mode -#if !defined(__SYNTHESIS__) && (defined(AESL_SYN) || defined(__HLS_SYN__)) -//#pragma message "AESL_SYN and __HLS_SYN__ should be replaced by __SYNTHESIS__" -#define __SYNTHESIS__ -#endif - -/* for safety*/ -#if (defined(_AP_N) || defined(_AP_C)) -#error One or more of the following is defined: _AP_N, _AP_C. Definition conflicts with their usage as template parameters. -#endif - -/* for safety*/ -#if (defined(_AP_W) || defined(_AP_I) || defined(_AP_S) || defined(_AP_Q) || \ - defined(_AP_O) || defined(_AP_W2) || defined(_AP_I2) || \ - defined(_AP_S2) || defined(_AP_Q2) || defined(_AP_O2) || \ - defined(_AP_N) || defined(_AP_N2)) -#error \ - "One or more of the following is defined: _AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N, _AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2. Definition conflicts with their usage as template parameters." -#endif - -/*for safety*/ -#if (defined(_AP_W3) || defined(_AP_S3) || defined(_AP_W4) || defined(_AP_S4)) -#error \ - "One or more of the following is defined: _AP_W3, _AP_S3, _AP_W4,_AP_S4. Definition conflicts with their usage as template parameters." -#endif - -#if (defined(_AP_W1) || defined(_AP_S1) || defined(_AP_T) || \ - defined(_AP_T1) || defined(_AP_T2) || defined(_AP_T3) || defined(_AP_T4)) -#error \ - "One or more of the following is defined: _AP_W1, _AP_S1, _AP_T, _AP_T1, _AP_T2, _AP_T3, _AP_T4. Definition conflicts with their usage as template parameters." -#endif - -#ifndef __cplusplus -#error "AP data type can only be used in C++" -#endif - -// ---------------------------------------------------------------------- - -#ifndef __SC_COMPATIBLE__ -/// ap_fixed quantification mode -enum ap_q_mode { - AP_RND, //< rounding to plus infinity - AP_RND_ZERO, //< rounding to zero - AP_RND_MIN_INF, //< rounding to minus infinity - AP_RND_INF, //< rounding to infinity - AP_RND_CONV, //< convergent rounding - AP_TRN, //< truncation - AP_TRN_ZERO, //< truncation to zero -}; - -// FIXME for legacy code -#ifndef SYSTEMC_INCLUDED -#define SC_RND AP_RND -#define SC_RND_ZERO AP_RND_ZERO -#define SC_RND_MIN_INF AP_RND_MIN_INF -#define SC_RND_INF AP_RND_INF -#define SC_RND_CONV AP_RND_CONV -#define SC_TRN AP_TRN -#define SC_TRN_ZERO AP_TRN_ZERO -#endif // !defined(SYSTEMC_INCLUDED) - -/// ap_fixed saturation mode -enum ap_o_mode { - AP_SAT, //< saturation - AP_SAT_ZERO, //< saturation to zero - AP_SAT_SYM, //< symmetrical saturation - AP_WRAP, //< wrap-around (*) - AP_WRAP_SM, //< sign magnitude wrap-around (*) -}; - -// FIXME for legacy code -#ifndef SYSTEMC_INCLUDED -#define SC_SAT AP_SAT -#define SC_SAT_ZERO AP_SAT_ZERO -#define SC_SAT_SYM AP_SAT_SYM -#define SC_WRAP AP_WRAP -#define SC_WRAP_SM AP_WRAP_SM -#endif // !defined(SYSTEMC_INCLUDED) - -#else // defined(__SC_COMPATIBLE__) - -// There will not be sc_fxdefs.h, and the emu should be defined by ap_fixed. 
- -/// ap_fixed quantification mode -enum ap_q_mode { - SC_RND, //< rounding to plus infinity - SC_RND_ZERO, //< rounding to zero - SC_RND_MIN_INF, //< rounding to minus infinity - SC_RND_INF, //< rounding to infinity - SC_RND_CONV, //< convergent rounding - SC_TRN, //< truncation - SC_TRN_ZERO, //< truncation to zero -}; - -#define AP_RND SC_RND -#define AP_RND_ZERO SC_RND_ZERO -#define AP_RND_MIN_INF SC_RND_MIN_INF -#define AP_RND_INF SC_RND_INF -#define AP_RND_CONV SC_RND_CONV -#define AP_TRN SC_TRN -#define AP_TRN_ZERO SC_TRN_ZERO - -/// ap_fixed saturation mode -enum ap_o_mode { - SC_SAT, //< saturation - SC_SAT_ZERO, //< saturation to zero - SC_SAT_SYM, //< symmetrical saturation - SC_WRAP, //< wrap-around (*) - SC_WRAP_SM, //< sign magnitude wrap-around (*) -}; - -#define AP_SAT SC_SAT -#define AP_SAT_ZERO SC_SAT_ZERO -#define AP_SAT_SYM SC_SAT_SYM -#define AP_WRAP SC_WRAP -#define AP_WRAP_SM SC_WRAP_SM - -#endif // defined(__SC_COMPATIBLE__) - -template -struct ap_int_base; - -template -struct ap_int; - -template -struct ap_uint; - -template -struct ap_range_ref; - -template -struct ap_bit_ref; - -template -struct ap_concat_ref; - -template -struct ap_fixed_base; - -template -struct ap_fixed; - -template -struct ap_ufixed; - -template -struct af_range_ref; - -template -struct af_bit_ref; - -/// string base mode -enum BaseMode { AP_BIN = 2, AP_OCT = 8, AP_DEC = 10, AP_HEX = 16 }; - -#ifndef SYSTEMC_INCLUDED -#define SC_BIN 2 -#define SC_OCT 8 -#define SC_DEC 10 -#define SC_HEX 16 -#endif // !defined(SYSTEMC_INCLUDED) - -// Alias C data types -#ifdef _MSC_VER -typedef signed __int64 ap_slong; -typedef unsigned __int64 ap_ulong; -#else // !defined(_MSC_VER) -typedef signed long long ap_slong; -typedef unsigned long long ap_ulong; -#endif // !defined(_MSC_VER) - -enum { - _AP_SIZE_char = 8, - _AP_SIZE_short = sizeof(short) * 8, - _AP_SIZE_int = sizeof(int) * 8, - _AP_SIZE_long = sizeof(long) * 8, - _AP_SIZE_ap_slong = sizeof(ap_slong) * 8 -}; - -#endif // !defined(__AP_DECL_H__) - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_DECL_H__ +#define __AP_DECL_H__ + +// ---------------------------------------------------------------------- + +#if !defined(__AP_FIXED_H__) && !defined(__AP_INT_H__) && !defined(__AUTOPILOT_CBE_H__) && !defined(__HLS_HALF_H__) +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +// Test __SYNTHESIS__ only for mode +#if !defined(__SYNTHESIS__) && (defined(AESL_SYN) || defined(__HLS_SYN__)) +//#pragma message "AESL_SYN and __HLS_SYN__ should be replaced by __SYNTHESIS__" +#define __SYNTHESIS__ +#endif + +/* for safety*/ +#if (defined(_AP_N) || defined(_AP_C)) +#error One or more of the following is defined: _AP_N, _AP_C. Definition conflicts with their usage as template parameters. 
+#endif + +/* for safety*/ +#if (defined(_AP_W) || defined(_AP_I) || defined(_AP_S) || defined(_AP_Q) || \ + defined(_AP_O) || defined(_AP_W2) || defined(_AP_I2) || \ + defined(_AP_S2) || defined(_AP_Q2) || defined(_AP_O2) || \ + defined(_AP_N) || defined(_AP_N2)) +#error \ + "One or more of the following is defined: _AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N, _AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2. Definition conflicts with their usage as template parameters." +#endif + +/*for safety*/ +#if (defined(_AP_W3) || defined(_AP_S3) || defined(_AP_W4) || defined(_AP_S4)) +#error \ + "One or more of the following is defined: _AP_W3, _AP_S3, _AP_W4,_AP_S4. Definition conflicts with their usage as template parameters." +#endif + +#if (defined(_AP_W1) || defined(_AP_S1) || defined(_AP_T) || \ + defined(_AP_T1) || defined(_AP_T2) || defined(_AP_T3) || defined(_AP_T4)) +#error \ + "One or more of the following is defined: _AP_W1, _AP_S1, _AP_T, _AP_T1, _AP_T2, _AP_T3, _AP_T4. Definition conflicts with their usage as template parameters." +#endif + +#ifndef __cplusplus +#error "AP data type can only be used in C++" +#endif + +// ---------------------------------------------------------------------- + +#ifndef __SC_COMPATIBLE__ +/// ap_fixed quantification mode +enum ap_q_mode { + AP_RND, //< rounding to plus infinity + AP_RND_ZERO, //< rounding to zero + AP_RND_MIN_INF, //< rounding to minus infinity + AP_RND_INF, //< rounding to infinity + AP_RND_CONV, //< convergent rounding + AP_TRN, //< truncation + AP_TRN_ZERO, //< truncation to zero +}; + +// FIXME for legacy code +#ifndef SYSTEMC_INCLUDED +#define SC_RND AP_RND +#define SC_RND_ZERO AP_RND_ZERO +#define SC_RND_MIN_INF AP_RND_MIN_INF +#define SC_RND_INF AP_RND_INF +#define SC_RND_CONV AP_RND_CONV +#define SC_TRN AP_TRN +#define SC_TRN_ZERO AP_TRN_ZERO +#endif // !defined(SYSTEMC_INCLUDED) + +/// ap_fixed saturation mode +enum ap_o_mode { + AP_SAT, //< saturation + AP_SAT_ZERO, //< saturation to zero + AP_SAT_SYM, //< symmetrical saturation + AP_WRAP, //< wrap-around (*) + AP_WRAP_SM, //< sign magnitude wrap-around (*) +}; + +// FIXME for legacy code +#ifndef SYSTEMC_INCLUDED +#define SC_SAT AP_SAT +#define SC_SAT_ZERO AP_SAT_ZERO +#define SC_SAT_SYM AP_SAT_SYM +#define SC_WRAP AP_WRAP +#define SC_WRAP_SM AP_WRAP_SM +#endif // !defined(SYSTEMC_INCLUDED) + +#else // defined(__SC_COMPATIBLE__) + +// There will not be sc_fxdefs.h, and the emu should be defined by ap_fixed. 
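
The `ap_q_mode` and `ap_o_mode` enums above are the knobs every `ap_fixed`/`ap_ufixed` carries as its `_AP_Q` and `_AP_O` template parameters. A minimal hand-checked sketch of their effect, assuming these vendored `ap_types` headers are on the include path (`ap_fixed<8,4>` has an LSB of 1/16 and a signed range of [-8, 7.9375]):

```cpp
#include "ap_fixed.h"
#include <cstdio>

int main() {
    // 8 bits total, 4 integer bits -> LSB = 1/16 = 0.0625.
    ap_fixed<8, 4, AP_TRN, AP_WRAP> a = 2.71828; // truncate: 2.6875 (43/16)
    ap_fixed<8, 4, AP_RND, AP_WRAP> b = 2.71828; // round to +inf: 2.75 (44/16)

    // Overflow handling: 9.0 does not fit in [-8, 7.9375].
    ap_fixed<8, 4, AP_TRN, AP_SAT>  c = 9.0;     // saturate: 7.9375
    ap_fixed<8, 4, AP_TRN, AP_WRAP> d = 9.0;     // wrap: 9 - 16 = -7.0

    printf("%g %g %g %g\n", a.to_double(), b.to_double(),
           c.to_double(), d.to_double());
    return 0;
}
```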
+ +/// ap_fixed quantification mode +enum ap_q_mode { + SC_RND, //< rounding to plus infinity + SC_RND_ZERO, //< rounding to zero + SC_RND_MIN_INF, //< rounding to minus infinity + SC_RND_INF, //< rounding to infinity + SC_RND_CONV, //< convergent rounding + SC_TRN, //< truncation + SC_TRN_ZERO, //< truncation to zero +}; + +#define AP_RND SC_RND +#define AP_RND_ZERO SC_RND_ZERO +#define AP_RND_MIN_INF SC_RND_MIN_INF +#define AP_RND_INF SC_RND_INF +#define AP_RND_CONV SC_RND_CONV +#define AP_TRN SC_TRN +#define AP_TRN_ZERO SC_TRN_ZERO + +/// ap_fixed saturation mode +enum ap_o_mode { + SC_SAT, //< saturation + SC_SAT_ZERO, //< saturation to zero + SC_SAT_SYM, //< symmetrical saturation + SC_WRAP, //< wrap-around (*) + SC_WRAP_SM, //< sign magnitude wrap-around (*) +}; + +#define AP_SAT SC_SAT +#define AP_SAT_ZERO SC_SAT_ZERO +#define AP_SAT_SYM SC_SAT_SYM +#define AP_WRAP SC_WRAP +#define AP_WRAP_SM SC_WRAP_SM + +#endif // defined(__SC_COMPATIBLE__) + +template +struct ap_int_base; + +template +struct ap_int; + +template +struct ap_uint; + +template +struct ap_range_ref; + +template +struct ap_bit_ref; + +template +struct ap_concat_ref; + +template +struct ap_fixed_base; + +template +struct ap_fixed; + +template +struct ap_ufixed; + +template +struct af_range_ref; + +template +struct af_bit_ref; + +/// string base mode +enum BaseMode { AP_BIN = 2, AP_OCT = 8, AP_DEC = 10, AP_HEX = 16 }; + +#ifndef SYSTEMC_INCLUDED +#define SC_BIN 2 +#define SC_OCT 8 +#define SC_DEC 10 +#define SC_HEX 16 +#endif // !defined(SYSTEMC_INCLUDED) + +// Alias C data types +#ifdef _MSC_VER +typedef signed __int64 ap_slong; +typedef unsigned __int64 ap_ulong; +#else // !defined(_MSC_VER) +typedef signed long long ap_slong; +typedef unsigned long long ap_ulong; +#endif // !defined(_MSC_VER) + +enum { + _AP_SIZE_char = 8, + _AP_SIZE_short = sizeof(short) * 8, + _AP_SIZE_int = sizeof(int) * 8, + _AP_SIZE_long = sizeof(long) * 8, + _AP_SIZE_ap_slong = sizeof(ap_slong) * 8 +}; + +#endif // !defined(__AP_DECL_H__) + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_fixed.h b/hls4ml/templates/vivado/ap_types/ap_fixed.h index cd0192bcb9..a25913a3c8 100644 --- a/hls4ml/templates/vivado/ap_types/ap_fixed.h +++ b/hls4ml/templates/vivado/ap_types/ap_fixed.h @@ -1,360 +1,360 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_FIXED_H__ -#define __AP_FIXED_H__ - -#include -#include -#include - -//--------------------------------------------------------------- - -/// Signed Arbitrary Precision Fixed-Point Type. -// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h -template -struct ap_fixed : ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> { - typedef ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> Base; - // Constructor - /// default ctor - INLINE ap_fixed() : Base() {} - - /// default copy ctor - INLINE ap_fixed(const ap_fixed& op) { Base::V = op.V; } - - /// copy ctor from ap_fixed_base. 
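
Besides the numeric constructors, the headers accept string literals, and the `BaseMode` values declared above (`AP_BIN`, `AP_OCT`, `AP_DEC`, `AP_HEX`) double as the radix argument. A small sketch, hand-checked; when no radix is given the parser infers it, defaulting to decimal:

```cpp
#include "ap_fixed.h"
#include <cstdio>

int main() {
    // Decimal string, radix inferred from the text.
    ap_fixed<16, 8> a("3.375");        // 3 + 1/4 + 1/8
    // Explicit radix: the second argument selects the base.
    ap_fixed<16, 8> b("1.8", AP_HEX);  // 1 + 8/16 = 1.5
    ap_ufixed<8, 8> c("11", AP_BIN);   // binary 11 = 3
    printf("%g %g %g\n", a.to_double(), b.to_double(), c.to_double());
    return 0;
}
```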
- template - INLINE ap_fixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, - _AP_O2, _AP_N2>& op) - : Base(op) {} - - template - INLINE ap_fixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, - _AP_O2, _AP_N2>& op) - : Base(op) {} - - //// from ap_fixed - //template - //INLINE ap_fixed( - // const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} - - //template - //INLINE ap_fixed( - // const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} - - //// from ap_ufixed. - //template - //INLINE ap_fixed( - // const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { - //} - - //template - //INLINE ap_fixed( - // const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { - //} - - /// copy ctor from ap_int_base. - template - INLINE ap_fixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} - - template - INLINE ap_fixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} - - //// from ap_int. - //template - //INLINE ap_fixed(const ap_int<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, true>(op)) {} - - //template - //INLINE ap_fixed(const volatile ap_int<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, true>(op)) {} - - //// from ap_uint. - //template - //INLINE ap_fixed(const ap_uint<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, false>(op)) {} - - //template - //INLINE ap_fixed(const volatile ap_uint<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, false>(op)) {} - - // from ap_bit_ref. - template - INLINE ap_fixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} - - // from ap_range_ref. - template - INLINE ap_fixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} - - // from ap_concat_ref. - template - INLINE ap_fixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) - : Base(op) {} - - // from af_bit_ref. - template - INLINE ap_fixed( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - - // from af_range_ref. - template - INLINE ap_fixed( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - -// from c types. -#define CTOR(TYPE) \ - INLINE ap_fixed(TYPE v) : Base(v) {} - - CTOR(bool) - CTOR(char) - CTOR(signed char) - CTOR(unsigned char) - CTOR(short) - CTOR(unsigned short) - CTOR(int) - CTOR(unsigned int) - CTOR(long) - CTOR(unsigned long) - CTOR(ap_slong) - CTOR(ap_ulong) -#if _AP_ENABLE_HALF_ == 1 - CTOR(half) -#endif - CTOR(float) - CTOR(double) -#undef CTOR - - INLINE ap_fixed(const char* s) : Base(s) {} - - INLINE ap_fixed(const char* s, signed char rd) : Base(s, rd) {} - - // Assignment - // The assignment operator is technically inherited; however, it is always - // hidden by an explicitly or implicitly defined assignment operator for the - // derived class. - /* XXX ctor will be used when right is not of proper type. 
*/ - INLINE ap_fixed& operator=( - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { - Base::V = op.V; - return *this; - } - - INLINE void operator=( - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { - Base::V = op.V; - } - - INLINE ap_fixed& operator=( - const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { - Base::V = op.V; - return *this; - } - - INLINE void operator=( - const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { - Base::V = op.V; - } -}; // struct ap_fixed. - -//------------------------------------------------------------------- - -// Unsigned Arbitrary Precision Fixed-Point Type. -// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h -template -struct ap_ufixed : ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> { - typedef ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> Base; - // Constructor - /// default ctor - INLINE ap_ufixed() : Base() {} - - /// default copy ctor - INLINE ap_ufixed(const ap_ufixed& op) { Base::V = op.V; } - - /// copy ctor from ap_fixed_base - template - INLINE ap_ufixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, - _AP_O2, _AP_N2>& op) - : Base(op) {} - - /// copy ctor from ap_fixed_base - template - INLINE ap_ufixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, - _AP_O2, _AP_N2>& op) - : Base(op) {} - - //template - //INLINE ap_ufixed( - // const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} - - //template - //INLINE ap_ufixed( - // const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} - - //template - //INLINE ap_ufixed( - // const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { - //} - - //template - //INLINE ap_ufixed( - // const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { - //} - - /// copy ctor from ap_int_base. 
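
Assignment between differently parameterized types is where `_AP_Q`/`_AP_O` actually fire: the templated `ap_fixed_base::operator=` (further down, in ap_fixed_base.h) re-quantizes and re-saturates on every narrowing copy. A hand-checked sketch under the same include-path assumption:

```cpp
#include "ap_fixed.h"
#include <cstdio>

int main() {
    ap_fixed<16, 8> wide = 5.328125;       // fits exactly: 1364/256
    ap_fixed<8, 4, AP_RND, AP_SAT> narrow;

    narrow = wide;                         // re-quantize to 1/16 steps
    printf("%g\n", narrow.to_double());    // 5.3125 (85/16)

    wide = 123.0;                          // far outside [-8, 7.9375]
    narrow = wide;                         // saturates instead of wrapping
    printf("%g\n", narrow.to_double());    // 7.9375
    return 0;
}
```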
- template - INLINE ap_ufixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} - - template - INLINE ap_ufixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} - - //template - //INLINE ap_ufixed(const ap_int<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, true>(op)) {} - - //template - //INLINE ap_ufixed(const volatile ap_int<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, true>(op)) {} - - //template - //INLINE ap_ufixed(const ap_uint<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, false>(op)) {} - - //template - //INLINE ap_ufixed(const volatile ap_uint<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, false>(op)) {} - - template - INLINE ap_ufixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} - - template - INLINE ap_ufixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} - - template - INLINE ap_ufixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) - : Base(op) {} - - template - INLINE ap_ufixed( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - - template - INLINE ap_ufixed( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - -#define CTOR(TYPE) \ - INLINE ap_ufixed(TYPE v) : Base(v) {} - - CTOR(bool) - CTOR(char) - CTOR(signed char) - CTOR(unsigned char) - CTOR(short) - CTOR(unsigned short) - CTOR(int) - CTOR(unsigned int) - CTOR(long) - CTOR(unsigned long) - CTOR(ap_slong) - CTOR(ap_ulong) -#if _AP_ENABLE_HALF_ == 1 - CTOR(half) -#endif - CTOR(float) - CTOR(double) -#undef CTOR - - INLINE ap_ufixed(const char* s) : Base(s) {} - - INLINE ap_ufixed(const char* s, signed char rd) : Base(s, rd) {} - - // Assignment - INLINE ap_ufixed& operator=( - const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { - Base::V = op.V; - return *this; - } - - INLINE void operator=( - const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { - Base::V = op.V; - } - - INLINE ap_ufixed& operator=( - const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { - Base::V = op.V; - return *this; - } - - INLINE void operator=(const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, - _AP_N>& op) volatile { - Base::V = op.V; - } -}; // struct ap_ufixed - - -#if !defined(__SYNTHESIS__) && (defined(SYSTEMC_H) || defined(SYSTEMC_INCLUDED)) -// XXX sc_trace overload for ap_fixed is already included in -// "ap_sysc/ap_sc_extras.h", so do not define in synthesis. -template -INLINE void sc_trace(sc_core::sc_trace_file* tf, - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op, - const std::string& name) { - tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); -} - -template -INLINE void sc_trace(sc_core::sc_trace_file* tf, - const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op, - const std::string& name) { - tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); -} -#endif // System C sim - -// Specialization of std containers, so that std::complex can have its -// image part automatically zero-initialized when only real part is provided. -#include - -#endif // ifndef __AP_FIXED_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_H__ +#define __AP_FIXED_H__ + +#include +#include +#include + +//--------------------------------------------------------------- + +/// Signed Arbitrary Precision Fixed-Point Type. +// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h +template +struct ap_fixed : ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> { + typedef ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> Base; + // Constructor + /// default ctor + INLINE ap_fixed() : Base() {} + + /// default copy ctor + INLINE ap_fixed(const ap_fixed& op) { Base::V = op.V; } + + /// copy ctor from ap_fixed_base. + template + INLINE ap_fixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_fixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + //// from ap_fixed + //template + //INLINE ap_fixed( + // const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //template + //INLINE ap_fixed( + // const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //// from ap_ufixed. + //template + //INLINE ap_fixed( + // const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + //template + //INLINE ap_fixed( + // const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + /// copy ctor from ap_int_base. + template + INLINE ap_fixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_fixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + //// from ap_int. + //template + //INLINE ap_fixed(const ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //template + //INLINE ap_fixed(const volatile ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //// from ap_uint. + //template + //INLINE ap_fixed(const ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + //template + //INLINE ap_fixed(const volatile ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + // from ap_bit_ref. + template + INLINE ap_fixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + // from ap_range_ref. + template + INLINE ap_fixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + // from ap_concat_ref. + template + INLINE ap_fixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) + : Base(op) {} + + // from af_bit_ref. + template + INLINE ap_fixed( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + // from af_range_ref. + template + INLINE ap_fixed( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +// from c types. 
+#define CTOR(TYPE) \ + INLINE ap_fixed(TYPE v) : Base(v) {} + + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + CTOR(half) +#endif + CTOR(float) + CTOR(double) +#undef CTOR + + INLINE ap_fixed(const char* s) : Base(s) {} + + INLINE ap_fixed(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + // The assignment operator is technically inherited; however, it is always + // hidden by an explicitly or implicitly defined assignment operator for the + // derived class. + /* XXX ctor will be used when right is not of proper type. */ + INLINE ap_fixed& operator=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } + + INLINE ap_fixed& operator=( + const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=( + const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } +}; // struct ap_fixed. + +//------------------------------------------------------------------- + +// Unsigned Arbitrary Precision Fixed-Point Type. +// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h +template +struct ap_ufixed : ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> { + typedef ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> Base; + // Constructor + /// default ctor + INLINE ap_ufixed() : Base() {} + + /// default copy ctor + INLINE ap_ufixed(const ap_ufixed& op) { Base::V = op.V; } + + /// copy ctor from ap_fixed_base + template + INLINE ap_ufixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + /// copy ctor from ap_fixed_base + template + INLINE ap_ufixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + //template + //INLINE ap_ufixed( + // const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //template + //INLINE ap_ufixed( + // const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //template + //INLINE ap_ufixed( + // const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + //template + //INLINE ap_ufixed( + // const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + /// copy ctor from ap_int_base. 
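
The `CTOR(TYPE)` block above is a classic X-macro: one macro application per C type stamps out an identical converting constructor, and the macro is then `#undef`'d so it cannot leak out of the class body. The same pattern in isolation, as an illustration (the `Wrapped` class and its member are made up, not from the header):

```cpp
#include <cstdio>

// Hypothetical value wrapper using the same stamp-out-constructors
// pattern as ap_fixed's CTOR(TYPE) block.
struct Wrapped {
    double v;
    Wrapped() : v(0) {}
#define CTOR(TYPE) \
    Wrapped(TYPE x) : v(static_cast<double>(x)) {}
    CTOR(bool)
    CTOR(int)
    CTOR(unsigned int)
    CTOR(long long)
    CTOR(float)
    CTOR(double)
#undef CTOR  // keep the macro local to the class body
};

int main() {
    Wrapped a = 42;    // uses CTOR(int)
    Wrapped b = 2.5f;  // uses CTOR(float)
    printf("%g %g\n", a.v, b.v);
    return 0;
}
```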
+ template + INLINE ap_ufixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_ufixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + //template + //INLINE ap_ufixed(const ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //template + //INLINE ap_ufixed(const volatile ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //template + //INLINE ap_ufixed(const ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + //template + //INLINE ap_ufixed(const volatile ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + template + INLINE ap_ufixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_ufixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_ufixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) + : Base(op) {} + + template + INLINE ap_ufixed( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_ufixed( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +#define CTOR(TYPE) \ + INLINE ap_ufixed(TYPE v) : Base(v) {} + + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + CTOR(half) +#endif + CTOR(float) + CTOR(double) +#undef CTOR + + INLINE ap_ufixed(const char* s) : Base(s) {} + + INLINE ap_ufixed(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + INLINE ap_ufixed& operator=( + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=( + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } + + INLINE ap_ufixed& operator=( + const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=(const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, + _AP_N>& op) volatile { + Base::V = op.V; + } +}; // struct ap_ufixed + + +#if !defined(__SYNTHESIS__) && (defined(SYSTEMC_H) || defined(SYSTEMC_INCLUDED)) +// XXX sc_trace overload for ap_fixed is already included in +// "ap_sysc/ap_sc_extras.h", so do not define in synthesis. +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op, + const std::string& name) { + tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} + +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op, + const std::string& name) { + tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} +#endif // System C sim + +// Specialization of std containers, so that std::complex can have its +// image part automatically zero-initialized when only real part is provided. +#include + +#endif // ifndef __AP_FIXED_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_fixed_base.h b/hls4ml/templates/vivado/ap_types/ap_fixed_base.h index 1d94b938f0..216f9772e5 100644 --- a/hls4ml/templates/vivado/ap_types/ap_fixed_base.h +++ b/hls4ml/templates/vivado/ap_types/ap_fixed_base.h @@ -1,2354 +1,2354 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_FIXED_BASE_H__ -#define __AP_FIXED_BASE_H__ - -#ifndef __AP_FIXED_H__ -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." -#endif - -// for ap_int_base and its reference types. -#include -#ifndef __SYNTHESIS__ -#if _AP_ENABLE_HALF_ == 1 -// for half type -#include -#endif -// for std io -#include -#endif - -#ifndef __cplusplus -#error "C++ is required to include this header file" -#else // __cplusplus - -// for warning on unsupported rounding mode in conversion to float/double. -#if !defined(__SYNTHESIS__) && __cplusplus >= 201103L && \ - (defined(__gnu_linux__) || defined(_WIN32)) -#define AP_FIXED_ENABLE_CPP_FENV 1 -#include -#endif - -// ---------------------------------------------------------------------- - -/* Major TODO - long double support: constructor, assign and other operators. - binary operators with ap_fixed_base and const char*. - return ap_fixed/ap_ufixed when result signedness is known. -*/ - -// Helper function in conversion to floating point types. - -#ifdef __SYNTHESIS__ -#define _AP_ctype_op_get_bit(var, index) _AP_ROOT_op_get_bit(var, index) -#define _AP_ctype_op_set_bit(var, index, x) _AP_ROOT_op_set_bit(var, index, x) -#define _AP_ctype_op_get_range(var, low, high) \ - _AP_ROOT_op_get_range(var, low, high) -#define _AP_ctype_op_set_range(var, low, high, x) \ - _AP_ROOT_op_set_range(var, low, high, x) -#else // ifdef __SYNTHESIS__ -template -inline bool _AP_ctype_op_get_bit(_Tp1& var, const _Tp2& index) { - return !!(var & (1ull << (index))); -} -template -inline _Tp1 _AP_ctype_op_set_bit(_Tp1& var, const _Tp2& index, const _Tp3& x) { - var |= (((x) ? 1ull : 0ull) << (index)); - return var; -} -template -inline _Tp1 _AP_ctype_op_get_range(_Tp1& var, const _Tp2& low, - const _Tp3& high) { - _Tp1 r = var; - ap_ulong mask = -1ll; - mask >>= (sizeof(_Tp1) * 8 - ((high) - (low) + 1)); - r >>= (low); - r &= mask; - return r; -} -template -inline _Tp1 _AP_ctype_op_set_range(_Tp1& var, const _Tp2& low, const _Tp3& high, - const _Tp4& x) { - ap_ulong mask = -1ll; - mask >>= (_AP_SIZE_ap_slong - ((high) - (low) + 1)); - var &= ~(mask << (low)); - var |= ((mask & x) << (low)); - return var; -} -#endif // ifdef __SYNTHESIS__ - - -// trait for letting base class to return derived class. -// Notice that derived class template is incomplete, and we cannot use -// the member of the derived class. -template -struct _ap_fixed_factory; -template -struct _ap_fixed_factory<_AP_W2, _AP_I2, true> { - typedef ap_fixed<_AP_W2, _AP_I2> type; -}; -template -struct _ap_fixed_factory<_AP_W2, _AP_I2, false> { - typedef ap_ufixed<_AP_W2, _AP_I2> type; -}; - -/// ap_fixed_base: AutoPilot fixed point. -/** partial specialization of signed. - @tparam _AP_W width. - @tparam _AP_I integral part width. - @tparam _AP_S signed. - @tparam _AP_Q quantization mode. Default is AP_TRN. - @tparam _AP_O saturation mode. Default is AP_WRAP. - @tparam _AP_N saturation wrap value. Default is 0. 
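
The `_ap_fixed_factory` trait above solves a chicken-and-egg problem: `ap_fixed_base` must name its derived types (`ap_fixed` vs `ap_ufixed`) in its return-type traits while those templates are still incomplete, so a small trait maps (width, integer width, signedness) to the right derived class. The same idiom in miniature (all names here are illustrative, not from the header):

```cpp
#include <type_traits>

// Forward declarations -- the real types may still be incomplete here,
// exactly as ap_fixed/ap_ufixed are when ap_fixed_base is defined.
template <int W> struct SignedVal;
template <int W> struct UnsignedVal;

// Map (width, signedness) to the derived type, like _ap_fixed_factory.
template <int W, bool S> struct val_factory;
template <int W> struct val_factory<W, true>  { typedef SignedVal<W>   type; };
template <int W> struct val_factory<W, false> { typedef UnsignedVal<W> type; };

template <int W> struct SignedVal   { long long v; };
template <int W> struct UnsignedVal { unsigned long long v; };

// A base-like facility can now return the proper derived type.
template <int W, bool S>
typename val_factory<W, S>::type make_val() {
    return typename val_factory<W, S>::type();
}

static_assert(std::is_same<val_factory<8, true>::type, SignedVal<8> >::value,
              "signed maps to SignedVal");
int main() { return make_val<8, false>().v ? 1 : 0; }
```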
- */ -// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h -template -struct ap_fixed_base : _AP_ROOT_TYPE<_AP_W, _AP_S> { - public: - typedef _AP_ROOT_TYPE<_AP_W, _AP_S> Base; - static const int width = _AP_W; - static const int iwidth = _AP_I; - static const ap_q_mode qmode = _AP_Q; - static const ap_o_mode omode = _AP_O; - - /// Return type trait. - template - struct RType { - enum { - _AP_F = _AP_W - _AP_I, - F2 = _AP_W2 - _AP_I2, - mult_w = _AP_W + _AP_W2, - mult_i = _AP_I + _AP_I2, - mult_s = _AP_S || _AP_S2, - plus_w = AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + - 1 + AP_MAX(_AP_F, F2), - plus_i = - AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1, - plus_s = _AP_S || _AP_S2, - minus_w = - AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1 + - AP_MAX(_AP_F, F2), - minus_i = - AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1, - minus_s = true, -#ifndef __SC_COMPATIBLE__ - div_w = _AP_S2 + _AP_W + AP_MAX(F2, 0), -#else - div_w = _AP_S2 + _AP_W + AP_MAX(F2, 0) + AP_MAX(_AP_I2, 0), -#endif - div_i = _AP_S2 + _AP_I + F2, - div_s = _AP_S || _AP_S2, - logic_w = - AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + - AP_MAX(_AP_F, F2), - logic_i = AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)), - logic_s = _AP_S || _AP_S2 - }; - - typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> lhs; - typedef ap_fixed_base<_AP_W2, _AP_I2, _AP_S2> rhs; - - typedef ap_fixed_base mult_base; - typedef ap_fixed_base plus_base; - typedef ap_fixed_base minus_base; - typedef ap_fixed_base logic_base; - typedef ap_fixed_base div_base; - typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> arg1_base; - - typedef typename _ap_fixed_factory::type mult; - typedef typename _ap_fixed_factory::type plus; - typedef typename _ap_fixed_factory::type minus; - typedef typename _ap_fixed_factory::type logic; - typedef typename _ap_fixed_factory::type div; - typedef typename _ap_fixed_factory<_AP_W, _AP_I, _AP_S>::type arg1; - }; - - private: -#ifndef __SYNTHESIS__ - // This cannot handle hex float format string. - void fromString(const std::string& val, unsigned char radix) { - _AP_ERROR(!(radix == 2 || radix == 8 || radix == 10 || radix == 16), - "ap_fixed_base::fromString(%s, %d)", val.c_str(), radix); - - Base::V = 0; - int startPos = 0; - int endPos = val.length(); - int decPos = val.find("."); - if (decPos == -1) decPos = endPos; - - // handle sign - bool isNegative = false; - if (val[0] == '-') { - isNegative = true; - ++startPos; - } else if (val[0] == '+') - ++startPos; - - // If there are no integer bits, e.g.: - // .0000XXXX, then keep at least one bit. - // If the width is greater than the number of integer bits, e.g.: - // XXXX.XXXX, then we keep the integer bits - // if the number of integer bits is greater than the width, e.g.: - // XXX000 then we keep the integer bits. - // Always keep one bit. - ap_fixed_base - integer_bits = 0; - - // Figure out if we can shift instead of multiply - unsigned shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0); - - //std::cout << "\n\n" << val << "\n"; - //std::cout << startPos << " " << decPos << " " << endPos << "\n"; - - bool sticky_int = false; - - // Traverse the integer digits from the MSD, multiplying by radix as we go. 
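
The `RType` trait above is where the bit-growth rules live: a product needs `_AP_W + _AP_W2` bits with `_AP_I + _AP_I2` integer bits, and a sum needs one more bit than the wider of the two aligned operands. Worked through for two concrete types, a sketch assuming the headers plus a C++11 compiler; `width`/`iwidth` are the static members declared just above:

```cpp
#include "ap_fixed.h"

int main() {
    ap_fixed<8, 4> a = 3.25;  // 4 integer bits, 4 fraction bits
    ap_fixed<6, 2> b = 1.75;  // 2 integer bits, 4 fraction bits

    // mult_w = 8 + 6 = 14, mult_i = 4 + 2 = 6 -> ap_fixed<14, 6>
    static_assert(decltype(a * b)::width == 14, "product width");
    static_assert(decltype(a * b)::iwidth == 6, "product integer width");

    // plus_w = max(4, 2) + 1 + max(4, 4) = 9, plus_i = 5 -> ap_fixed<9, 5>
    static_assert(decltype(a + b)::width == 9, "sum width");
    static_assert(decltype(a + b)::iwidth == 5, "sum integer width");

    return (a * b).to_int();  // exact product 5.6875 -> 5
}
```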
- for (int i = startPos; i < decPos; i++) { - // Get a digit - char cdigit = val[i]; - if (cdigit == '\0') continue; - unsigned digit = ap_private_ops::decode_digit(cdigit, radix); - - sticky_int |= integer_bits[AP_MAX(_AP_I, 4) + 4 - 1] | - integer_bits[AP_MAX(_AP_I, 4) + 4 - 2] | - integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] | - integer_bits[AP_MAX(_AP_I, 4) + 4 - 4]; - // Shift or multiply the value by the radix - if (shift) - integer_bits <<= shift; - else - integer_bits *= radix; - - // Add in the digit we just interpreted - integer_bits += digit; - //std::cout << "idigit = " << digit << " " << integer_bits.to_string() - // << " " << sticky_int << "\n"; - } - integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] = - integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] | sticky_int; - - ap_fixed_base fractional_bits = 0; - bool sticky = false; - - // Traverse the fractional digits from the LSD, dividing by radix as we go. - for (int i = endPos - 1; i >= decPos + 1; i--) { - // Get a digit - char cdigit = val[i]; - if (cdigit == '\0') continue; - unsigned digit = ap_private_ops::decode_digit(cdigit, radix); - // Add in the digit we just interpreted - fractional_bits += digit; - - sticky |= fractional_bits[0] | fractional_bits[1] | fractional_bits[2] | - fractional_bits[3]; - // Shift or divide the value by the radix - if (shift) - fractional_bits >>= shift; - else - fractional_bits /= radix; - - //std::cout << "fdigit = " << digit << " " << fractional_bits.to_string() - // << " " << sticky << "\n"; - } - - //std::cout << "Int =" << integer_bits.to_string() << " " << - // fractional_bits.to_string() << "\n"; - - fractional_bits[0] = fractional_bits[0] | sticky; - - if (isNegative) - *this = -(integer_bits + fractional_bits); - else - *this = integer_bits + fractional_bits; - - //std::cout << "end = " << this->to_string(16) << "\n"; - } - - /// report invalid constrction of ap_fixed_base - INLINE void report() { - if (!_AP_S && _AP_O == AP_WRAP_SM) { - fprintf(stderr, "ap_ufxied<...> cannot support AP_WRAP_SM.\n"); - exit(1); - } - if (_AP_W > MAX_MODE(AP_INT_MAX_W) * 1024) { - fprintf(stderr, - "[E] ap_%sfixed<%d, ...>: Bitwidth exceeds the " - "default max value %d. Please use macro " - "AP_INT_MAX_W to set a larger max value.\n", - _AP_S ? "" : "u", _AP_W, MAX_MODE(AP_INT_MAX_W) * 1024); - exit(1); - } - } -#else - INLINE void report() {} -#endif // ifdef __SYNTHESIS__ - - /// @name helper functions. 
- // @{ - INLINE void overflow_adjust(bool underflow, bool overflow, bool lD, - bool sign) { - if (!underflow && !overflow) return; - if (_AP_O == AP_WRAP) { - if (_AP_N == 0) return; - if (_AP_S) { - // signed AP_WRAP - // n_bits == 1 - Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, sign); - if (_AP_N > 1) { - // n_bits > 1 - ap_int_base<_AP_W, false> mask(-1); - if (sign) mask.V = 0; - Base::V = - _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 2, mask.V); - } - } else { - // unsigned AP_WRAP - ap_int_base<_AP_W, false> mask(-1); - Base::V = - _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 1, mask.V); - } - } else if (_AP_O == AP_SAT_ZERO) { - Base::V = 0; - } else if (_AP_O == AP_WRAP_SM && _AP_S) { - bool Ro = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); - if (_AP_N == 0) { - if (lD != Ro) { - Base::V = ~Base::V; - Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, lD); - } - } else { - if (_AP_N == 1 && sign != Ro) { - Base::V = ~Base::V; - } else if (_AP_N > 1) { - bool lNo = _AP_ROOT_op_get_bit(Base::V, _AP_W - _AP_N); - if (lNo == sign) Base::V = ~Base::V; - ap_int_base<_AP_W, false> mask(-1); - if (sign) mask.V = 0; - Base::V = - _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 2, mask.V); - } - Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, sign); - } - } else { - if (_AP_S) { - if (overflow) { - Base::V = 1; - Base::V <<= _AP_W - 1; - Base::V = ~Base::V; - } else if (underflow) { - Base::V = 1; - Base::V <<= _AP_W - 1; - if (_AP_O == AP_SAT_SYM) Base::V |= 1; - } - } else { - if (overflow) - Base::V = ~(ap_int_base<_AP_W, false>(0).V); - else if (underflow) - Base::V = 0; - } - } - } - - INLINE bool quantization_adjust(bool qb, bool r, bool s) { - bool carry = (bool)_AP_ROOT_op_get_bit(Base::V, _AP_W - 1); - if (_AP_Q == AP_TRN) return false; - if (_AP_Q == AP_RND_ZERO) - qb &= s || r; - else if (_AP_Q == AP_RND_MIN_INF) - qb &= r; - else if (_AP_Q == AP_RND_INF) - qb &= !s || r; - else if (_AP_Q == AP_RND_CONV) - qb &= _AP_ROOT_op_get_bit(Base::V, 0) || r; - else if (_AP_Q == AP_TRN_ZERO) - qb = s && (qb || r); - Base::V += qb; - return carry && (!(bool)_AP_ROOT_op_get_bit(Base::V, _AP_W - 1)); - } - // @} - - public: - /// @name constructors. - // @{ - /// default ctor. - INLINE ap_fixed_base() {} - - /// copy ctor. 
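
`overflow_adjust` above implements each `_AP_O` policy bit by bit; the one that is easy to overlook is `AP_SAT_SYM`, which keeps the range symmetric by excluding the most negative code. Hand-checked sketch:

```cpp
#include "ap_fixed.h"
#include <cstdio>

int main() {
    // A 4-bit signed integer range is [-8, 7].
    ap_fixed<4, 4, AP_TRN, AP_SAT>     plain = -8.0;  // -8 is representable
    ap_fixed<4, 4, AP_TRN, AP_SAT_SYM> sym   = -8.0;  // clamped to -7

    // AP_WRAP with _AP_N = 0 (the default) simply drops high bits.
    ap_fixed<4, 4, AP_TRN, AP_WRAP>    wrap  = 9.0;   // 9 - 16 = -7

    printf("%g %g %g\n", plain.to_double(), sym.to_double(), wrap.to_double());
    return 0;
}
```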
- template - INLINE ap_fixed_base( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - operator=(op); - report(); - } - - template - INLINE ap_fixed_base( - const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - operator=(op); - report(); - } - - template - INLINE ap_fixed_base(const ap_int_base<_AP_W2, _AP_S2>& op) { - ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp; - tmp.V = op.V; - operator=(tmp); - report(); - } - - template - INLINE ap_fixed_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) { - ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp; - tmp.V = op.V; - operator=(tmp); - report(); - } - -#ifndef __SYNTHESIS__ -#ifndef NON_C99STRING - INLINE ap_fixed_base(const char* s, signed char rd = 0) { - unsigned char radix = rd; - std::string str = ap_private_ops::parseString(s, radix); // will guess rd, default 10 - _AP_ERROR(radix == 0, "ap_fixed_base(const char* \"%s\", %d), str=%s, radix = %d", - s, rd, str.c_str(), radix); // TODO remove this check - fromString(str, radix); - } -#else - INLINE ap_fixed_base(const char* s, signed char rd = 10) { - ap_int_base<_AP_W, _AP_S> t(s, rd); - Base::V = t.V; - } -#endif // ifndef NON_C99STRING -#else // ifndef __SYNTHESIS__ - // XXX _ssdm_string2bits only takes const string and const radix. - // It seems XFORM will do compile time processing of the string. - INLINE ap_fixed_base(const char* s) { - typeof(Base::V) t; - _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_I, _AP_S, _AP_Q, - _AP_O, _AP_N, _AP_C99); - Base::V = t; - } - INLINE ap_fixed_base(const char* s, signed char rd) { - typeof(Base::V) t; - _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_I, _AP_S, _AP_Q, - _AP_O, _AP_N, _AP_C99); - Base::V = t; - } -#endif // ifndef __SYNTHESIS__ else - - template - INLINE ap_fixed_base(const ap_bit_ref<_AP_W2, _AP_S2>& op) { - *this = ((bool)op); - report(); - } - - template - INLINE ap_fixed_base(const ap_range_ref<_AP_W2, _AP_S2>& op) { - *this = (ap_int_base<_AP_W2, false>(op)); - report(); - } - - template - INLINE ap_fixed_base( - const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) { - *this = (ap_int_base<_AP_W2 + _AP_W3, false>(op)); - report(); - } - - template - INLINE ap_fixed_base( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - *this = (bool(op)); - report(); - } - - template - INLINE ap_fixed_base( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - *this = (ap_int_base<_AP_W2, false>(op)); - report(); - } - - // ctors from c types. - // make a temp ap_fixed_base first, and use ap_fixed_base.operator= -#define CTOR_FROM_INT(C_TYPE, _AP_W2, _AP_S2) \ - INLINE ap_fixed_base(const C_TYPE x) { \ - ap_fixed_base<(_AP_W2), (_AP_W2), (_AP_S2)> tmp; \ - tmp.V = x; \ - *this = tmp; \ - } - - CTOR_FROM_INT(bool, 1, false) - CTOR_FROM_INT(char, 8, CHAR_IS_SIGNED) - CTOR_FROM_INT(signed char, 8, true) - CTOR_FROM_INT(unsigned char, 8, false) - CTOR_FROM_INT(short, _AP_SIZE_short, true) - CTOR_FROM_INT(unsigned short, _AP_SIZE_short, false) - CTOR_FROM_INT(int, _AP_SIZE_int, true) - CTOR_FROM_INT(unsigned int, _AP_SIZE_int, false) - CTOR_FROM_INT(long, _AP_SIZE_long, true) - CTOR_FROM_INT(unsigned long, _AP_SIZE_long, false) - CTOR_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) - CTOR_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) -#undef CTOR_FROM_INT -/* - * TODO: - *Theere used to be several funtions which were AP_WEAK. 
- *Now they're all INLINE expect ap_fixed_base(double d) - *Maybe we can use '#pragma HLS inline' instead of INLINE. - */ - AP_WEAK ap_fixed_base(double d) { - ap_int_base<64, false> ireg; - ireg.V = doubleToRawBits(d); - bool isneg = _AP_ROOT_op_get_bit(ireg.V, 63); - - ap_int_base exp; - ap_int_base exp_tmp; - exp_tmp.V = - _AP_ROOT_op_get_range(ireg.V, DOUBLE_MAN, DOUBLE_MAN + DOUBLE_EXP - 1); - exp = exp_tmp - DOUBLE_BIAS; - ap_int_base man; - man.V = _AP_ROOT_op_get_range(ireg.V, 0, DOUBLE_MAN - 1); - // do not support NaN - _AP_WARNING(exp == APFX_IEEE_DOUBLE_E_MAX + 1 && man.V != 0, - "assign NaN to fixed point value"); - man.V = _AP_ROOT_op_set_bit(man.V, DOUBLE_MAN, 1); - if (isneg) man = -man; - if ((ireg.V & 0x7fffffffffffffffLL) == 0) { - Base::V = 0; - } else { - int _AP_W2 = DOUBLE_MAN + 2, _AP_I2 = exp.V + 2, _AP_F = _AP_W - _AP_I, - F2 = _AP_W2 - _AP_I2; - bool _AP_S2 = true, - QUAN_INC = F2 > _AP_F && - !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); - bool carry = false; - // handle quantization - unsigned sh_amt = (F2 > _AP_F) ? F2 - _AP_F : _AP_F - F2; - if (F2 == _AP_F) - Base::V = man.V; - else if (F2 > _AP_F) { - if (sh_amt < DOUBLE_MAN + 2) - Base::V = man.V >> sh_amt; - else { - Base::V = isneg ? -1 : 0; - } - if ((_AP_Q != AP_TRN) && !((_AP_Q == AP_TRN_ZERO) && !_AP_S2)) { - bool qb = (F2 - _AP_F > _AP_W2) ? isneg : (bool)_AP_ROOT_op_get_bit( - man.V, F2 - _AP_F - 1); - bool r = - (F2 > _AP_F + 1) - ? _AP_ROOT_op_get_range(man.V, 0, (F2 - _AP_F - 2 < _AP_W2) - ? (F2 - _AP_F - 2) - : (_AP_W2 - 1)) != 0 - : false; - carry = quantization_adjust(qb, r, isneg); - } - } else { // no quantization - Base::V = man.V; - if (sh_amt < _AP_W) - Base::V = Base::V << sh_amt; - else - Base::V = 0; - } - // handle overflow/underflow - if ((_AP_O != AP_WRAP || _AP_N != 0) && - ((!_AP_S && _AP_S2) || - _AP_I - _AP_S < - _AP_I2 - _AP_S2 + - (QUAN_INC || - (_AP_S2 && (_AP_O == AP_SAT_SYM))))) { // saturation - bool deleted_zeros = _AP_S2 ? true : !carry, deleted_ones = true; - bool neg_src = isneg; - bool lD = false; - int pos1 = F2 - _AP_F + _AP_W; - int pos2 = F2 - _AP_F + _AP_W + 1; - bool newsignbit = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); - if (pos1 < _AP_W2 && pos1 >= 0) - // lD = _AP_ROOT_op_get_bit(man.V, pos1); - lD = (man.V >> pos1) & 1; - if (pos1 < _AP_W2) { - bool Range1_all_ones = true; - bool Range1_all_zeros = true; - bool Range2_all_ones = true; - ap_int_base Range2; - ap_int_base all_ones(-1); - - if (pos2 >= 0 && pos2 < _AP_W2) { - // Range2.V = _AP_ROOT_op_get_range(man.V, - // pos2, _AP_W2 - 1); - Range2.V = man.V; - Range2.V >>= pos2; - Range2_all_ones = Range2 == (all_ones >> pos2); - } else if (pos2 < 0) - Range2_all_ones = false; - if (pos1 >= 0 && pos2 < _AP_W2) { - Range1_all_ones = Range2_all_ones && lD; - Range1_all_zeros = !Range2.V && !lD; - } else if (pos2 == _AP_W2) { - Range1_all_ones = lD; - Range1_all_zeros = !lD; - } else if (pos1 < 0) { - Range1_all_zeros = !man.V; - Range1_all_ones = false; - } - - deleted_zeros = - deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros); - deleted_ones = - carry ? Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones; - neg_src = isneg && !(carry && Range1_all_ones); - } else - neg_src = isneg && newsignbit; - bool neg_trg = _AP_S && newsignbit; - bool overflow = (neg_trg || !deleted_zeros) && !isneg; - bool underflow = (!neg_trg || !deleted_ones) && neg_src; - if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S) - underflow |= - neg_src && - (_AP_W > 1 ? 
_AP_ROOT_op_get_range(Base::V, 0, _AP_W - 2) == 0 - : true); - overflow_adjust(underflow, overflow, lD, neg_src); - } - } - report(); - } - - // TODO more optimized implementation. - INLINE ap_fixed_base(float d) { *this = ap_fixed_base(double(d)); } - -#if _AP_ENABLE_HALF_ == 1 - // TODO more optimized implementation. - INLINE ap_fixed_base(half d) { *this = ap_fixed_base(double(d)); } -#endif - // @} - - /// @name assign operator - /// assign, using another ap_fixed_base of same template parameters. - /* - INLINE ap_fixed_base& operator=( - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { - Base::V = op.V; - return *this; - } - */ - - template - INLINE ap_fixed_base& operator=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - - const int _AP_F = _AP_W - _AP_I; - const int F2 = _AP_W2 - _AP_I2; - const int QUAN_INC = - F2 > _AP_F && !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); - - if (!op) Base::V = 0; - bool carry = false; - bool signbit = _AP_ROOT_op_get_bit(op.V, _AP_W2 - 1); - bool isneg = signbit && _AP_S2; - if (F2 == _AP_F) - Base::V = op.V; - else if (F2 > _AP_F) { - unsigned int sh_amt = F2 - _AP_F; - // moves bits right, handle quantization. - if (sh_amt < _AP_W2) { - Base::V = op.V >> sh_amt; - } else { - Base::V = isneg ? -1 : 0; - } - if (_AP_Q != AP_TRN && !(_AP_Q == AP_TRN_ZERO && !_AP_S2)) { - bool qbit = _AP_ROOT_op_get_bit(op.V, F2 - _AP_F - 1); - // bit after LSB. - bool qb = (F2 - _AP_F > _AP_W2) ? _AP_S2 && signbit : qbit; - enum { hi = ((F2 - _AP_F - 2) < _AP_W2) ? (F2 - _AP_F - 2) : (_AP_W2 - 1) }; - // bits after qb. - bool r = (F2 > _AP_F + 1) ? (_AP_ROOT_op_get_range(op.V, 0, hi) != 0) : false; - carry = quantization_adjust(qb, r, isneg); - } - } else { - unsigned sh_amt = _AP_F - F2; - // moves bits left, no quantization - if (sh_amt < _AP_W) { - if (_AP_W > _AP_W2) { - // extend and then shift, avoid losing bits. - Base::V = op.V; - Base::V <<= sh_amt; - } else { - // shift and truncate. - Base::V = op.V << sh_amt; - } - } else { - Base::V = 0; - } - } - // handle overflow/underflow - if ((_AP_O != AP_WRAP || _AP_N != 0) && - ((!_AP_S && _AP_S2) || - _AP_I - _AP_S < - _AP_I2 - _AP_S2 + - (QUAN_INC || (_AP_S2 && _AP_O == AP_SAT_SYM)))) { // saturation - bool deleted_zeros = _AP_S2 ? true : !carry; - bool deleted_ones = true; - bool neg_src = isneg; - bool newsignbit = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); - enum { pos1 = F2 - _AP_F + _AP_W, pos2 = F2 - _AP_F + _AP_W + 1 }; - bool lD = (pos1 < _AP_W2 && pos1 >= 0) ? _AP_ROOT_op_get_bit(op.V, pos1) - : false; - if (pos1 < _AP_W2) { - bool Range1_all_ones = true; - bool Range1_all_zeros = true; - bool Range2_all_ones = true; - ap_int_base<_AP_W2, false> all_ones(-1); - - if (pos2 < _AP_W2 && pos2 >= 0) { - ap_int_base<_AP_W2, false> Range2; - Range2.V = _AP_ROOT_op_get_range(op.V, pos2, _AP_W2 - 1); - Range2_all_ones = Range2 == (all_ones >> pos2); - } else if (pos2 < 0) { - Range2_all_ones = false; - } - - if (pos1 >= 0 && pos2 < _AP_W2) { - ap_int_base<_AP_W2, false> Range1; - Range1.V = _AP_ROOT_op_get_range(op.V, pos1, _AP_W2 - 1); - Range1_all_ones = Range1 == (all_ones >> pos1); - Range1_all_zeros = !Range1.V; - } else if (pos2 == _AP_W2) { - Range1_all_ones = lD; - Range1_all_zeros = !lD; - } else if (pos1 < 0) { - Range1_all_zeros = !op.V; - Range1_all_ones = false; - } - - deleted_zeros = - deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros); - deleted_ones = - carry ? 
Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones; - neg_src = isneg && !(carry && Range1_all_ones); - } else - neg_src = isneg && newsignbit; - bool neg_trg = _AP_S && newsignbit; - bool overflow = (neg_trg || !deleted_zeros) && !isneg; - bool underflow = (!neg_trg || !deleted_ones) && neg_src; - if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S) - underflow |= - neg_src && - (_AP_W > 1 ? _AP_ROOT_op_get_range(Base::V, 0, _AP_W - 2) == 0 - : true); - - overflow_adjust(underflow, overflow, lD, neg_src); - } - return *this; - } // operator= - - template - INLINE ap_fixed_base& operator=( - const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - operator=(const_cast&>(op)); - return *this; - } - - /// Set this ap_fixed_base with ULL. - INLINE ap_fixed_base& setBits(ap_ulong bv) { - // TODO when ull is not be long enough... - Base::V = bv; - return *this; - } - - /// Return a ap_fixed_base object whose this->V is assigned by bv. - static INLINE ap_fixed_base bitsToFixed(ap_ulong bv) { - // TODO fix when ull is not be long enough... - ap_fixed_base t; -#ifdef __SYNTHESIS__ - t.V = bv; -#else - t.V.set_bits(bv); -#endif - return t; - } - - // Explicit conversion functions to ap_int_base. - /** Captures all integer bits, in truncate mode. - * @param[in] Cnative follow conversion from double to int. - */ - INLINE ap_int_base to_ap_int_base( - bool Cnative = true) const { - ap_int_base ret; - if (_AP_I == 0) { - ret.V = 0; - } else if (_AP_I > 0 && _AP_I <= _AP_W) { - ret.V = _AP_ROOT_op_get_range(Base::V, _AP_W - _AP_I, _AP_W - 1); - } else if (_AP_I > _AP_W) { - ret.V = _AP_ROOT_op_get_range(Base::V, 0, _AP_W - 1); - ret.V <<= (_AP_I - _AP_W); - } - /* Consider the following case - * float f = -7.5f; - * ap_fixed<8,4> t = f; // -8 0 0 0 . 0.5 - * int i = t.to_int(); - * the result should be -7 instead of -8. - * Therefore, after truncation, the value should be increated by 1. - * For (-1, 0), carry to MSB will happen, but result 0 is still correct. - */ - if (Cnative && _AP_I < _AP_W) { - // Follow C native data type, conversion from double to int - if (_AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1) && (_AP_I < _AP_W) && - (_AP_ROOT_op_get_range( - Base::V, 0, _AP_I < 0 ? _AP_W - 1 : _AP_W - _AP_I - 1) != 0)) - ++ret; - } else { - // Follow OSCI library, conversion from sc_fixed to sc_int - } - return ret; - }; - - public: - template - INLINE operator ap_int_base<_AP_W2, _AP_S2>() const { - return ap_int_base<_AP_W2, _AP_S2>(to_ap_int_base()); - } - - // Explicit conversion function to C built-in integral type. - INLINE char to_char() const { return to_ap_int_base().to_char(); } - - INLINE int to_int() const { return to_ap_int_base().to_int(); } - - INLINE unsigned to_uint() const { return to_ap_int_base().to_uint(); } - - INLINE ap_slong to_int64() const { return to_ap_int_base().to_int64(); } - - INLINE ap_ulong to_uint64() const { return to_ap_int_base().to_uint64(); } - - /// covert function to double. - /** only round-half-to-even mode supported, does not obey FE env. */ - INLINE double to_double() const { -#if defined(AP_FIXED_ENABLE_CPP_FENV) - _AP_WARNING(std::fegetround() != FE_TONEAREST, - "Only FE_TONEAREST is supported"); -#endif - enum { BITS = DOUBLE_MAN + DOUBLE_EXP + 1 }; - if (!Base::V) return 0.0f; - bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. - ap_int_base<_AP_W, false> tmp; - if (s) - tmp.V = -Base::V; // may truncate one bit extra from neg in sim. 
- else - tmp.V = Base::V; - int l = tmp.countLeadingZeros(); ///< number of leading zeros. - int e = _AP_I - l - 1 + DOUBLE_BIAS; ///< exponent - int lsb_index = _AP_W - l - 1 - DOUBLE_MAN; - // more than 0.5? - bool a = (lsb_index >=2) ? - (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; - // round to even - a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; - // ull is at least 64-bit - ap_ulong m; - // may actually left shift, ensure buffer is wide enough. - if (_AP_W > BITS) { - m = (lsb_index >= 1) ? (ap_ulong)(tmp.V >> (lsb_index - 1)) - : (ap_ulong)(tmp.V << (1 - lsb_index)); - } else { - m = (ap_ulong)tmp.V; - m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) - : (m << (1 - lsb_index)); - } - m += a; - m >>= 1; - //std::cout << '\n' << std::hex << m << '\n'; // TODO delete this - // carry to MSB, increase exponent - if (_AP_ctype_op_get_bit(m, DOUBLE_MAN + 1)) { - e += 1; - } - // set sign and exponent - m = _AP_ctype_op_set_bit(m, BITS - 1, s); - //std::cout << m << '\n'; // TODO delete this - m = _AP_ctype_op_set_range(m, DOUBLE_MAN, DOUBLE_MAN + DOUBLE_EXP - 1, e); - //std::cout << std::hex << m << std::dec << std::endl; // TODO delete this - // cast to fp - return rawBitsToDouble(m); - } - - /// convert function to float. - /** only round-half-to-even mode supported, does not obey FE env. */ - INLINE float to_float() const { -#if defined(AP_FIXED_ENABLE_CPP_FENV) - _AP_WARNING(std::fegetround() != FE_TONEAREST, - "Only FE_TONEAREST is supported"); -#endif - enum { BITS = FLOAT_MAN + FLOAT_EXP + 1 }; - if (!Base::V) return 0.0f; - bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. - ap_int_base<_AP_W, false> tmp; - if (s) - tmp.V = -Base::V; // may truncate one bit extra from neg in sim. - else - tmp.V = Base::V; - int l = tmp.countLeadingZeros(); ///< number of leading zeros. - int e = _AP_I - l - 1 + FLOAT_BIAS; ///< exponent - int lsb_index = _AP_W - l - 1 - FLOAT_MAN; - // more than 0.5? - bool a = (lsb_index >=2) ? - (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; - // round to even - a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; - // ul is at least 32-bit - unsigned long m; - // may actually left shift, ensure buffer is wide enough. - if (_AP_W > BITS) { - m = (lsb_index >= 1) ? (unsigned long)(tmp.V >> (lsb_index - 1)) - : (unsigned long)(tmp.V << (1 - lsb_index)); - } else { - m = (unsigned long)tmp.V; - m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) - : (m << (1 - lsb_index)); - } - m += a; - m >>= 1; - // carry to MSB, increase exponent - if (_AP_ctype_op_get_bit(m, FLOAT_MAN + 1)) { - e += 1; - } - // set sign and exponent - m = _AP_ctype_op_set_bit(m, BITS - 1, s); - m = _AP_ctype_op_set_range(m, FLOAT_MAN, FLOAT_MAN + FLOAT_EXP - 1, e); - // cast to fp - return rawBitsToFloat(m); - } - -#if _AP_ENABLE_HALF_ == 1 - /// convert function to half. - /** only round-half-to-even mode supported, does not obey FE env. */ - INLINE half to_half() const { -#if defined(AP_FIXED_ENABLE_CPP_FENV) - _AP_WARNING(std::fegetround() != FE_TONEAREST, - "Only FE_TONEAREST is supported"); -#endif - enum { BITS = HALF_MAN + HALF_EXP + 1 }; - if (!Base::V) return 0.0f; - bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. - ap_int_base<_AP_W, false> tmp; - if (s) - tmp.V = -Base::V; // may truncate one bit extra from neg in sim. - else - tmp.V = Base::V; - int l = tmp.countLeadingZeros(); ///< number of leading zeros. 
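
The comment block inside `to_ap_int_base` above deserves a concrete check: with `Cnative` set, conversion to integer matches C's float-to-int behavior (truncate toward zero) rather than a plain drop of the fraction bits (which would round toward minus infinity). A sketch of the -7.5 case the comment describes:

```cpp
#include "ap_fixed.h"
#include <cmath>
#include <cstdio>

int main() {
    ap_fixed<8, 4> t = -7.5;                 // stored exactly: -120/16

    // Cnative: matches C's (int)(-7.5), i.e. truncation toward zero.
    printf("%d\n", t.to_int());              // -7
    printf("%d\n", (int)(-7.5));             // -7, same rule
    printf("%d\n", (int)std::floor(-7.5));   // -8, what dropping the
                                             // fraction bits would give
    return 0;
}
```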
- int e = _AP_I - l - 1 + HALF_BIAS; ///< exponent - int lsb_index = _AP_W - l - 1 - HALF_MAN; - // more than 0.5? - bool a = (lsb_index >=2) ? - (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; - // round to even - a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; - // short is at least 16-bit - unsigned short m; - // may actually left shift, ensure buffer is wide enough. - if (_AP_W > BITS) { - m = (lsb_index >= 1) ? (unsigned short)(tmp.V >> (lsb_index - 1)) - : (unsigned short)(tmp.V << (1 - lsb_index)); - } else { - m = (unsigned short)tmp.V; - m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) - : (m << (1 - lsb_index)); - } - m += a; - m >>= 1; - // carry to MSB, increase exponent - if (_AP_ctype_op_get_bit(m, HALF_MAN + 1)) { - e += 1; - } - // set sign and exponent - m = _AP_ctype_op_set_bit(m, BITS - 1, s); - m = _AP_ctype_op_set_range(m, HALF_MAN, HALF_MAN + HALF_EXP - 1, e); - // cast to fp - return rawBitsToHalf(m); - } -#endif - - // FIXME inherited from old code, this may loose precision! - INLINE operator long double() const { return (long double)to_double(); } - - INLINE operator double() const { return to_double(); } - - INLINE operator float() const { return to_float(); } - -#if _AP_ENABLE_HALF_ == 1 - INLINE operator half() const { return to_half(); } -#endif - - INLINE operator bool() const { return (bool)Base::V != 0; } - - INLINE operator char() const { return (char)to_int(); } - - INLINE operator signed char() const { return (signed char)to_int(); } - - INLINE operator unsigned char() const { return (unsigned char)to_uint(); } - - INLINE operator short() const { return (short)to_int(); } - - INLINE operator unsigned short() const { return (unsigned short)to_uint(); } - - INLINE operator int() const { return to_int(); } - - INLINE operator unsigned int() const { return to_uint(); } - -// FIXME don't assume data width... -#ifdef __x86_64__ - INLINE operator long() const { return (long)to_int64(); } - - INLINE operator unsigned long() const { return (unsigned long)to_uint64(); } -#else - INLINE operator long() const { return (long)to_int(); } - - INLINE operator unsigned long() const { return (unsigned long)to_uint(); } -#endif // ifdef __x86_64__ else - - INLINE operator ap_ulong() const { return to_uint64(); } - - INLINE operator ap_slong() const { return to_int64(); } - - INLINE int length() const { return _AP_W; }; - - // bits_to_int64 deleted. -#ifndef __SYNTHESIS__ - // Used in autowrap, when _AP_W < 64. - INLINE ap_ulong bits_to_uint64() const { - return (Base::V).to_uint64(); - } -#endif - - // Count the number of zeros from the most significant bit - // to the first one bit. Note this is only for ap_fixed_base whose - // _AP_W <= 64, otherwise will incur assertion. - INLINE int countLeadingZeros() { -#ifdef __SYNTHESIS__ - // TODO: used llvm.ctlz intrinsic ? - if (_AP_W <= 32) { - ap_int_base<32, false> t(-1ULL); - t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1); - return __builtin_ctz(t.V); - } else if (_AP_W <= 64) { - ap_int_base<64, false> t(-1ULL); - t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1); - return __builtin_ctzll(t.V); - } else { - enum {__N = (_AP_W + 63) / 64}; - int NZeros = 0; - int i = 0; - bool hitNonZero = false; - for (i = 0; i < __N - 1; ++i) { - ap_int_base<64, false> t; - t.range(0, 63) = this->range(_AP_W - i * 64 - 64, _AP_W - i * 64 - 1); - NZeros += hitNonZero ? 
0 : __builtin_clzll(t.V); - hitNonZero |= (t != 0); - } - if (!hitNonZero) { - ap_int_base<64, false> t(-1ULL); - t.range(63 - (_AP_W - 1) % 64, 63) = this->range(0, (_AP_W - 1) % 64); - NZeros += __builtin_clzll(t.V); - } - return NZeros; - } -#else - return Base::V.countLeadingZeros(); -#endif - } - - // Arithmetic : Binary - // ------------------------------------------------------------------------- - template - INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::mult operator*( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) - const { - typename RType<_AP_W2, _AP_I2, _AP_S2>::mult_base r, t; - r.V = Base::V; - t.V = op2.V; - r.V *= op2.V; - return r; - } - - // multiply function deleted. - - template - INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::div operator/( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) - const { - typename RType<_AP_W2, _AP_I2, _AP_S2>::div_base r; -#ifndef __SYNTHESIS__ - enum {F2 = _AP_W2-_AP_I2, - _W1=AP_MAX(_AP_W + AP_MAX(F2, 0) + ((_AP_S2 && !_AP_S) ? 1 : 0), _AP_W2 + ((_AP_S && !_AP_S2) ? 1 : 0))}; - ap_int_base<_W1,_AP_S||_AP_S2> dividend,divisior; - ap_int_base<_W1,_AP_S> tmp1; - ap_int_base<_W1,_AP_S2> tmp2; - tmp1.V = Base::V; - tmp1.V <<= AP_MAX(F2,0); - tmp2.V = op2.V; - dividend = tmp1; - divisior = tmp2; - r.V = ((_AP_S||_AP_S2) ? dividend.V.sdiv(divisior.V): dividend.V.udiv(divisior.V)); -#else - #ifndef __SC_COMPATIBLE__ - ap_fixed_base<_AP_W + AP_MAX(_AP_W2 - _AP_I2, 0),_AP_I, _AP_S> t(*this); - #else - ap_fixed_base<_AP_W + AP_MAX(_AP_W2 - _AP_I2, 0) + AP_MAX(_AP_I2, 0),_AP_I, _AP_S> t(*this); - #endif - r.V = t.V / op2.V; -#endif -/* - enum { - F2 = _AP_W2 - _AP_I2, - shl = AP_MAX(F2, 0) + AP_MAX(_AP_I2, 0), -#ifndef __SC_COMPATIBLE__ - shr = AP_MAX(_AP_I2, 0), -#else - shr = 0, -#endif - W3 = _AP_S2 + _AP_W + shl, - S3 = _AP_S || _AP_S2, - }; - ap_int_base dividend, t; - dividend.V = Base::V; - // multiply both by (1 << F2), and than do integer division. - dividend.V <<= (int) shl; -#ifdef __SYNTHESIS__ - // .V's have right signedness, and will have right extending. - t.V = dividend.V / op2.V; -#else - // XXX op2 may be wider than dividend, and sdiv and udiv takes the same with - // as left hand operand, so data might be truncated by mistake if not - // handled here. - t.V = S3 ? dividend.V.sdiv(op2.V) : dividend.V.udiv(op2.V); -#endif - r.V = t.V >> (int) shr; -*/ - return r; - } - -#define OP_BIN_AF(Sym, Rty) \ - template \ - INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::Rty operator Sym( \ - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& \ - op2) const { \ - typename RType<_AP_W2, _AP_I2, _AP_S2>::Rty##_base ret, lhs(*this), \ - rhs(op2); \ - ret.V = lhs.V Sym rhs.V; \ - return ret; \ - } - - OP_BIN_AF(+, plus) - OP_BIN_AF(-, minus) - OP_BIN_AF(&, logic) - OP_BIN_AF(|, logic) - OP_BIN_AF(^, logic) - -// Arithmetic : assign -// ------------------------------------------------------------------------- -#define OP_ASSIGN_AF(Sym) \ - template \ - INLINE ap_fixed_base& operator Sym##=( \ - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& \ - op2) { \ - *this = operator Sym(op2); \ - return *this; \ - } - - OP_ASSIGN_AF(*) - OP_ASSIGN_AF(/) - OP_ASSIGN_AF(+) - OP_ASSIGN_AF(-) - OP_ASSIGN_AF(&) - OP_ASSIGN_AF(|) - OP_ASSIGN_AF(^) - - // Prefix and postfix increment and decrement. 
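
Note how `OP_ASSIGN_AF` composes the two mechanisms above: `a *= b` first forms the full-precision `RType::mult` product, then funnels it back through `operator=`, so the final quantization and saturation follow `a`'s own `_AP_Q`/`_AP_O`. A hand-checked sketch:

```cpp
#include "ap_fixed.h"
#include <cstdio>

int main() {
    ap_fixed<8, 4, AP_RND, AP_SAT> a = 2.5;
    ap_fixed<8, 4>                 b = 2.5;

    // Full-precision product: an ap_fixed<16, 8> holds 6.25 exactly.
    printf("%g\n", (a * b).to_double());  // 6.25

    // Compound assign: product is re-quantized/saturated into a.
    a *= b;                               // 6.25 fits -> 6.25
    printf("%g\n", a.to_double());

    a *= b;                               // 15.625 > 7.9375 -> saturates
    printf("%g\n", a.to_double());        // 7.9375
    return 0;
}
```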
- // ------------------------------------------------------------------------- - - /// Prefix increment - INLINE ap_fixed_base& operator++() { - operator+=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1)); - return *this; - } - - /// Prefix decrement. - INLINE ap_fixed_base& operator--() { - operator-=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1)); - return *this; - } - - /// Postfix increment - INLINE const ap_fixed_base operator++(int) { - ap_fixed_base r(*this); - operator++(); - return r; - } - - /// Postfix decrement - INLINE const ap_fixed_base operator--(int) { - ap_fixed_base r(*this); - operator--(); - return r; - } - - // Unary arithmetic. - // ------------------------------------------------------------------------- - INLINE ap_fixed_base operator+() { return *this; } - - INLINE ap_fixed_base<_AP_W + 1, _AP_I + 1, true> operator-() const { - ap_fixed_base<_AP_W + 1, _AP_I + 1, true> r(*this); - r.V = -r.V; - return r; - } - - INLINE ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> getNeg() { - ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> r(*this); - r.V = -r.V; - return r; - } - - // Not (!) - // ------------------------------------------------------------------------- - INLINE bool operator!() const { return Base::V == 0; } - - // Bitwise complement - // ------------------------------------------------------------------------- - // XXX different from Mentor's ac_fixed. - INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S> operator~() const { - ap_fixed_base<_AP_W, _AP_I, _AP_S> r; - r.V = ~Base::V; - return r; - } - - // Shift - // ------------------------------------------------------------------------- - // left shift is the same as moving point right, i.e. increate I. - template - INLINE ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> lshift() const { - ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> r; - r.V = Base::V; - return r; - } - - template - INLINE ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> rshift() const { - ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> r; - r.V = Base::V; - return r; - } - - // Because the return type is the type of the the first operand, shift assign - // operators do not carry out any quantization or overflow - // While systemc, shift assigns for sc_fixed/sc_ufixed will result in - // quantization or overflow (depending on the mode of the first operand) - INLINE ap_fixed_base operator<<(unsigned int sh) const { - ap_fixed_base r; - r.V = Base::V << sh; -// TODO check shift overflow? -#ifdef __SC_COMPATIBLE__ - if (sh == 0) return r; - if (_AP_O != AP_WRAP || _AP_N != 0) { - bool neg_src = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); - bool allones, allzeros; - ap_int_base<_AP_W, false> ones(-1); - if (sh <= _AP_W) { - ap_int_base<_AP_W, false> range1; - range1.V = _AP_ROOT_op_get_range( - const_cast(this)->Base::V, _AP_W - sh, _AP_W - 1); - allones = range1 == (ones >> (_AP_W - sh)); - allzeros = range1 == 0; - } else { - allones = false; - allzeros = Base::V == 0; - } - bool overflow = !allzeros && !neg_src; - bool underflow = !allones && neg_src; - if ((_AP_O == AP_SAT_SYM) && _AP_S) - underflow |= - neg_src && - (_AP_W > 1 ? _AP_ROOT_op_get_range(r.V, 0, _AP_W - 2) == 0 : true); - bool lD = false; - if (sh < _AP_W) lD = _AP_ROOT_op_get_bit(Base::V, _AP_W - sh - 1); - r.overflow_adjust(underflow, overflow, lD, neg_src); - } -#endif - return r; - } - - INLINE ap_fixed_base operator>>(unsigned int sh) const { - ap_fixed_base r; - r.V = Base::V >> sh; -// TODO check shift overflow? 
-#ifdef __SC_COMPATIBLE__ - if (sh == 0) return r; - if (_AP_Q != AP_TRN) { - bool qb = false; - if (sh <= _AP_W) qb = _AP_ROOT_op_get_bit(Base::V, sh - 1); - bool rb = false; - if (sh > 1 && sh <= _AP_W) - rb = _AP_ROOT_op_get_range(const_cast(this)->Base::V, 0, - sh - 2) != 0; - else if (sh > _AP_W) - rb = Base::V != 0; - r.quantization_adjust(qb, rb, - _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)); - } -#endif - return r; - } - - // left and right shift for int - INLINE ap_fixed_base operator<<(int sh) const { - ap_fixed_base r; - bool isNeg = sh < 0; - unsigned int ush = isNeg ? -sh : sh; - if (isNeg) { - return operator>>(ush); - } else { - return operator<<(ush); - } - } - - INLINE ap_fixed_base operator>>(int sh) const { - bool isNeg = sh < 0; - unsigned int ush = isNeg ? -sh : sh; - if (isNeg) { - return operator<<(ush); - } else { - return operator>>(ush); - } - } - - // left and right shift for ap_int. - template - INLINE ap_fixed_base operator<<(const ap_int_base<_AP_W2, true>& op2) const { - // TODO the code seems not optimal. ap_fixed<8,8> << ap_int<2> needs only a - // small mux, but integer need a big one! - int sh = op2.to_int(); - return operator<<(sh); - } - - template - INLINE ap_fixed_base operator>>(const ap_int_base<_AP_W2, true>& op2) const { - int sh = op2.to_int(); - return operator>>(sh); - } - - // left and right shift for ap_uint. - template - INLINE ap_fixed_base operator<<(const ap_int_base<_AP_W2, false>& op2) const { - unsigned int sh = op2.to_uint(); - return operator<<(sh); - } - - template - INLINE ap_fixed_base operator>>(const ap_int_base<_AP_W2, false>& op2) const { - unsigned int sh = op2.to_uint(); - return operator>>(sh); - } - - // left and right shift for ap_fixed - template - INLINE ap_fixed_base operator<<( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& - op2) { - return operator<<(op2.to_ap_int_base()); - } - - template - INLINE ap_fixed_base operator>>( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& - op2) { - return operator>>(op2.to_ap_int_base()); - } - - // Shift assign. - // ------------------------------------------------------------------------- - - // left shift assign. - INLINE ap_fixed_base& operator<<=(const int sh) { - *this = operator<<(sh); - return *this; - } - - INLINE ap_fixed_base& operator<<=(const unsigned int sh) { - *this = operator<<(sh); - return *this; - } - - template - INLINE ap_fixed_base& operator<<=(const ap_int_base<_AP_W2, _AP_S2>& sh) { - *this = operator<<(sh.to_int()); - return *this; - } - - template - INLINE ap_fixed_base& operator<<=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& - sh) { - *this = operator<<(sh.to_int()); - return *this; - } - - // right shift assign. - INLINE ap_fixed_base& operator>>=(const int sh) { - *this = operator>>(sh); - return *this; - } - - INLINE ap_fixed_base& operator>>=(const unsigned int sh) { - *this = operator>>(sh); - return *this; - } - - template - INLINE ap_fixed_base& operator>>=(const ap_int_base<_AP_W2, _AP_S2>& sh) { - *this = operator>>(sh.to_int()); - return *this; - } - - template - INLINE ap_fixed_base& operator>>=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& - sh) { - *this = operator>>(sh.to_int()); - return *this; - } - -// Comparisons. 
-// ------------------------------------------------------------------------- -#define OP_CMP_AF(Sym) \ - template \ - INLINE bool operator Sym(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, \ - _AP_O2, _AP_N2>& op2) const { \ - enum { _AP_F = _AP_W - _AP_I, F2 = _AP_W2 - _AP_I2 }; \ - if (_AP_F == F2) \ - return Base::V Sym op2.V; \ - else if (_AP_F > F2) \ - return Base::V Sym ap_fixed_base(op2).V; \ - else \ - return ap_fixed_base(*this).V Sym op2.V; \ - return false; \ - } - - OP_CMP_AF(>) - OP_CMP_AF(<) - OP_CMP_AF(>=) - OP_CMP_AF(<=) - OP_CMP_AF(==) - OP_CMP_AF(!=) -// FIXME: Move compare with double out of struct ap_fixed_base defination -// and combine it with compare operator(double, ap_fixed_base) -#define DOUBLE_CMP_AF(Sym) \ - INLINE bool operator Sym(double d) const { return to_double() Sym d; } - - DOUBLE_CMP_AF(>) - DOUBLE_CMP_AF(<) - DOUBLE_CMP_AF(>=) - DOUBLE_CMP_AF(<=) - DOUBLE_CMP_AF(==) - DOUBLE_CMP_AF(!=) - - // Bit and Slice Select - INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[]( - unsigned index) { - _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); - return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index); - } - - template - INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[]( - const ap_int_base<_AP_W2, _AP_S2>& index) { - _AP_WARNING(index < 0, "Attempting to read bit with negative index"); - _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); - return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, - index.to_int()); - } - - INLINE bool operator[](unsigned index) const { - _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); - return _AP_ROOT_op_get_bit(const_cast(this)->V, index); - } - - INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit( - unsigned index) { - _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); - return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index); - } - - template - INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit( - const ap_int_base<_AP_W2, _AP_S2>& index) { - _AP_WARNING(index < 0, "Attempting to read bit with negative index"); - _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); - return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, - index.to_int()); - } - - INLINE bool bit(unsigned index) const { - _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); - return _AP_ROOT_op_get_bit(const_cast(this)->V, index); - } - - template - INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit( - const ap_int_base<_AP_W2, true>& index) { - _AP_WARNING(index < _AP_I - _AP_W, - "Attempting to read bit with negative index"); - _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); - return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( - this, index.to_int() + _AP_W - _AP_I); - } - - INLINE bool get_bit(int index) const { - _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); - _AP_WARNING(index < _AP_I - _AP_W, "Attempting to read bit beyond MSB"); - return _AP_ROOT_op_get_bit(const_cast(this)->V, - index + _AP_W - _AP_I); - } -#if 0 - INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit( - int index) { - _AP_WARNING(index < _AP_I - _AP_W, - "Attempting to read bit with negative index"); - _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); - return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( - this, index + _AP_W - _AP_I); - } -#endif - - template - 
INLINE bool get_bit(const ap_int_base<_AP_W2, true>& index) const { - _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); - _AP_WARNING(index < _AP_I - _AP_W, "Attempting to read bit beyond MSB"); - return _AP_ROOT_op_get_bit(const_cast(this)->V, - index.to_int() + _AP_W - _AP_I); - } - - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range(int Hi, - int Lo) { - _AP_WARNING((Hi >= _AP_W) || (Lo >= _AP_W), "Out of bounds in range()"); - return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, Hi, Lo); - } - - // This is a must to strip constness to produce reference type. - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( - int Hi, int Lo) const { - _AP_WARNING((Hi >= _AP_W) || (Lo >= _AP_W), "Out of bounds in range()"); - return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( - const_cast(this), Hi, Lo); - } - - template - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - template - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() { - return this->range(_AP_W - 1, 0); - } - - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() const { - return this->range(_AP_W - 1, 0); - } - - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( - int Hi, int Lo) { - return this->range(Hi, Lo); - } - - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( - int Hi, int Lo) const { - return this->range(Hi, Lo); - } - - template - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - template - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - INLINE bool is_zero() const { return Base::V == 0; } - - INLINE bool is_neg() const { - if (_AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)) return true; - return false; - } - - INLINE int wl() const { return _AP_W; } - - INLINE int iwl() const { return _AP_I; } - - INLINE ap_q_mode q_mode() const { return _AP_Q; } - - INLINE ap_o_mode o_mode() const { return _AP_O; } - - INLINE int n_bits() const { return _AP_N; } - - // print a string representation of this number in the given radix. - // Radix support is 2, 8, 10, or 16. - // The result will include a prefix indicating the radix, except for decimal, - // where no prefix is needed. The default is to output a signed representation - // of signed numbers, or an unsigned representation of unsigned numbers. For - // non-decimal formats, this can be changed by the 'sign' argument. 
-#ifndef __SYNTHESIS__
-  std::string to_string(unsigned char radix = 2, bool sign = _AP_S) const {
-    // XXX in autosim/autowrap.tcl "(${name}).to_string(2).c_str()" is used to
-    // initialize sc_lv, which seems incapable of handling format "-0b".
-    if (radix == 2) sign = false;
-
-    std::string str;
-    str.clear();
-    char step = 0;
-    bool isNeg = sign && (Base::V < 0);
-
-    // Extend to take care of the -MAX case.
-    ap_fixed_base<_AP_W + 1, _AP_I + 1> tmp(*this);
-    if (isNeg) {
-      tmp = -tmp;
-      str += '-';
-    }
-    std::string prefix;
-    switch (radix) {
-      case 2:
-        prefix = "0b";
-        step = 1;
-        break;
-      case 8:
-        prefix = "0o";
-        step = 3;
-        break;
-      case 16:
-        prefix = "0x";
-        step = 4;
-        break;
-      default:
-        break;
-    }
-
-    if (_AP_I > 0) {
-      // Note we drop the quantization and rounding flags here. The
-      // integer part is always in range, and the fractional part we
-      // want to drop. Also, the number is always positive, because
-      // of the absolute value above.
-      ap_int_base int_part;
-      // [1] [ I ] d [ W - I ]
-      //  |     |       |
-      //  |    W-I      0
-      //  W
-      int_part.V = _AP_ROOT_op_get_range(
-          tmp.V, _AP_W - _AP_I, _AP_W);
-      str += int_part.to_string(radix, false);
-    } else {
-      str += prefix;
-      str += '0';
-    }
-
-    ap_fixed_base frac_part = tmp;
-
-    if (radix == 10) {
-      if (frac_part != 0) {
-        str += ".";
-        while (frac_part != 0) {
-          char digit = (frac_part * radix).to_char();
-          str += static_cast<char>(digit + '0');
-          frac_part *= radix;
-        }
-      }
-    } else {
-      if (frac_part != 0) {
-        str += ".";
-        for (signed i = _AP_W - _AP_I - 1; i >= 0; i -= step) {
-          char digit = frac_part.range(i, AP_MAX(0, i - step + 1)).to_char();
-          // If we have a partial bit pattern at the end, then we need
-          // to put it in the high-order bits of 'digit'.
-          int offset = AP_MIN(0, i - step + 1);
-          digit <<= -offset;
-          str += digit < 10 ? static_cast<char>(digit + '0')
-                            : static_cast<char>(digit - 10 + 'a');
-        }
-        if (radix == 16)
-          str += "p0"; // C99 Hex constants are required to have an exponent.
-      }
-    }
-    return str;
-  }
-#else
-  // XXX HLS will delete this in synthesis
-  INLINE char* to_string(unsigned char radix = 2, bool sign = _AP_S) const {
-    return 0;
-  }
-#endif
-}; // struct ap_fixed_base.
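For orientation, a minimal host-side sketch of how the quantization (_AP_Q) and overflow (_AP_O) parameters handled above surface through the public ap_fixed wrapper. It assumes only that ap_fixed.h (the one header user code may include directly) is on the include path; the commented values follow from round-to-even and saturation semantics rather than from a captured run:

    #include <ap_fixed.h>
    #include <iostream>

    int main() {
      // 8 bits total, 4 integer bits: resolution 2^-4, signed range [-8, 7.9375].
      ap_fixed<8, 4, AP_RND_CONV, AP_SAT> a = 3.14159; // quantizes to 3.125
      ap_fixed<8, 4, AP_RND_CONV, AP_SAT> b = 100.0;   // saturates at 7.9375
      std::cout << a.to_string(10) << " " << a.to_double() << "\n";
      std::cout << b.to_string(10) << " " << b.to_double() << "\n";
      return 0;
    }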
-
-template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
-          int _AP_N>
-INLINE void b_not(
-    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
-    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) {
-  ret.V = ~op.V;
-}
-
-template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
-          int _AP_N>
-INLINE void b_and(
-    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
-    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1,
-    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) {
-  ret.V = op1.V & op2.V;
-}
-
-template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
-          int _AP_N>
-INLINE void b_or(
-    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
-    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1,
-    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) {
-  ret.V = op1.V | op2.V;
-}
-
-template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
-          int _AP_N>
-INLINE void b_xor(
-    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
-    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1,
-    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) {
-  ret.V = op1.V ^ op2.V;
-}
-
-template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
-          int _AP_N, int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
-          ap_o_mode _AP_O2, int _AP_N2>
-INLINE void neg(
-    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
-    const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) {
-  ap_fixed_base<_AP_W2 + !_AP_S2, _AP_I2 + !_AP_S2, true, _AP_Q2, _AP_O2,
-                _AP_N2>
-      t;
-  t.V = -op.V;
-  ret = t;
-}
-
-template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
-          int _AP_N, int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
-          ap_o_mode _AP_O2, int _AP_N2>
-INLINE void lshift(
-    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
-    const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op,
-    int i) {
-  enum {
-    F2 = _AP_W2 - _AP_I2,
-    _AP_I3 = AP_MAX(_AP_I, _AP_I2),
-    _AP_W3 = _AP_I3 + F2,
-  };
-  // wide buffer
-  ap_fixed_base<_AP_W3, _AP_I3, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t;
-  t.V = op.V;
-  t.V <<= i; // FIXME overflow?
-  // handle quantization and overflow
-  ret = t;
-}
-
-template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
-          int _AP_N, int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
-          ap_o_mode _AP_O2, int _AP_N2>
-INLINE void rshift(
-    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
-    const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op,
-    int i) {
-  enum {
-    F = _AP_W - _AP_I,
-    F2 = _AP_W2 - _AP_I2,
-    F3 = AP_MAX(F, F2),
-    _AP_W3 = _AP_I2 + F3,
-    sh = F - F2,
-  };
-  // wide buffer
-  ap_fixed_base<_AP_W3, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t;
-  t.V = op.V;
-  if (sh >= 0)
-    t.V <<= (int) sh;
-  t.V >>= i;
-  // handle quantization and overflow
-  ret = t;
-}
-
-//// FIXME
-//// These partial specialization ctors allow code like
-////   char c = 'a';
-////   ap_fixed_base<8, 8, true> x(c);
-//// but what bout ap_fixed_base<9, 9, true> y(c) ?
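The free helpers above write their result into a caller-supplied object, so the destination format may differ from the operands'; user code normally reaches them only through the overloaded operators. A sketch of direct use, under the same ap_fixed.h assumption as above (ap_fixed derives from ap_fixed_base, so the references bind to the base subobject):

    #include <ap_fixed.h>

    void helper_demo() {
      ap_fixed<8, 4> x = 2.5, y = -1.25, r;
      b_and(r, x, y);   // raw bit-pattern AND: r.V = x.V & y.V
      ap_fixed<16, 8> w;
      lshift(w, x, 3);  // x * 2^3, computed in a widened buffer, then assigned
      rshift(w, x, 3);  // x * 2^-3; quantization is handled by the assignment
    }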
-// - -#ifndef __SYNTHESIS__ -INLINE std::string scientificFormat(std::string& input) { - if (input.length() == 0) return input; - - size_t decPosition = input.find('.'); - if (decPosition == std::string::npos) decPosition = input.length(); - - size_t firstNonZeroPos = 0; - for (; input[firstNonZeroPos] > '9' || input[firstNonZeroPos] < '1'; - firstNonZeroPos++) - ; - - int exp; - if (firstNonZeroPos > decPosition) - exp = decPosition - firstNonZeroPos; - else - exp = decPosition - firstNonZeroPos - 1; - std::string expString = ""; - if (exp == 0) - ; - else if (exp < 0) { - expString += "e-"; - exp = -exp; - } else - expString += "e+"; - - if (exp < 10 && exp > 0) { - expString += '0'; - expString += (char)('0' + exp); - } else if (exp != 0) { - std::string tmp; - - std::ostringstream oss; - oss << exp; - - tmp = oss.str(); - expString += tmp; - } - - int lastNonZeroPos = (int)(input.length() - 1); - for (; lastNonZeroPos >= 0; --lastNonZeroPos) - if (input[lastNonZeroPos] <= '9' && input[lastNonZeroPos] > '0') break; - - std::string ans = ""; - ans += input[firstNonZeroPos]; - if (firstNonZeroPos != (size_t)lastNonZeroPos) { - ans += '.'; - for (int i = firstNonZeroPos + 1; i <= lastNonZeroPos; i++) - if (input[i] != '.') ans += input[i]; - } - - ans += expString; - return ans; -} - -INLINE std::string reduceToPrecision(std::string& input, int precision) { - bool isZero = true; - size_t inputLen = input.length(); - for (size_t i = 0; i < inputLen && isZero; i++) - if (input[i] != '.' && input[i] != '0') isZero = false; - if (isZero) return "0"; - - // Find the first valid number, skip '-' - int FirstNonZeroPos = 0; - int LastNonZeroPos = (int)inputLen - 1; - int truncBitPosition = 0; - size_t decPosition = input.find('.'); - for (; input[FirstNonZeroPos] < '1' || input[FirstNonZeroPos] > '9'; - FirstNonZeroPos++) - ; - - for (; input[LastNonZeroPos] < '1' || input[LastNonZeroPos] > '9'; - LastNonZeroPos--) - ; - - if (decPosition == std::string::npos) decPosition = inputLen; - // Count the valid number, to decide whether we need to truncate - if ((int)decPosition > LastNonZeroPos) { - if (LastNonZeroPos - FirstNonZeroPos + 1 <= precision) return input; - truncBitPosition = FirstNonZeroPos + precision; - } else if ((int)decPosition < FirstNonZeroPos) { // This is pure decimal - if (LastNonZeroPos - FirstNonZeroPos + 1 <= precision) { - if (FirstNonZeroPos - decPosition - 1 < 4) { - return input; - } else { - if (input[0] == '-') { - std::string tmp = input.substr(1, inputLen - 1); - return std::string("-") + scientificFormat(tmp); - } else - return scientificFormat(input); - } - } - truncBitPosition = FirstNonZeroPos + precision; - } else { - if (LastNonZeroPos - FirstNonZeroPos <= precision) return input; - truncBitPosition = FirstNonZeroPos + precision + 1; - } - - // duplicate the input string, we want to add "0" before the valid numbers - // This is easy for quantization, since we may change 9999 to 10000 - std::string ans = ""; - std::string dupInput = "0"; - if (input[0] == '-') { - ans += '-'; - dupInput += input.substr(1, inputLen - 1); - } else { - dupInput += input.substr(0, inputLen); - ++truncBitPosition; - } - - // Add 'carry' after truncation, if necessary - bool carry = dupInput[truncBitPosition] > '4'; - for (int i = truncBitPosition - 1; i >= 0 && carry; i--) { - if (dupInput[i] == '.') continue; - if (dupInput[i] == '9') - dupInput[i] = '0'; - else { - ++dupInput[i]; - carry = false; - } - } - - // bits outside precision range should be set to 0 - if (dupInput[0] == '1') 
- FirstNonZeroPos = 0; - else { - FirstNonZeroPos = 0; - while (dupInput[FirstNonZeroPos] < '1' || dupInput[FirstNonZeroPos] > '9') - ++FirstNonZeroPos; - } - - unsigned it = FirstNonZeroPos; - int NValidNumber = 0; - while (it < dupInput.length()) { - if (dupInput[it] == '.') { - ++it; - continue; - } - ++NValidNumber; - if (NValidNumber > precision) dupInput[it] = '0'; - ++it; - } - - // Here we wanted to adjust the truncate position and the value - decPosition = dupInput.find('.'); - if (decPosition == std::string::npos) // When this is integer - truncBitPosition = (int)dupInput.length(); - else - for (truncBitPosition = (int)(dupInput.length() - 1); truncBitPosition >= 0; - --truncBitPosition) { - if (dupInput[truncBitPosition] == '.') break; - if (dupInput[truncBitPosition] != '0') { - truncBitPosition++; - break; - } - } - - if (dupInput[0] == '1') - dupInput = dupInput.substr(0, truncBitPosition); - else - dupInput = dupInput.substr(1, truncBitPosition - 1); - - decPosition = dupInput.find('.'); - if (decPosition != std::string::npos) { - size_t it = 0; - for (it = decPosition + 1; dupInput[it] == '0'; it++) - ; - if (it - decPosition - 1 < 4) { - ans += dupInput; - return ans; - } else { - ans += scientificFormat(dupInput); - return ans; - } - } else if ((int)(dupInput.length()) <= precision) { - ans += dupInput; - return ans; - } - - ans += scientificFormat(dupInput); - return ans; -} - -template -INLINE void print( - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { - if (_AP_I > 0) { - ap_int_base<_AP_I, _AP_S> p1; - p1.V = x.V >> (_AP_W - _AP_I); - print(p1.V); // print overlaod for .V should exit - } else { - printf("0"); - } - printf("."); - if (_AP_I < _AP_W) { - ap_int_base<_AP_W - _AP_I, false> p2; - p2.V = _AP_ROOT_op_get_range(x.V, 0, _AP_W - _AP_I); - print(p2.V, false); // print overlaod for .V should exit - } -} -#endif // ifndef __SYNTHESIS__ - -// XXX the following two functions have to exist in synthesis, -// as some old HLS Video Library code uses the ostream overload, -// although HLS will later delete I/O function call. 
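Tying print() and reduceToPrecision() together: the stream inserter defined just below converts through to_string(10) and then trims the decimal string with reduceToPrecision(), so std::setprecision() bounds the number of significant digits (switching to scientific form when many leading zeros follow the point). A sketch under the same ap_fixed.h assumption; the expected output is inferred from the trimming logic, not captured from a run:

    #include <ap_fixed.h>
    #include <iomanip>
    #include <iostream>

    int main() {
      ap_fixed<32, 8> v = 3.0625;                     // exactly representable (49/16)
      std::cout << std::setprecision(3) << v << "\n"; // prints "3.06"
      return 0;
    }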
- -/// Output streaming -//----------------------------------------------------------------------------- -// XXX apcc cannot handle global std::ios_base::Init() brought in by -#ifndef AP_AUTOCC -#ifndef __SYNTHESIS__ -template -INLINE std::ostream& operator<<( - std::ostream& out, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { - // TODO support std::ios_base::fmtflags - unsigned width = out.width(); - unsigned precision = out.precision(); - char fill = out.fill(); - std::string str = x.to_string(10, _AP_S); - str = reduceToPrecision(str, precision); - if (width > str.length()) { - for (unsigned i = 0; i < width - str.length(); ++i) - out << fill; - } - out << str; - return out; -} -#endif // ifndef __SYNTHESIS__ - -/// Input streaming -// ----------------------------------------------------------------------------- -#ifndef __SYNTHESIS__ -template -INLINE std::istream& operator>>( - std::istream& in, - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { - double d; - in >> d; - x = ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(d); - return in; -} -#endif -#endif // ifndef AP_AUTOCC - -/// Operators mixing Integers with ap_fixed_base -// ----------------------------------------------------------------------------- -#define AF_BIN_OP_WITH_INT_SF(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ - template \ - INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ - _AP_W2, _AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP( \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE i_op) { \ - return op.operator BIN_OP(ap_int_base<_AP_W2, _AP_S2>(i_op)); \ - } - -#define AF_BIN_OP_WITH_INT(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ - template \ - INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ - _AP_W2, _AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP( \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE i_op) { \ - return op.operator BIN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ - } \ - template \ - INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ - _AP_W2, _AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP( \ - C_TYPE i_op, \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator BIN_OP(op); \ - } - -#define AF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE bool operator REL_OP( \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE i_op) { \ - return op.operator REL_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ - } \ - template \ - INLINE bool operator REL_OP( \ - C_TYPE i_op, \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator REL_OP(op); \ - } - -#define AF_ASSIGN_OP_WITH_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ - operator ASSIGN_OP( \ - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE i_op) { \ - return op.operator ASSIGN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ - } - -#define AF_ASSIGN_OP_WITH_INT_SF(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ - operator ASSIGN_OP( \ - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE i_op) { \ - return op.operator ASSIGN_OP(ap_int_base<_AP_W2, _AP_S2>(i_op)); \ - } - -#define 
ALL_AF_OP_WITH_INT(C_TYPE, BITS, SIGN) \ - AF_BIN_OP_WITH_INT(+, C_TYPE, (BITS), (SIGN), plus) \ - AF_BIN_OP_WITH_INT(-, C_TYPE, (BITS), (SIGN), minus) \ - AF_BIN_OP_WITH_INT(*, C_TYPE, (BITS), (SIGN), mult) \ - AF_BIN_OP_WITH_INT(/, C_TYPE, (BITS), (SIGN), div) \ - AF_BIN_OP_WITH_INT(&, C_TYPE, (BITS), (SIGN), logic) \ - AF_BIN_OP_WITH_INT(|, C_TYPE, (BITS), (SIGN), logic) \ - AF_BIN_OP_WITH_INT(^, C_TYPE, (BITS), (SIGN), logic) \ - AF_BIN_OP_WITH_INT_SF(>>, C_TYPE, (BITS), (SIGN), lhs) \ - AF_BIN_OP_WITH_INT_SF(<<, C_TYPE, (BITS), (SIGN), lhs) \ - \ - AF_ASSIGN_OP_WITH_INT(+=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT(-=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT(*=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT(/=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT(&=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT(|=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT(^=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT_SF(>>=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT_SF(<<=, C_TYPE, (BITS), (SIGN)) \ - \ - AF_REL_OP_WITH_INT(>, C_TYPE, (BITS), (SIGN)) \ - AF_REL_OP_WITH_INT(<, C_TYPE, (BITS), (SIGN)) \ - AF_REL_OP_WITH_INT(>=, C_TYPE, (BITS), (SIGN)) \ - AF_REL_OP_WITH_INT(<=, C_TYPE, (BITS), (SIGN)) \ - AF_REL_OP_WITH_INT(==, C_TYPE, (BITS), (SIGN)) \ - AF_REL_OP_WITH_INT(!=, C_TYPE, (BITS), (SIGN)) - -ALL_AF_OP_WITH_INT(bool, 1, false) -ALL_AF_OP_WITH_INT(char, 8, CHAR_IS_SIGNED) -ALL_AF_OP_WITH_INT(signed char, 8, true) -ALL_AF_OP_WITH_INT(unsigned char, 8, false) -ALL_AF_OP_WITH_INT(short, _AP_SIZE_short, true) -ALL_AF_OP_WITH_INT(unsigned short, _AP_SIZE_short, false) -ALL_AF_OP_WITH_INT(int, _AP_SIZE_int, true) -ALL_AF_OP_WITH_INT(unsigned int, _AP_SIZE_int, false) -ALL_AF_OP_WITH_INT(long, _AP_SIZE_long, true) -ALL_AF_OP_WITH_INT(unsigned long, _AP_SIZE_long, false) -ALL_AF_OP_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) -ALL_AF_OP_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef ALL_AF_OP_WITH_INT -#undef AF_BIN_OP_WITH_INT -#undef AF_BIN_OP_WITH_INT_SF -#undef AF_ASSIGN_OP_WITH_INT -#undef AF_ASSIGN_OP_WITH_INT_SF -#undef AF_REL_OP_WITH_INT - -/* - * ********************************************************************** - * TODO - * There is no operator defined with float/double/long double, so that - * code like - * ap_fixed<8,4> a = 1.5f; - * a += 0.5f; - * will fail in compilation. - * Operator with warning about conversion might be wanted. 
- * ********************************************************************** - */ - -#define AF_BIN_OP_WITH_AP_INT(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< \ - _AP_W, _AP_I, _AP_S>::RTYPE \ - operator BIN_OP( \ - const ap_int_base<_AP_W2, _AP_S2>& i_op, \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator BIN_OP(op); \ - } \ - \ - template \ - INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ - _AP_W2, _AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP( \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ - return op.operator BIN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ - } - -#define AF_REL_OP_WITH_AP_INT(REL_OP) \ - template \ - INLINE bool operator REL_OP( \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ - return op.operator REL_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ - } \ - \ - template \ - INLINE bool operator REL_OP( \ - const ap_int_base<_AP_W2, _AP_S2>& i_op, \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator REL_OP(op); \ - } - -#define AF_ASSIGN_OP_WITH_AP_INT(ASSIGN_OP) \ - template \ - INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ - operator ASSIGN_OP( \ - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ - return op.operator ASSIGN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ - } \ - \ - template \ - INLINE ap_int_base<_AP_W2, _AP_S2>& operator ASSIGN_OP( \ - ap_int_base<_AP_W2, _AP_S2>& i_op, \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return i_op.operator ASSIGN_OP(op.to_ap_int_base()); \ - } - -AF_BIN_OP_WITH_AP_INT(+, plus) -AF_BIN_OP_WITH_AP_INT(-, minus) -AF_BIN_OP_WITH_AP_INT(*, mult) -AF_BIN_OP_WITH_AP_INT(/, div) -AF_BIN_OP_WITH_AP_INT(&, logic) -AF_BIN_OP_WITH_AP_INT(|, logic) -AF_BIN_OP_WITH_AP_INT(^, logic) - -#undef AF_BIN_OP_WITH_AP_INT - -AF_ASSIGN_OP_WITH_AP_INT(+=) -AF_ASSIGN_OP_WITH_AP_INT(-=) -AF_ASSIGN_OP_WITH_AP_INT(*=) -AF_ASSIGN_OP_WITH_AP_INT(/=) -AF_ASSIGN_OP_WITH_AP_INT(&=) -AF_ASSIGN_OP_WITH_AP_INT(|=) -AF_ASSIGN_OP_WITH_AP_INT(^=) - -#undef AF_ASSIGN_OP_WITH_AP_INT - -AF_REL_OP_WITH_AP_INT(==) -AF_REL_OP_WITH_AP_INT(!=) -AF_REL_OP_WITH_AP_INT(>) -AF_REL_OP_WITH_AP_INT(>=) -AF_REL_OP_WITH_AP_INT(<) -AF_REL_OP_WITH_AP_INT(<=) - -#undef AF_REL_OP_WITH_AP_INT - -// Relational Operators with double -template -INLINE bool operator==( - double op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - return op2.operator==(op1); -} - -template -INLINE bool operator!=( - double op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - return op2.operator!=(op1); -} - -template -INLINE bool operator>( - double op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - return op2.operator<(op1); -} - -template -INLINE bool operator>=( - double op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - return op2.operator<=(op1); -} - -template -INLINE bool operator<( - double op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - return op2.operator>(op1); -} - -template -INLINE bool operator<=( - double op1, - const ap_fixed_base<_AP_W, _AP_I, 
_AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - return op2.operator>=(op1); -} - -#endif // ifndef __cplusplus else - -#endif // ifndef __AP_FIXED_BASE_H__ else - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_BASE_H__ +#define __AP_FIXED_BASE_H__ + +#ifndef __AP_FIXED_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +// for ap_int_base and its reference types. +#include +#ifndef __SYNTHESIS__ +#if _AP_ENABLE_HALF_ == 1 +// for half type +#include +#endif +// for std io +#include +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" +#else // __cplusplus + +// for warning on unsupported rounding mode in conversion to float/double. +#if !defined(__SYNTHESIS__) && __cplusplus >= 201103L && \ + (defined(__gnu_linux__) || defined(_WIN32)) +#define AP_FIXED_ENABLE_CPP_FENV 1 +#include +#endif + +// ---------------------------------------------------------------------- + +/* Major TODO + long double support: constructor, assign and other operators. + binary operators with ap_fixed_base and const char*. + return ap_fixed/ap_ufixed when result signedness is known. +*/ + +// Helper function in conversion to floating point types. + +#ifdef __SYNTHESIS__ +#define _AP_ctype_op_get_bit(var, index) _AP_ROOT_op_get_bit(var, index) +#define _AP_ctype_op_set_bit(var, index, x) _AP_ROOT_op_set_bit(var, index, x) +#define _AP_ctype_op_get_range(var, low, high) \ + _AP_ROOT_op_get_range(var, low, high) +#define _AP_ctype_op_set_range(var, low, high, x) \ + _AP_ROOT_op_set_range(var, low, high, x) +#else // ifdef __SYNTHESIS__ +template +inline bool _AP_ctype_op_get_bit(_Tp1& var, const _Tp2& index) { + return !!(var & (1ull << (index))); +} +template +inline _Tp1 _AP_ctype_op_set_bit(_Tp1& var, const _Tp2& index, const _Tp3& x) { + var |= (((x) ? 1ull : 0ull) << (index)); + return var; +} +template +inline _Tp1 _AP_ctype_op_get_range(_Tp1& var, const _Tp2& low, + const _Tp3& high) { + _Tp1 r = var; + ap_ulong mask = -1ll; + mask >>= (sizeof(_Tp1) * 8 - ((high) - (low) + 1)); + r >>= (low); + r &= mask; + return r; +} +template +inline _Tp1 _AP_ctype_op_set_range(_Tp1& var, const _Tp2& low, const _Tp3& high, + const _Tp4& x) { + ap_ulong mask = -1ll; + mask >>= (_AP_SIZE_ap_slong - ((high) - (low) + 1)); + var &= ~(mask << (low)); + var |= ((mask & x) << (low)); + return var; +} +#endif // ifdef __SYNTHESIS__ + + +// trait for letting base class to return derived class. +// Notice that derived class template is incomplete, and we cannot use +// the member of the derived class. +template +struct _ap_fixed_factory; +template +struct _ap_fixed_factory<_AP_W2, _AP_I2, true> { + typedef ap_fixed<_AP_W2, _AP_I2> type; +}; +template +struct _ap_fixed_factory<_AP_W2, _AP_I2, false> { + typedef ap_ufixed<_AP_W2, _AP_I2> type; +}; + +/// ap_fixed_base: AutoPilot fixed point. +/** partial specialization of signed. + @tparam _AP_W width. 
+ @tparam _AP_I integral part width. + @tparam _AP_S signed. + @tparam _AP_Q quantization mode. Default is AP_TRN. + @tparam _AP_O saturation mode. Default is AP_WRAP. + @tparam _AP_N saturation wrap value. Default is 0. + */ +// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h +template +struct ap_fixed_base : _AP_ROOT_TYPE<_AP_W, _AP_S> { + public: + typedef _AP_ROOT_TYPE<_AP_W, _AP_S> Base; + static const int width = _AP_W; + static const int iwidth = _AP_I; + static const ap_q_mode qmode = _AP_Q; + static const ap_o_mode omode = _AP_O; + + /// Return type trait. + template + struct RType { + enum { + _AP_F = _AP_W - _AP_I, + F2 = _AP_W2 - _AP_I2, + mult_w = _AP_W + _AP_W2, + mult_i = _AP_I + _AP_I2, + mult_s = _AP_S || _AP_S2, + plus_w = AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + + 1 + AP_MAX(_AP_F, F2), + plus_i = + AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1, + plus_s = _AP_S || _AP_S2, + minus_w = + AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1 + + AP_MAX(_AP_F, F2), + minus_i = + AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1, + minus_s = true, +#ifndef __SC_COMPATIBLE__ + div_w = _AP_S2 + _AP_W + AP_MAX(F2, 0), +#else + div_w = _AP_S2 + _AP_W + AP_MAX(F2, 0) + AP_MAX(_AP_I2, 0), +#endif + div_i = _AP_S2 + _AP_I + F2, + div_s = _AP_S || _AP_S2, + logic_w = + AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + + AP_MAX(_AP_F, F2), + logic_i = AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)), + logic_s = _AP_S || _AP_S2 + }; + + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> lhs; + typedef ap_fixed_base<_AP_W2, _AP_I2, _AP_S2> rhs; + + typedef ap_fixed_base mult_base; + typedef ap_fixed_base plus_base; + typedef ap_fixed_base minus_base; + typedef ap_fixed_base logic_base; + typedef ap_fixed_base div_base; + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> arg1_base; + + typedef typename _ap_fixed_factory::type mult; + typedef typename _ap_fixed_factory::type plus; + typedef typename _ap_fixed_factory::type minus; + typedef typename _ap_fixed_factory::type logic; + typedef typename _ap_fixed_factory::type div; + typedef typename _ap_fixed_factory<_AP_W, _AP_I, _AP_S>::type arg1; + }; + + private: +#ifndef __SYNTHESIS__ + // This cannot handle hex float format string. + void fromString(const std::string& val, unsigned char radix) { + _AP_ERROR(!(radix == 2 || radix == 8 || radix == 10 || radix == 16), + "ap_fixed_base::fromString(%s, %d)", val.c_str(), radix); + + Base::V = 0; + int startPos = 0; + int endPos = val.length(); + int decPos = val.find("."); + if (decPos == -1) decPos = endPos; + + // handle sign + bool isNegative = false; + if (val[0] == '-') { + isNegative = true; + ++startPos; + } else if (val[0] == '+') + ++startPos; + + // If there are no integer bits, e.g.: + // .0000XXXX, then keep at least one bit. + // If the width is greater than the number of integer bits, e.g.: + // XXXX.XXXX, then we keep the integer bits + // if the number of integer bits is greater than the width, e.g.: + // XXX000 then we keep the integer bits. + // Always keep one bit. + ap_fixed_base + integer_bits = 0; + + // Figure out if we can shift instead of multiply + unsigned shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0); + + //std::cout << "\n\n" << val << "\n"; + //std::cout << startPos << " " << decPos << " " << endPos << "\n"; + + bool sticky_int = false; + + // Traverse the integer digits from the MSD, multiplying by radix as we go. 
+    for (int i = startPos; i < decPos; i++) {
+      // Get a digit
+      char cdigit = val[i];
+      if (cdigit == '\0') continue;
+      unsigned digit = ap_private_ops::decode_digit(cdigit, radix);
+
+      sticky_int |= integer_bits[AP_MAX(_AP_I, 4) + 4 - 1] |
+                    integer_bits[AP_MAX(_AP_I, 4) + 4 - 2] |
+                    integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] |
+                    integer_bits[AP_MAX(_AP_I, 4) + 4 - 4];
+      // Shift or multiply the value by the radix
+      if (shift)
+        integer_bits <<= shift;
+      else
+        integer_bits *= radix;
+
+      // Add in the digit we just interpreted
+      integer_bits += digit;
+      //std::cout << "idigit = " << digit << " " << integer_bits.to_string()
+      //          << " " << sticky_int << "\n";
+    }
+    integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] =
+        integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] | sticky_int;
+
+    ap_fixed_base fractional_bits = 0;
+    bool sticky = false;
+
+    // Traverse the fractional digits from the LSD, dividing by radix as we go.
+    for (int i = endPos - 1; i >= decPos + 1; i--) {
+      // Get a digit
+      char cdigit = val[i];
+      if (cdigit == '\0') continue;
+      unsigned digit = ap_private_ops::decode_digit(cdigit, radix);
+      // Add in the digit we just interpreted
+      fractional_bits += digit;
+
+      sticky |= fractional_bits[0] | fractional_bits[1] | fractional_bits[2] |
+                fractional_bits[3];
+      // Shift or divide the value by the radix
+      if (shift)
+        fractional_bits >>= shift;
+      else
+        fractional_bits /= radix;
+
+      //std::cout << "fdigit = " << digit << " " << fractional_bits.to_string()
+      //          << " " << sticky << "\n";
+    }
+
+    //std::cout << "Int =" << integer_bits.to_string() << " " <<
+    //             fractional_bits.to_string() << "\n";
+
+    fractional_bits[0] = fractional_bits[0] | sticky;
+
+    if (isNegative)
+      *this = -(integer_bits + fractional_bits);
+    else
+      *this = integer_bits + fractional_bits;
+
+    //std::cout << "end = " << this->to_string(16) << "\n";
+  }
+
+  /// report invalid construction of ap_fixed_base
+  INLINE void report() {
+    if (!_AP_S && _AP_O == AP_WRAP_SM) {
+      fprintf(stderr, "ap_ufixed<...> cannot support AP_WRAP_SM.\n");
+      exit(1);
+    }
+    if (_AP_W > MAX_MODE(AP_INT_MAX_W) * 1024) {
+      fprintf(stderr,
+              "[E] ap_%sfixed<%d, ...>: Bitwidth exceeds the "
+              "default max value %d. Please use macro "
+              "AP_INT_MAX_W to set a larger max value.\n",
+              _AP_S ? "" : "u", _AP_W, MAX_MODE(AP_INT_MAX_W) * 1024);
+      exit(1);
+    }
+  }
+#else
+  INLINE void report() {}
+#endif // ifdef __SYNTHESIS__
+
+  /// @name helper functions.
+ // @{ + INLINE void overflow_adjust(bool underflow, bool overflow, bool lD, + bool sign) { + if (!underflow && !overflow) return; + if (_AP_O == AP_WRAP) { + if (_AP_N == 0) return; + if (_AP_S) { + // signed AP_WRAP + // n_bits == 1 + Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, sign); + if (_AP_N > 1) { + // n_bits > 1 + ap_int_base<_AP_W, false> mask(-1); + if (sign) mask.V = 0; + Base::V = + _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 2, mask.V); + } + } else { + // unsigned AP_WRAP + ap_int_base<_AP_W, false> mask(-1); + Base::V = + _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 1, mask.V); + } + } else if (_AP_O == AP_SAT_ZERO) { + Base::V = 0; + } else if (_AP_O == AP_WRAP_SM && _AP_S) { + bool Ro = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + if (_AP_N == 0) { + if (lD != Ro) { + Base::V = ~Base::V; + Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, lD); + } + } else { + if (_AP_N == 1 && sign != Ro) { + Base::V = ~Base::V; + } else if (_AP_N > 1) { + bool lNo = _AP_ROOT_op_get_bit(Base::V, _AP_W - _AP_N); + if (lNo == sign) Base::V = ~Base::V; + ap_int_base<_AP_W, false> mask(-1); + if (sign) mask.V = 0; + Base::V = + _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 2, mask.V); + } + Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, sign); + } + } else { + if (_AP_S) { + if (overflow) { + Base::V = 1; + Base::V <<= _AP_W - 1; + Base::V = ~Base::V; + } else if (underflow) { + Base::V = 1; + Base::V <<= _AP_W - 1; + if (_AP_O == AP_SAT_SYM) Base::V |= 1; + } + } else { + if (overflow) + Base::V = ~(ap_int_base<_AP_W, false>(0).V); + else if (underflow) + Base::V = 0; + } + } + } + + INLINE bool quantization_adjust(bool qb, bool r, bool s) { + bool carry = (bool)_AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + if (_AP_Q == AP_TRN) return false; + if (_AP_Q == AP_RND_ZERO) + qb &= s || r; + else if (_AP_Q == AP_RND_MIN_INF) + qb &= r; + else if (_AP_Q == AP_RND_INF) + qb &= !s || r; + else if (_AP_Q == AP_RND_CONV) + qb &= _AP_ROOT_op_get_bit(Base::V, 0) || r; + else if (_AP_Q == AP_TRN_ZERO) + qb = s && (qb || r); + Base::V += qb; + return carry && (!(bool)_AP_ROOT_op_get_bit(Base::V, _AP_W - 1)); + } + // @} + + public: + /// @name constructors. + // @{ + /// default ctor. + INLINE ap_fixed_base() {} + + /// copy ctor. 
+ template + INLINE ap_fixed_base( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + operator=(op); + report(); + } + + template + INLINE ap_fixed_base( + const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + operator=(op); + report(); + } + + template + INLINE ap_fixed_base(const ap_int_base<_AP_W2, _AP_S2>& op) { + ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp; + tmp.V = op.V; + operator=(tmp); + report(); + } + + template + INLINE ap_fixed_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) { + ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp; + tmp.V = op.V; + operator=(tmp); + report(); + } + +#ifndef __SYNTHESIS__ +#ifndef NON_C99STRING + INLINE ap_fixed_base(const char* s, signed char rd = 0) { + unsigned char radix = rd; + std::string str = ap_private_ops::parseString(s, radix); // will guess rd, default 10 + _AP_ERROR(radix == 0, "ap_fixed_base(const char* \"%s\", %d), str=%s, radix = %d", + s, rd, str.c_str(), radix); // TODO remove this check + fromString(str, radix); + } +#else + INLINE ap_fixed_base(const char* s, signed char rd = 10) { + ap_int_base<_AP_W, _AP_S> t(s, rd); + Base::V = t.V; + } +#endif // ifndef NON_C99STRING +#else // ifndef __SYNTHESIS__ + // XXX _ssdm_string2bits only takes const string and const radix. + // It seems XFORM will do compile time processing of the string. + INLINE ap_fixed_base(const char* s) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_I, _AP_S, _AP_Q, + _AP_O, _AP_N, _AP_C99); + Base::V = t; + } + INLINE ap_fixed_base(const char* s, signed char rd) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_I, _AP_S, _AP_Q, + _AP_O, _AP_N, _AP_C99); + Base::V = t; + } +#endif // ifndef __SYNTHESIS__ else + + template + INLINE ap_fixed_base(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + *this = ((bool)op); + report(); + } + + template + INLINE ap_fixed_base(const ap_range_ref<_AP_W2, _AP_S2>& op) { + *this = (ap_int_base<_AP_W2, false>(op)); + report(); + } + + template + INLINE ap_fixed_base( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) { + *this = (ap_int_base<_AP_W2 + _AP_W3, false>(op)); + report(); + } + + template + INLINE ap_fixed_base( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + *this = (bool(op)); + report(); + } + + template + INLINE ap_fixed_base( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + *this = (ap_int_base<_AP_W2, false>(op)); + report(); + } + + // ctors from c types. + // make a temp ap_fixed_base first, and use ap_fixed_base.operator= +#define CTOR_FROM_INT(C_TYPE, _AP_W2, _AP_S2) \ + INLINE ap_fixed_base(const C_TYPE x) { \ + ap_fixed_base<(_AP_W2), (_AP_W2), (_AP_S2)> tmp; \ + tmp.V = x; \ + *this = tmp; \ + } + + CTOR_FROM_INT(bool, 1, false) + CTOR_FROM_INT(char, 8, CHAR_IS_SIGNED) + CTOR_FROM_INT(signed char, 8, true) + CTOR_FROM_INT(unsigned char, 8, false) + CTOR_FROM_INT(short, _AP_SIZE_short, true) + CTOR_FROM_INT(unsigned short, _AP_SIZE_short, false) + CTOR_FROM_INT(int, _AP_SIZE_int, true) + CTOR_FROM_INT(unsigned int, _AP_SIZE_int, false) + CTOR_FROM_INT(long, _AP_SIZE_long, true) + CTOR_FROM_INT(unsigned long, _AP_SIZE_long, false) + CTOR_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) + CTOR_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) +#undef CTOR_FROM_INT +/* + * TODO: + *Theere used to be several funtions which were AP_WEAK. 
+ *Now they're all INLINE expect ap_fixed_base(double d) + *Maybe we can use '#pragma HLS inline' instead of INLINE. + */ + AP_WEAK ap_fixed_base(double d) { + ap_int_base<64, false> ireg; + ireg.V = doubleToRawBits(d); + bool isneg = _AP_ROOT_op_get_bit(ireg.V, 63); + + ap_int_base exp; + ap_int_base exp_tmp; + exp_tmp.V = + _AP_ROOT_op_get_range(ireg.V, DOUBLE_MAN, DOUBLE_MAN + DOUBLE_EXP - 1); + exp = exp_tmp - DOUBLE_BIAS; + ap_int_base man; + man.V = _AP_ROOT_op_get_range(ireg.V, 0, DOUBLE_MAN - 1); + // do not support NaN + _AP_WARNING(exp == APFX_IEEE_DOUBLE_E_MAX + 1 && man.V != 0, + "assign NaN to fixed point value"); + man.V = _AP_ROOT_op_set_bit(man.V, DOUBLE_MAN, 1); + if (isneg) man = -man; + if ((ireg.V & 0x7fffffffffffffffLL) == 0) { + Base::V = 0; + } else { + int _AP_W2 = DOUBLE_MAN + 2, _AP_I2 = exp.V + 2, _AP_F = _AP_W - _AP_I, + F2 = _AP_W2 - _AP_I2; + bool _AP_S2 = true, + QUAN_INC = F2 > _AP_F && + !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); + bool carry = false; + // handle quantization + unsigned sh_amt = (F2 > _AP_F) ? F2 - _AP_F : _AP_F - F2; + if (F2 == _AP_F) + Base::V = man.V; + else if (F2 > _AP_F) { + if (sh_amt < DOUBLE_MAN + 2) + Base::V = man.V >> sh_amt; + else { + Base::V = isneg ? -1 : 0; + } + if ((_AP_Q != AP_TRN) && !((_AP_Q == AP_TRN_ZERO) && !_AP_S2)) { + bool qb = (F2 - _AP_F > _AP_W2) ? isneg : (bool)_AP_ROOT_op_get_bit( + man.V, F2 - _AP_F - 1); + bool r = + (F2 > _AP_F + 1) + ? _AP_ROOT_op_get_range(man.V, 0, (F2 - _AP_F - 2 < _AP_W2) + ? (F2 - _AP_F - 2) + : (_AP_W2 - 1)) != 0 + : false; + carry = quantization_adjust(qb, r, isneg); + } + } else { // no quantization + Base::V = man.V; + if (sh_amt < _AP_W) + Base::V = Base::V << sh_amt; + else + Base::V = 0; + } + // handle overflow/underflow + if ((_AP_O != AP_WRAP || _AP_N != 0) && + ((!_AP_S && _AP_S2) || + _AP_I - _AP_S < + _AP_I2 - _AP_S2 + + (QUAN_INC || + (_AP_S2 && (_AP_O == AP_SAT_SYM))))) { // saturation + bool deleted_zeros = _AP_S2 ? true : !carry, deleted_ones = true; + bool neg_src = isneg; + bool lD = false; + int pos1 = F2 - _AP_F + _AP_W; + int pos2 = F2 - _AP_F + _AP_W + 1; + bool newsignbit = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + if (pos1 < _AP_W2 && pos1 >= 0) + // lD = _AP_ROOT_op_get_bit(man.V, pos1); + lD = (man.V >> pos1) & 1; + if (pos1 < _AP_W2) { + bool Range1_all_ones = true; + bool Range1_all_zeros = true; + bool Range2_all_ones = true; + ap_int_base Range2; + ap_int_base all_ones(-1); + + if (pos2 >= 0 && pos2 < _AP_W2) { + // Range2.V = _AP_ROOT_op_get_range(man.V, + // pos2, _AP_W2 - 1); + Range2.V = man.V; + Range2.V >>= pos2; + Range2_all_ones = Range2 == (all_ones >> pos2); + } else if (pos2 < 0) + Range2_all_ones = false; + if (pos1 >= 0 && pos2 < _AP_W2) { + Range1_all_ones = Range2_all_ones && lD; + Range1_all_zeros = !Range2.V && !lD; + } else if (pos2 == _AP_W2) { + Range1_all_ones = lD; + Range1_all_zeros = !lD; + } else if (pos1 < 0) { + Range1_all_zeros = !man.V; + Range1_all_ones = false; + } + + deleted_zeros = + deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros); + deleted_ones = + carry ? Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones; + neg_src = isneg && !(carry && Range1_all_ones); + } else + neg_src = isneg && newsignbit; + bool neg_trg = _AP_S && newsignbit; + bool overflow = (neg_trg || !deleted_zeros) && !isneg; + bool underflow = (!neg_trg || !deleted_ones) && neg_src; + if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S) + underflow |= + neg_src && + (_AP_W > 1 ? 
_AP_ROOT_op_get_range(Base::V, 0, _AP_W - 2) == 0 + : true); + overflow_adjust(underflow, overflow, lD, neg_src); + } + } + report(); + } + + // TODO more optimized implementation. + INLINE ap_fixed_base(float d) { *this = ap_fixed_base(double(d)); } + +#if _AP_ENABLE_HALF_ == 1 + // TODO more optimized implementation. + INLINE ap_fixed_base(half d) { *this = ap_fixed_base(double(d)); } +#endif + // @} + + /// @name assign operator + /// assign, using another ap_fixed_base of same template parameters. + /* + INLINE ap_fixed_base& operator=( + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + */ + + template + INLINE ap_fixed_base& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + + const int _AP_F = _AP_W - _AP_I; + const int F2 = _AP_W2 - _AP_I2; + const int QUAN_INC = + F2 > _AP_F && !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); + + if (!op) Base::V = 0; + bool carry = false; + bool signbit = _AP_ROOT_op_get_bit(op.V, _AP_W2 - 1); + bool isneg = signbit && _AP_S2; + if (F2 == _AP_F) + Base::V = op.V; + else if (F2 > _AP_F) { + unsigned int sh_amt = F2 - _AP_F; + // moves bits right, handle quantization. + if (sh_amt < _AP_W2) { + Base::V = op.V >> sh_amt; + } else { + Base::V = isneg ? -1 : 0; + } + if (_AP_Q != AP_TRN && !(_AP_Q == AP_TRN_ZERO && !_AP_S2)) { + bool qbit = _AP_ROOT_op_get_bit(op.V, F2 - _AP_F - 1); + // bit after LSB. + bool qb = (F2 - _AP_F > _AP_W2) ? _AP_S2 && signbit : qbit; + enum { hi = ((F2 - _AP_F - 2) < _AP_W2) ? (F2 - _AP_F - 2) : (_AP_W2 - 1) }; + // bits after qb. + bool r = (F2 > _AP_F + 1) ? (_AP_ROOT_op_get_range(op.V, 0, hi) != 0) : false; + carry = quantization_adjust(qb, r, isneg); + } + } else { + unsigned sh_amt = _AP_F - F2; + // moves bits left, no quantization + if (sh_amt < _AP_W) { + if (_AP_W > _AP_W2) { + // extend and then shift, avoid losing bits. + Base::V = op.V; + Base::V <<= sh_amt; + } else { + // shift and truncate. + Base::V = op.V << sh_amt; + } + } else { + Base::V = 0; + } + } + // handle overflow/underflow + if ((_AP_O != AP_WRAP || _AP_N != 0) && + ((!_AP_S && _AP_S2) || + _AP_I - _AP_S < + _AP_I2 - _AP_S2 + + (QUAN_INC || (_AP_S2 && _AP_O == AP_SAT_SYM)))) { // saturation + bool deleted_zeros = _AP_S2 ? true : !carry; + bool deleted_ones = true; + bool neg_src = isneg; + bool newsignbit = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + enum { pos1 = F2 - _AP_F + _AP_W, pos2 = F2 - _AP_F + _AP_W + 1 }; + bool lD = (pos1 < _AP_W2 && pos1 >= 0) ? _AP_ROOT_op_get_bit(op.V, pos1) + : false; + if (pos1 < _AP_W2) { + bool Range1_all_ones = true; + bool Range1_all_zeros = true; + bool Range2_all_ones = true; + ap_int_base<_AP_W2, false> all_ones(-1); + + if (pos2 < _AP_W2 && pos2 >= 0) { + ap_int_base<_AP_W2, false> Range2; + Range2.V = _AP_ROOT_op_get_range(op.V, pos2, _AP_W2 - 1); + Range2_all_ones = Range2 == (all_ones >> pos2); + } else if (pos2 < 0) { + Range2_all_ones = false; + } + + if (pos1 >= 0 && pos2 < _AP_W2) { + ap_int_base<_AP_W2, false> Range1; + Range1.V = _AP_ROOT_op_get_range(op.V, pos1, _AP_W2 - 1); + Range1_all_ones = Range1 == (all_ones >> pos1); + Range1_all_zeros = !Range1.V; + } else if (pos2 == _AP_W2) { + Range1_all_ones = lD; + Range1_all_zeros = !lD; + } else if (pos1 < 0) { + Range1_all_zeros = !op.V; + Range1_all_ones = false; + } + + deleted_zeros = + deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros); + deleted_ones = + carry ? 
Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones; + neg_src = isneg && !(carry && Range1_all_ones); + } else + neg_src = isneg && newsignbit; + bool neg_trg = _AP_S && newsignbit; + bool overflow = (neg_trg || !deleted_zeros) && !isneg; + bool underflow = (!neg_trg || !deleted_ones) && neg_src; + if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S) + underflow |= + neg_src && + (_AP_W > 1 ? _AP_ROOT_op_get_range(Base::V, 0, _AP_W - 2) == 0 + : true); + + overflow_adjust(underflow, overflow, lD, neg_src); + } + return *this; + } // operator= + + template + INLINE ap_fixed_base& operator=( + const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + operator=(const_cast&>(op)); + return *this; + } + + /// Set this ap_fixed_base with ULL. + INLINE ap_fixed_base& setBits(ap_ulong bv) { + // TODO when ull is not be long enough... + Base::V = bv; + return *this; + } + + /// Return a ap_fixed_base object whose this->V is assigned by bv. + static INLINE ap_fixed_base bitsToFixed(ap_ulong bv) { + // TODO fix when ull is not be long enough... + ap_fixed_base t; +#ifdef __SYNTHESIS__ + t.V = bv; +#else + t.V.set_bits(bv); +#endif + return t; + } + + // Explicit conversion functions to ap_int_base. + /** Captures all integer bits, in truncate mode. + * @param[in] Cnative follow conversion from double to int. + */ + INLINE ap_int_base to_ap_int_base( + bool Cnative = true) const { + ap_int_base ret; + if (_AP_I == 0) { + ret.V = 0; + } else if (_AP_I > 0 && _AP_I <= _AP_W) { + ret.V = _AP_ROOT_op_get_range(Base::V, _AP_W - _AP_I, _AP_W - 1); + } else if (_AP_I > _AP_W) { + ret.V = _AP_ROOT_op_get_range(Base::V, 0, _AP_W - 1); + ret.V <<= (_AP_I - _AP_W); + } + /* Consider the following case + * float f = -7.5f; + * ap_fixed<8,4> t = f; // -8 0 0 0 . 0.5 + * int i = t.to_int(); + * the result should be -7 instead of -8. + * Therefore, after truncation, the value should be increated by 1. + * For (-1, 0), carry to MSB will happen, but result 0 is still correct. + */ + if (Cnative && _AP_I < _AP_W) { + // Follow C native data type, conversion from double to int + if (_AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1) && (_AP_I < _AP_W) && + (_AP_ROOT_op_get_range( + Base::V, 0, _AP_I < 0 ? _AP_W - 1 : _AP_W - _AP_I - 1) != 0)) + ++ret; + } else { + // Follow OSCI library, conversion from sc_fixed to sc_int + } + return ret; + }; + + public: + template + INLINE operator ap_int_base<_AP_W2, _AP_S2>() const { + return ap_int_base<_AP_W2, _AP_S2>(to_ap_int_base()); + } + + // Explicit conversion function to C built-in integral type. + INLINE char to_char() const { return to_ap_int_base().to_char(); } + + INLINE int to_int() const { return to_ap_int_base().to_int(); } + + INLINE unsigned to_uint() const { return to_ap_int_base().to_uint(); } + + INLINE ap_slong to_int64() const { return to_ap_int_base().to_int64(); } + + INLINE ap_ulong to_uint64() const { return to_ap_int_base().to_uint64(); } + + /// covert function to double. + /** only round-half-to-even mode supported, does not obey FE env. */ + INLINE double to_double() const { +#if defined(AP_FIXED_ENABLE_CPP_FENV) + _AP_WARNING(std::fegetround() != FE_TONEAREST, + "Only FE_TONEAREST is supported"); +#endif + enum { BITS = DOUBLE_MAN + DOUBLE_EXP + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; // may truncate one bit extra from neg in sim. 
+ else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); ///< number of leading zeros. + int e = _AP_I - l - 1 + DOUBLE_BIAS; ///< exponent + int lsb_index = _AP_W - l - 1 - DOUBLE_MAN; + // more than 0.5? + bool a = (lsb_index >=2) ? + (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; + // round to even + a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; + // ull is at least 64-bit + ap_ulong m; + // may actually left shift, ensure buffer is wide enough. + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (ap_ulong)(tmp.V >> (lsb_index - 1)) + : (ap_ulong)(tmp.V << (1 - lsb_index)); + } else { + m = (ap_ulong)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + //std::cout << '\n' << std::hex << m << '\n'; // TODO delete this + // carry to MSB, increase exponent + if (_AP_ctype_op_get_bit(m, DOUBLE_MAN + 1)) { + e += 1; + } + // set sign and exponent + m = _AP_ctype_op_set_bit(m, BITS - 1, s); + //std::cout << m << '\n'; // TODO delete this + m = _AP_ctype_op_set_range(m, DOUBLE_MAN, DOUBLE_MAN + DOUBLE_EXP - 1, e); + //std::cout << std::hex << m << std::dec << std::endl; // TODO delete this + // cast to fp + return rawBitsToDouble(m); + } + + /// convert function to float. + /** only round-half-to-even mode supported, does not obey FE env. */ + INLINE float to_float() const { +#if defined(AP_FIXED_ENABLE_CPP_FENV) + _AP_WARNING(std::fegetround() != FE_TONEAREST, + "Only FE_TONEAREST is supported"); +#endif + enum { BITS = FLOAT_MAN + FLOAT_EXP + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; // may truncate one bit extra from neg in sim. + else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); ///< number of leading zeros. + int e = _AP_I - l - 1 + FLOAT_BIAS; ///< exponent + int lsb_index = _AP_W - l - 1 - FLOAT_MAN; + // more than 0.5? + bool a = (lsb_index >=2) ? + (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; + // round to even + a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; + // ul is at least 32-bit + unsigned long m; + // may actually left shift, ensure buffer is wide enough. + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (unsigned long)(tmp.V >> (lsb_index - 1)) + : (unsigned long)(tmp.V << (1 - lsb_index)); + } else { + m = (unsigned long)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + // carry to MSB, increase exponent + if (_AP_ctype_op_get_bit(m, FLOAT_MAN + 1)) { + e += 1; + } + // set sign and exponent + m = _AP_ctype_op_set_bit(m, BITS - 1, s); + m = _AP_ctype_op_set_range(m, FLOAT_MAN, FLOAT_MAN + FLOAT_EXP - 1, e); + // cast to fp + return rawBitsToFloat(m); + } + +#if _AP_ENABLE_HALF_ == 1 + /// convert function to half. + /** only round-half-to-even mode supported, does not obey FE env. */ + INLINE half to_half() const { +#if defined(AP_FIXED_ENABLE_CPP_FENV) + _AP_WARNING(std::fegetround() != FE_TONEAREST, + "Only FE_TONEAREST is supported"); +#endif + enum { BITS = HALF_MAN + HALF_EXP + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; // may truncate one bit extra from neg in sim. + else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); ///< number of leading zeros. 
+    int e = _AP_I - l - 1 + HALF_BIAS; ///< exponent
+    int lsb_index = _AP_W - l - 1 - HALF_MAN;
+    // more than 0.5?
+    bool a = (lsb_index >= 2) ?
+        (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0;
+    // round to even
+    a |= (lsb_index >= 0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0;
+    // short is at least 16-bit
+    unsigned short m;
+    // may actually left shift, ensure buffer is wide enough.
+    if (_AP_W > BITS) {
+      m = (lsb_index >= 1) ? (unsigned short)(tmp.V >> (lsb_index - 1))
+                           : (unsigned short)(tmp.V << (1 - lsb_index));
+    } else {
+      m = (unsigned short)tmp.V;
+      m = (lsb_index >= 1) ? (m >> (lsb_index - 1))
+                           : (m << (1 - lsb_index));
+    }
+    m += a;
+    m >>= 1;
+    // carry to MSB, increase exponent
+    if (_AP_ctype_op_get_bit(m, HALF_MAN + 1)) {
+      e += 1;
+    }
+    // set sign and exponent
+    m = _AP_ctype_op_set_bit(m, BITS - 1, s);
+    m = _AP_ctype_op_set_range(m, HALF_MAN, HALF_MAN + HALF_EXP - 1, e);
+    // cast to fp
+    return rawBitsToHalf(m);
+  }
+#endif
+
+  // FIXME inherited from old code, this may lose precision!
+  INLINE operator long double() const { return (long double)to_double(); }
+
+  INLINE operator double() const { return to_double(); }
+
+  INLINE operator float() const { return to_float(); }
+
+#if _AP_ENABLE_HALF_ == 1
+  INLINE operator half() const { return to_half(); }
+#endif
+
+  INLINE operator bool() const { return (bool)Base::V != 0; }
+
+  INLINE operator char() const { return (char)to_int(); }
+
+  INLINE operator signed char() const { return (signed char)to_int(); }
+
+  INLINE operator unsigned char() const { return (unsigned char)to_uint(); }
+
+  INLINE operator short() const { return (short)to_int(); }
+
+  INLINE operator unsigned short() const { return (unsigned short)to_uint(); }
+
+  INLINE operator int() const { return to_int(); }
+
+  INLINE operator unsigned int() const { return to_uint(); }
+
+// FIXME don't assume data width...
+#ifdef __x86_64__
+  INLINE operator long() const { return (long)to_int64(); }
+
+  INLINE operator unsigned long() const { return (unsigned long)to_uint64(); }
+#else
+  INLINE operator long() const { return (long)to_int(); }
+
+  INLINE operator unsigned long() const { return (unsigned long)to_uint(); }
+#endif // ifdef __x86_64__ else
+
+  INLINE operator ap_ulong() const { return to_uint64(); }
+
+  INLINE operator ap_slong() const { return to_int64(); }
+
+  INLINE int length() const { return _AP_W; };
+
+  // bits_to_int64 deleted.
+#ifndef __SYNTHESIS__
+  // Used in autowrap, when _AP_W < 64.
+  INLINE ap_ulong bits_to_uint64() const {
+    return (Base::V).to_uint64();
+  }
+#endif
+
+  // Count the number of zeros from the most significant bit
+  // to the first one bit. Note this is only for ap_fixed_base whose
+  // _AP_W <= 64; otherwise it will incur an assertion.
+  INLINE int countLeadingZeros() {
+#ifdef __SYNTHESIS__
+    // TODO: use llvm.ctlz intrinsic?
+    if (_AP_W <= 32) {
+      ap_int_base<32, false> t(-1ULL);
+      t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1);
+      return __builtin_ctz(t.V);
+    } else if (_AP_W <= 64) {
+      ap_int_base<64, false> t(-1ULL);
+      t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1);
+      return __builtin_ctzll(t.V);
+    } else {
+      enum { __N = (_AP_W + 63) / 64 };
+      int NZeros = 0;
+      int i = 0;
+      bool hitNonZero = false;
+      for (i = 0; i < __N - 1; ++i) {
+        ap_int_base<64, false> t;
+        t.range(0, 63) = this->range(_AP_W - i * 64 - 64, _AP_W - i * 64 - 1);
+        NZeros += hitNonZero ?
0 : __builtin_clzll(t.V); + hitNonZero |= (t != 0); + } + if (!hitNonZero) { + ap_int_base<64, false> t(-1ULL); + t.range(63 - (_AP_W - 1) % 64, 63) = this->range(0, (_AP_W - 1) % 64); + NZeros += __builtin_clzll(t.V); + } + return NZeros; + } +#else + return Base::V.countLeadingZeros(); +#endif + } + + // Arithmetic : Binary + // ------------------------------------------------------------------------- + template + INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::mult operator*( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) + const { + typename RType<_AP_W2, _AP_I2, _AP_S2>::mult_base r, t; + r.V = Base::V; + t.V = op2.V; + r.V *= op2.V; + return r; + } + + // multiply function deleted. + + template + INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::div operator/( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) + const { + typename RType<_AP_W2, _AP_I2, _AP_S2>::div_base r; +#ifndef __SYNTHESIS__ + enum {F2 = _AP_W2-_AP_I2, + _W1=AP_MAX(_AP_W + AP_MAX(F2, 0) + ((_AP_S2 && !_AP_S) ? 1 : 0), _AP_W2 + ((_AP_S && !_AP_S2) ? 1 : 0))}; + ap_int_base<_W1,_AP_S||_AP_S2> dividend,divisior; + ap_int_base<_W1,_AP_S> tmp1; + ap_int_base<_W1,_AP_S2> tmp2; + tmp1.V = Base::V; + tmp1.V <<= AP_MAX(F2,0); + tmp2.V = op2.V; + dividend = tmp1; + divisior = tmp2; + r.V = ((_AP_S||_AP_S2) ? dividend.V.sdiv(divisior.V): dividend.V.udiv(divisior.V)); +#else + #ifndef __SC_COMPATIBLE__ + ap_fixed_base<_AP_W + AP_MAX(_AP_W2 - _AP_I2, 0),_AP_I, _AP_S> t(*this); + #else + ap_fixed_base<_AP_W + AP_MAX(_AP_W2 - _AP_I2, 0) + AP_MAX(_AP_I2, 0),_AP_I, _AP_S> t(*this); + #endif + r.V = t.V / op2.V; +#endif +/* + enum { + F2 = _AP_W2 - _AP_I2, + shl = AP_MAX(F2, 0) + AP_MAX(_AP_I2, 0), +#ifndef __SC_COMPATIBLE__ + shr = AP_MAX(_AP_I2, 0), +#else + shr = 0, +#endif + W3 = _AP_S2 + _AP_W + shl, + S3 = _AP_S || _AP_S2, + }; + ap_int_base dividend, t; + dividend.V = Base::V; + // multiply both by (1 << F2), and than do integer division. + dividend.V <<= (int) shl; +#ifdef __SYNTHESIS__ + // .V's have right signedness, and will have right extending. + t.V = dividend.V / op2.V; +#else + // XXX op2 may be wider than dividend, and sdiv and udiv takes the same with + // as left hand operand, so data might be truncated by mistake if not + // handled here. + t.V = S3 ? dividend.V.sdiv(op2.V) : dividend.V.udiv(op2.V); +#endif + r.V = t.V >> (int) shr; +*/ + return r; + } + +#define OP_BIN_AF(Sym, Rty) \ + template \ + INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::Rty operator Sym( \ + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& \ + op2) const { \ + typename RType<_AP_W2, _AP_I2, _AP_S2>::Rty##_base ret, lhs(*this), \ + rhs(op2); \ + ret.V = lhs.V Sym rhs.V; \ + return ret; \ + } + + OP_BIN_AF(+, plus) + OP_BIN_AF(-, minus) + OP_BIN_AF(&, logic) + OP_BIN_AF(|, logic) + OP_BIN_AF(^, logic) + +// Arithmetic : assign +// ------------------------------------------------------------------------- +#define OP_ASSIGN_AF(Sym) \ + template \ + INLINE ap_fixed_base& operator Sym##=( \ + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& \ + op2) { \ + *this = operator Sym(op2); \ + return *this; \ + } + + OP_ASSIGN_AF(*) + OP_ASSIGN_AF(/) + OP_ASSIGN_AF(+) + OP_ASSIGN_AF(-) + OP_ASSIGN_AF(&) + OP_ASSIGN_AF(|) + OP_ASSIGN_AF(^) + + // Prefix and postfix increment and decrement. 
+  // -------------------------------------------------------------------------
+
+  /// Prefix increment.
+  INLINE ap_fixed_base& operator++() {
+    operator+=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1));
+    return *this;
+  }
+
+  /// Prefix decrement.
+  INLINE ap_fixed_base& operator--() {
+    operator-=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1));
+    return *this;
+  }
+
+  /// Postfix increment.
+  INLINE const ap_fixed_base operator++(int) {
+    ap_fixed_base r(*this);
+    operator++();
+    return r;
+  }
+
+  /// Postfix decrement.
+  INLINE const ap_fixed_base operator--(int) {
+    ap_fixed_base r(*this);
+    operator--();
+    return r;
+  }
+
+  // Unary arithmetic.
+  // -------------------------------------------------------------------------
+  INLINE ap_fixed_base operator+() { return *this; }
+
+  INLINE ap_fixed_base<_AP_W + 1, _AP_I + 1, true> operator-() const {
+    ap_fixed_base<_AP_W + 1, _AP_I + 1, true> r(*this);
+    r.V = -r.V;
+    return r;
+  }
+
+  INLINE ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> getNeg() {
+    ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> r(*this);
+    r.V = -r.V;
+    return r;
+  }
+
+  // Not (!)
+  // -------------------------------------------------------------------------
+  INLINE bool operator!() const { return Base::V == 0; }
+
+  // Bitwise complement
+  // -------------------------------------------------------------------------
+  // XXX different from Mentor's ac_fixed.
+  INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S> operator~() const {
+    ap_fixed_base<_AP_W, _AP_I, _AP_S> r;
+    r.V = ~Base::V;
+    return r;
+  }
+
+  // Shift
+  // -------------------------------------------------------------------------
+  // left shift is the same as moving the point right, i.e. increasing _AP_I.
+  template <int _AP_SHIFT>
+  INLINE ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> lshift() const {
+    ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> r;
+    r.V = Base::V;
+    return r;
+  }
+
+  template <int _AP_SHIFT>
+  INLINE ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> rshift() const {
+    ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> r;
+    r.V = Base::V;
+    return r;
+  }
+
+  // Because the return type is the type of the first operand, shift assign
+  // operators do not carry out any quantization or overflow handling.
+  // In SystemC, by contrast, shift assigns for sc_fixed/sc_ufixed will result
+  // in quantization or overflow (depending on the mode of the first operand).
+  INLINE ap_fixed_base operator<<(unsigned int sh) const {
+    ap_fixed_base r;
+    r.V = Base::V << sh;
+// TODO check shift overflow?
+#ifdef __SC_COMPATIBLE__
+    if (sh == 0) return r;
+    if (_AP_O != AP_WRAP || _AP_N != 0) {
+      bool neg_src = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1);
+      bool allones, allzeros;
+      ap_int_base<_AP_W, false> ones(-1);
+      if (sh <= _AP_W) {
+        ap_int_base<_AP_W, false> range1;
+        range1.V = _AP_ROOT_op_get_range(
+            const_cast<ap_fixed_base*>(this)->Base::V, _AP_W - sh, _AP_W - 1);
+        allones = range1 == (ones >> (_AP_W - sh));
+        allzeros = range1 == 0;
+      } else {
+        allones = false;
+        allzeros = Base::V == 0;
+      }
+      bool overflow = !allzeros && !neg_src;
+      bool underflow = !allones && neg_src;
+      if ((_AP_O == AP_SAT_SYM) && _AP_S)
+        underflow |=
+            neg_src &&
+            (_AP_W > 1 ? _AP_ROOT_op_get_range(r.V, 0, _AP_W - 2) == 0 : true);
+      bool lD = false;
+      if (sh < _AP_W) lD = _AP_ROOT_op_get_bit(Base::V, _AP_W - sh - 1);
+      r.overflow_adjust(underflow, overflow, lD, neg_src);
+    }
+#endif
+    return r;
+  }
+
+  INLINE ap_fixed_base operator>>(unsigned int sh) const {
+    ap_fixed_base r;
+    r.V = Base::V >> sh;
+// TODO check shift overflow?
+#ifdef __SC_COMPATIBLE__ + if (sh == 0) return r; + if (_AP_Q != AP_TRN) { + bool qb = false; + if (sh <= _AP_W) qb = _AP_ROOT_op_get_bit(Base::V, sh - 1); + bool rb = false; + if (sh > 1 && sh <= _AP_W) + rb = _AP_ROOT_op_get_range(const_cast(this)->Base::V, 0, + sh - 2) != 0; + else if (sh > _AP_W) + rb = Base::V != 0; + r.quantization_adjust(qb, rb, + _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)); + } +#endif + return r; + } + + // left and right shift for int + INLINE ap_fixed_base operator<<(int sh) const { + ap_fixed_base r; + bool isNeg = sh < 0; + unsigned int ush = isNeg ? -sh : sh; + if (isNeg) { + return operator>>(ush); + } else { + return operator<<(ush); + } + } + + INLINE ap_fixed_base operator>>(int sh) const { + bool isNeg = sh < 0; + unsigned int ush = isNeg ? -sh : sh; + if (isNeg) { + return operator<<(ush); + } else { + return operator>>(ush); + } + } + + // left and right shift for ap_int. + template + INLINE ap_fixed_base operator<<(const ap_int_base<_AP_W2, true>& op2) const { + // TODO the code seems not optimal. ap_fixed<8,8> << ap_int<2> needs only a + // small mux, but integer need a big one! + int sh = op2.to_int(); + return operator<<(sh); + } + + template + INLINE ap_fixed_base operator>>(const ap_int_base<_AP_W2, true>& op2) const { + int sh = op2.to_int(); + return operator>>(sh); + } + + // left and right shift for ap_uint. + template + INLINE ap_fixed_base operator<<(const ap_int_base<_AP_W2, false>& op2) const { + unsigned int sh = op2.to_uint(); + return operator<<(sh); + } + + template + INLINE ap_fixed_base operator>>(const ap_int_base<_AP_W2, false>& op2) const { + unsigned int sh = op2.to_uint(); + return operator>>(sh); + } + + // left and right shift for ap_fixed + template + INLINE ap_fixed_base operator<<( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + op2) { + return operator<<(op2.to_ap_int_base()); + } + + template + INLINE ap_fixed_base operator>>( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + op2) { + return operator>>(op2.to_ap_int_base()); + } + + // Shift assign. + // ------------------------------------------------------------------------- + + // left shift assign. + INLINE ap_fixed_base& operator<<=(const int sh) { + *this = operator<<(sh); + return *this; + } + + INLINE ap_fixed_base& operator<<=(const unsigned int sh) { + *this = operator<<(sh); + return *this; + } + + template + INLINE ap_fixed_base& operator<<=(const ap_int_base<_AP_W2, _AP_S2>& sh) { + *this = operator<<(sh.to_int()); + return *this; + } + + template + INLINE ap_fixed_base& operator<<=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + sh) { + *this = operator<<(sh.to_int()); + return *this; + } + + // right shift assign. + INLINE ap_fixed_base& operator>>=(const int sh) { + *this = operator>>(sh); + return *this; + } + + INLINE ap_fixed_base& operator>>=(const unsigned int sh) { + *this = operator>>(sh); + return *this; + } + + template + INLINE ap_fixed_base& operator>>=(const ap_int_base<_AP_W2, _AP_S2>& sh) { + *this = operator>>(sh.to_int()); + return *this; + } + + template + INLINE ap_fixed_base& operator>>=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + sh) { + *this = operator>>(sh.to_int()); + return *this; + } + +// Comparisons. 
+// -------------------------------------------------------------------------
+#define OP_CMP_AF(Sym)                                                       \
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,           \
+            ap_o_mode _AP_O2, int _AP_N2>                                    \
+  INLINE bool operator Sym(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, \
+                                               _AP_O2, _AP_N2>& op2) const { \
+    enum { _AP_F = _AP_W - _AP_I, F2 = _AP_W2 - _AP_I2 };                    \
+    if (_AP_F == F2)                                                         \
+      return Base::V Sym op2.V;                                              \
+    else if (_AP_F > F2)                                                     \
+      return Base::V Sym ap_fixed_base(op2).V;                               \
+    else                                                                     \
+      return ap_fixed_base(*this).V Sym op2.V;                               \
+    return false;                                                            \
+  }
+
+  OP_CMP_AF(>)
+  OP_CMP_AF(<)
+  OP_CMP_AF(>=)
+  OP_CMP_AF(<=)
+  OP_CMP_AF(==)
+  OP_CMP_AF(!=)
+// FIXME: Move compare with double out of the struct ap_fixed_base definition
+// and combine it with compare operator(double, ap_fixed_base)
+#define DOUBLE_CMP_AF(Sym) \
+  INLINE bool operator Sym(double d) const { return to_double() Sym d; }
+
+  DOUBLE_CMP_AF(>)
+  DOUBLE_CMP_AF(<)
+  DOUBLE_CMP_AF(>=)
+  DOUBLE_CMP_AF(<=)
+  DOUBLE_CMP_AF(==)
+  DOUBLE_CMP_AF(!=)
+
+  // Bit and Slice Select
+  INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[](
+      unsigned index) {
+    _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB");
+    return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[](
+      const ap_int_base<_AP_W2, _AP_S2>& index) {
+    _AP_WARNING(index < 0, "Attempting to read bit with negative index");
+    _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB");
+    return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this,
+                                                                index.to_int());
+  }
+
+  INLINE bool operator[](unsigned index) const {
+    _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB");
+    return _AP_ROOT_op_get_bit(const_cast<ap_fixed_base*>(this)->V, index);
+  }
+
+  INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit(
+      unsigned index) {
+    _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB");
+    return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index);
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit(
+      const ap_int_base<_AP_W2, _AP_S2>& index) {
+    _AP_WARNING(index < 0, "Attempting to read bit with negative index");
+    _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB");
+    return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this,
+                                                                index.to_int());
+  }
+
+  INLINE bool bit(unsigned index) const {
+    _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB");
+    return _AP_ROOT_op_get_bit(const_cast<ap_fixed_base*>(this)->V, index);
+  }
+
+  template <int _AP_W2>
+  INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit(
+      const ap_int_base<_AP_W2, true>& index) {
+    _AP_WARNING(index < _AP_I - _AP_W,
+                "Attempting to read bit with negative index");
+    _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB");
+    return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(
+        this, index.to_int() + _AP_W - _AP_I);
+  }
+
+  INLINE bool get_bit(int index) const {
+    _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB");
+    _AP_WARNING(index < _AP_I - _AP_W, "Attempting to read bit beyond MSB");
+    return _AP_ROOT_op_get_bit(const_cast<ap_fixed_base*>(this)->V,
+                               index + _AP_W - _AP_I);
+  }
+#if 0
+  INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit(
+      int index) {
+    _AP_WARNING(index < _AP_I - _AP_W,
+                "Attempting to read bit with negative index");
+    _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB");
+    return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(
+        this, index + _AP_W - _AP_I);
+  }
+#endif
+
+  template <int _AP_W2>
+ 
INLINE bool get_bit(const ap_int_base<_AP_W2, true>& index) const { + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + _AP_WARNING(index < _AP_I - _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, + index.to_int() + _AP_W - _AP_I); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range(int Hi, + int Lo) { + _AP_WARNING((Hi >= _AP_W) || (Lo >= _AP_W), "Out of bounds in range()"); + return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, Hi, Lo); + } + + // This is a must to strip constness to produce reference type. + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + int Hi, int Lo) const { + _AP_WARNING((Hi >= _AP_W) || (Lo >= _AP_W), "Out of bounds in range()"); + return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( + const_cast(this), Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() { + return this->range(_AP_W - 1, 0); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() const { + return this->range(_AP_W - 1, 0); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + int Hi, int Lo) { + return this->range(Hi, Lo); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + int Hi, int Lo) const { + return this->range(Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE bool is_zero() const { return Base::V == 0; } + + INLINE bool is_neg() const { + if (_AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)) return true; + return false; + } + + INLINE int wl() const { return _AP_W; } + + INLINE int iwl() const { return _AP_I; } + + INLINE ap_q_mode q_mode() const { return _AP_Q; } + + INLINE ap_o_mode o_mode() const { return _AP_O; } + + INLINE int n_bits() const { return _AP_N; } + + // print a string representation of this number in the given radix. + // Radix support is 2, 8, 10, or 16. + // The result will include a prefix indicating the radix, except for decimal, + // where no prefix is needed. The default is to output a signed representation + // of signed numbers, or an unsigned representation of unsigned numbers. For + // non-decimal formats, this can be changed by the 'sign' argument. 
+#ifndef __SYNTHESIS__ + std::string to_string(unsigned char radix = 2, bool sign = _AP_S) const { + // XXX in autosim/autowrap.tcl "(${name}).to_string(2).c_str()" is used to + // initialize sc_lv, which seems incapable of handling format "-0b". + if (radix == 2) sign = false; + + std::string str; + str.clear(); + char step = 0; + bool isNeg = sign && (Base::V < 0); + + // Extend to take care of the -MAX case. + ap_fixed_base<_AP_W + 1, _AP_I + 1> tmp(*this); + if (isNeg) { + tmp = -tmp; + str += '-'; + } + std::string prefix; + switch (radix) { + case 2: + prefix = "0b"; + step = 1; + break; + case 8: + prefix = "0o"; + step = 3; + break; + case 16: + prefix = "0x"; + step = 4; + break; + default: + break; + } + + if (_AP_I > 0) { + // Note we drop the quantization and rounding flags here. The + // integer part is always in range, and the fractional part we + // want to drop. Also, the number is always positive, because + // of the absolute value above. + ap_int_base int_part; + // [1] [ I ] d [ W - I ] + // | | | + // | W-I 0 + // W + int_part.V = _AP_ROOT_op_get_range( + tmp.V, _AP_W - _AP_I, _AP_W); + str += int_part.to_string(radix, false); + } else { + str += prefix; + str += '0'; + } + + ap_fixed_base frac_part = tmp; + + if (radix == 10) { + if (frac_part != 0) { + str += "."; + while (frac_part != 0) { + char digit = (frac_part * radix).to_char(); + str += static_cast(digit + '0'); + frac_part *= radix; + } + } + } else { + if (frac_part != 0) { + str += "."; + for (signed i = _AP_W - _AP_I - 1; i >= 0; i -= step) { + char digit = frac_part.range(i, AP_MAX(0, i - step + 1)).to_char(); + // If we have a partial bit pattern at the end, then we need + // to put it in the high-order bits of 'digit'. + int offset = AP_MIN(0, i - step + 1); + digit <<= -offset; + str += digit < 10 ? static_cast(digit + '0') + : static_cast(digit - 10 + 'a'); + } + if (radix == 16) + str += "p0"; // C99 Hex constants are required to have an exponent. + } + } + return str; + } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string(unsigned char radix = 2, bool sign = _AP_S) const { + return 0; + } +#endif +}; // struct ap_fixed_base. 
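+// ----------------------------------------------------------------------------
+// Editor's note: a minimal usage sketch of the quantization, overflow, and
+// conversion machinery defined above. It assumes the user-facing ap_fixed
+// wrapper from ap_fixed.h, which derives from ap_fixed_base; it is an
+// illustration only, not part of the original header, so it is guarded out.
+#if 0
+#include "ap_fixed.h"
+#include <cstdio>
+int main() {
+  ap_fixed<8, 4> a = 2.625;             // 4 integer bits, 4 fractional bits
+  ap_fixed<6, 4, AP_RND, AP_SAT> b = a; // only 2 fractional bits remain: 10.5
+                                        // LSBs round up to 11, so b == 2.75
+  ap_fixed<6, 2, AP_TRN, AP_SAT> c = a; // 2.625 does not fit in [-2, 2), so
+                                        // AP_SAT clamps it to c == 1.9375
+  // to_double() uses the round-half-to-even conversion implemented above.
+  printf("%f %f %f\n", a.to_double(), b.to_double(), c.to_double());
+  return 0;
+}
+#endif
+// ----------------------------------------------------------------------------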
+
+template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
+          int _AP_N>
+INLINE void b_not(
+    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
+    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) {
+  ret.V = ~op.V;
+}
+
+template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
+          int _AP_N>
+INLINE void b_and(
+    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
+    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1,
+    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) {
+  ret.V = op1.V & op2.V;
+}
+
+template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
+          int _AP_N>
+INLINE void b_or(
+    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
+    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1,
+    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) {
+  ret.V = op1.V | op2.V;
+}
+
+template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
+          int _AP_N>
+INLINE void b_xor(
+    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
+    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1,
+    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) {
+  ret.V = op1.V ^ op2.V;
+}
+
+template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
+          int _AP_N, int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+          ap_o_mode _AP_O2, int _AP_N2>
+INLINE void neg(
+    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
+    const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) {
+  ap_fixed_base<_AP_W2 + !_AP_S2, _AP_I2 + !_AP_S2, true, _AP_Q2, _AP_O2,
+                _AP_N2>
+      t;
+  t.V = -op.V;
+  ret = t;
+}
+
+template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
+          int _AP_N, int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+          ap_o_mode _AP_O2, int _AP_N2>
+INLINE void lshift(
+    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
+    const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op,
+    int i) {
+  enum {
+    F2 = _AP_W2 - _AP_I2,
+    _AP_I3 = AP_MAX(_AP_I, _AP_I2),
+    _AP_W3 = _AP_I3 + F2,
+  };
+  // wide buffer
+  ap_fixed_base<_AP_W3, _AP_I3, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t;
+  t.V = op.V;
+  t.V <<= i; // FIXME overflow?
+  // handle quantization and overflow
+  ret = t;
+}
+
+template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
+          int _AP_N, int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+          ap_o_mode _AP_O2, int _AP_N2>
+INLINE void rshift(
+    ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret,
+    const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op,
+    int i) {
+  enum {
+    F = _AP_W - _AP_I,
+    F2 = _AP_W2 - _AP_I2,
+    F3 = AP_MAX(F, F2),
+    _AP_W3 = _AP_I2 + F3,
+    sh = F - F2,
+  };
+  // wide buffer
+  ap_fixed_base<_AP_W3, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t;
+  t.V = op.V;
+  if (sh >= 0)
+    t.V <<= (int) sh;
+  t.V >>= i;
+  // handle quantization and overflow
+  ret = t;
+}
+
+//// FIXME
+//// These partial specialization ctors allow code like
+////   char c = 'a';
+////   ap_fixed_base<8, 8, true> x(c);
+//// but what about ap_fixed_base<9, 9, true> y(c) ?
+// + +#ifndef __SYNTHESIS__ +INLINE std::string scientificFormat(std::string& input) { + if (input.length() == 0) return input; + + size_t decPosition = input.find('.'); + if (decPosition == std::string::npos) decPosition = input.length(); + + size_t firstNonZeroPos = 0; + for (; input[firstNonZeroPos] > '9' || input[firstNonZeroPos] < '1'; + firstNonZeroPos++) + ; + + int exp; + if (firstNonZeroPos > decPosition) + exp = decPosition - firstNonZeroPos; + else + exp = decPosition - firstNonZeroPos - 1; + std::string expString = ""; + if (exp == 0) + ; + else if (exp < 0) { + expString += "e-"; + exp = -exp; + } else + expString += "e+"; + + if (exp < 10 && exp > 0) { + expString += '0'; + expString += (char)('0' + exp); + } else if (exp != 0) { + std::string tmp; + + std::ostringstream oss; + oss << exp; + + tmp = oss.str(); + expString += tmp; + } + + int lastNonZeroPos = (int)(input.length() - 1); + for (; lastNonZeroPos >= 0; --lastNonZeroPos) + if (input[lastNonZeroPos] <= '9' && input[lastNonZeroPos] > '0') break; + + std::string ans = ""; + ans += input[firstNonZeroPos]; + if (firstNonZeroPos != (size_t)lastNonZeroPos) { + ans += '.'; + for (int i = firstNonZeroPos + 1; i <= lastNonZeroPos; i++) + if (input[i] != '.') ans += input[i]; + } + + ans += expString; + return ans; +} + +INLINE std::string reduceToPrecision(std::string& input, int precision) { + bool isZero = true; + size_t inputLen = input.length(); + for (size_t i = 0; i < inputLen && isZero; i++) + if (input[i] != '.' && input[i] != '0') isZero = false; + if (isZero) return "0"; + + // Find the first valid number, skip '-' + int FirstNonZeroPos = 0; + int LastNonZeroPos = (int)inputLen - 1; + int truncBitPosition = 0; + size_t decPosition = input.find('.'); + for (; input[FirstNonZeroPos] < '1' || input[FirstNonZeroPos] > '9'; + FirstNonZeroPos++) + ; + + for (; input[LastNonZeroPos] < '1' || input[LastNonZeroPos] > '9'; + LastNonZeroPos--) + ; + + if (decPosition == std::string::npos) decPosition = inputLen; + // Count the valid number, to decide whether we need to truncate + if ((int)decPosition > LastNonZeroPos) { + if (LastNonZeroPos - FirstNonZeroPos + 1 <= precision) return input; + truncBitPosition = FirstNonZeroPos + precision; + } else if ((int)decPosition < FirstNonZeroPos) { // This is pure decimal + if (LastNonZeroPos - FirstNonZeroPos + 1 <= precision) { + if (FirstNonZeroPos - decPosition - 1 < 4) { + return input; + } else { + if (input[0] == '-') { + std::string tmp = input.substr(1, inputLen - 1); + return std::string("-") + scientificFormat(tmp); + } else + return scientificFormat(input); + } + } + truncBitPosition = FirstNonZeroPos + precision; + } else { + if (LastNonZeroPos - FirstNonZeroPos <= precision) return input; + truncBitPosition = FirstNonZeroPos + precision + 1; + } + + // duplicate the input string, we want to add "0" before the valid numbers + // This is easy for quantization, since we may change 9999 to 10000 + std::string ans = ""; + std::string dupInput = "0"; + if (input[0] == '-') { + ans += '-'; + dupInput += input.substr(1, inputLen - 1); + } else { + dupInput += input.substr(0, inputLen); + ++truncBitPosition; + } + + // Add 'carry' after truncation, if necessary + bool carry = dupInput[truncBitPosition] > '4'; + for (int i = truncBitPosition - 1; i >= 0 && carry; i--) { + if (dupInput[i] == '.') continue; + if (dupInput[i] == '9') + dupInput[i] = '0'; + else { + ++dupInput[i]; + carry = false; + } + } + + // bits outside precision range should be set to 0 + if (dupInput[0] == '1') 
+    FirstNonZeroPos = 0;
+  else {
+    FirstNonZeroPos = 0;
+    while (dupInput[FirstNonZeroPos] < '1' || dupInput[FirstNonZeroPos] > '9')
+      ++FirstNonZeroPos;
+  }
+
+  unsigned it = FirstNonZeroPos;
+  int NValidNumber = 0;
+  while (it < dupInput.length()) {
+    if (dupInput[it] == '.') {
+      ++it;
+      continue;
+    }
+    ++NValidNumber;
+    if (NValidNumber > precision) dupInput[it] = '0';
+    ++it;
+  }
+
+  // Here we want to adjust the truncate position and the value
+  decPosition = dupInput.find('.');
+  if (decPosition == std::string::npos) // When this is an integer
+    truncBitPosition = (int)dupInput.length();
+  else
+    for (truncBitPosition = (int)(dupInput.length() - 1); truncBitPosition >= 0;
+         --truncBitPosition) {
+      if (dupInput[truncBitPosition] == '.') break;
+      if (dupInput[truncBitPosition] != '0') {
+        truncBitPosition++;
+        break;
+      }
+    }
+
+  if (dupInput[0] == '1')
+    dupInput = dupInput.substr(0, truncBitPosition);
+  else
+    dupInput = dupInput.substr(1, truncBitPosition - 1);
+
+  decPosition = dupInput.find('.');
+  if (decPosition != std::string::npos) {
+    size_t it = 0;
+    for (it = decPosition + 1; dupInput[it] == '0'; it++)
+      ;
+    if (it - decPosition - 1 < 4) {
+      ans += dupInput;
+      return ans;
+    } else {
+      ans += scientificFormat(dupInput);
+      return ans;
+    }
+  } else if ((int)(dupInput.length()) <= precision) {
+    ans += dupInput;
+    return ans;
+  }
+
+  ans += scientificFormat(dupInput);
+  return ans;
+}
+
+template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
+          int _AP_N>
+INLINE void print(
+    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) {
+  if (_AP_I > 0) {
+    ap_int_base<_AP_I, _AP_S> p1;
+    p1.V = x.V >> (_AP_W - _AP_I);
+    print(p1.V); // print overload for .V should exist
+  } else {
+    printf("0");
+  }
+  printf(".");
+  if (_AP_I < _AP_W) {
+    ap_int_base<_AP_W - _AP_I, false> p2;
+    p2.V = _AP_ROOT_op_get_range(x.V, 0, _AP_W - _AP_I);
+    print(p2.V, false); // print overload for .V should exist
+  }
+}
+#endif // ifndef __SYNTHESIS__
+
+// XXX the following two functions have to exist in synthesis,
+// as some old HLS Video Library code uses the ostream overload,
+// although HLS will later delete I/O function calls.
+ +/// Output streaming +//----------------------------------------------------------------------------- +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<( + std::ostream& out, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + // TODO support std::ios_base::fmtflags + unsigned width = out.width(); + unsigned precision = out.precision(); + char fill = out.fill(); + std::string str = x.to_string(10, _AP_S); + str = reduceToPrecision(str, precision); + if (width > str.length()) { + for (unsigned i = 0; i < width - str.length(); ++i) + out << fill; + } + out << str; + return out; +} +#endif // ifndef __SYNTHESIS__ + +/// Input streaming +// ----------------------------------------------------------------------------- +#ifndef __SYNTHESIS__ +template +INLINE std::istream& operator>>( + std::istream& in, + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + double d; + in >> d; + x = ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(d); + return in; +} +#endif +#endif // ifndef AP_AUTOCC + +/// Operators mixing Integers with ap_fixed_base +// ----------------------------------------------------------------------------- +#define AF_BIN_OP_WITH_INT_SF(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator BIN_OP(ap_int_base<_AP_W2, _AP_S2>(i_op)); \ + } + +#define AF_BIN_OP_WITH_INT(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator BIN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + C_TYPE i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator BIN_OP(op); \ + } + +#define AF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator REL_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + template \ + INLINE bool operator REL_OP( \ + C_TYPE i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator REL_OP(op); \ + } + +#define AF_ASSIGN_OP_WITH_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ + operator ASSIGN_OP( \ + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator ASSIGN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } + +#define AF_ASSIGN_OP_WITH_INT_SF(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ + operator ASSIGN_OP( \ + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator ASSIGN_OP(ap_int_base<_AP_W2, _AP_S2>(i_op)); \ + } + +#define 
ALL_AF_OP_WITH_INT(C_TYPE, BITS, SIGN) \ + AF_BIN_OP_WITH_INT(+, C_TYPE, (BITS), (SIGN), plus) \ + AF_BIN_OP_WITH_INT(-, C_TYPE, (BITS), (SIGN), minus) \ + AF_BIN_OP_WITH_INT(*, C_TYPE, (BITS), (SIGN), mult) \ + AF_BIN_OP_WITH_INT(/, C_TYPE, (BITS), (SIGN), div) \ + AF_BIN_OP_WITH_INT(&, C_TYPE, (BITS), (SIGN), logic) \ + AF_BIN_OP_WITH_INT(|, C_TYPE, (BITS), (SIGN), logic) \ + AF_BIN_OP_WITH_INT(^, C_TYPE, (BITS), (SIGN), logic) \ + AF_BIN_OP_WITH_INT_SF(>>, C_TYPE, (BITS), (SIGN), lhs) \ + AF_BIN_OP_WITH_INT_SF(<<, C_TYPE, (BITS), (SIGN), lhs) \ + \ + AF_ASSIGN_OP_WITH_INT(+=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(-=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(*=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(/=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(&=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(|=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(^=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT_SF(>>=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT_SF(<<=, C_TYPE, (BITS), (SIGN)) \ + \ + AF_REL_OP_WITH_INT(>, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(<, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(>=, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(<=, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(==, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(!=, C_TYPE, (BITS), (SIGN)) + +ALL_AF_OP_WITH_INT(bool, 1, false) +ALL_AF_OP_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_AF_OP_WITH_INT(signed char, 8, true) +ALL_AF_OP_WITH_INT(unsigned char, 8, false) +ALL_AF_OP_WITH_INT(short, _AP_SIZE_short, true) +ALL_AF_OP_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_AF_OP_WITH_INT(int, _AP_SIZE_int, true) +ALL_AF_OP_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_AF_OP_WITH_INT(long, _AP_SIZE_long, true) +ALL_AF_OP_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_AF_OP_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +ALL_AF_OP_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef ALL_AF_OP_WITH_INT +#undef AF_BIN_OP_WITH_INT +#undef AF_BIN_OP_WITH_INT_SF +#undef AF_ASSIGN_OP_WITH_INT +#undef AF_ASSIGN_OP_WITH_INT_SF +#undef AF_REL_OP_WITH_INT + +/* + * ********************************************************************** + * TODO + * There is no operator defined with float/double/long double, so that + * code like + * ap_fixed<8,4> a = 1.5f; + * a += 0.5f; + * will fail in compilation. + * Operator with warning about conversion might be wanted. 
+ * ********************************************************************** + */ + +#define AF_BIN_OP_WITH_AP_INT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< \ + _AP_W, _AP_I, _AP_S>::RTYPE \ + operator BIN_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator BIN_OP(op); \ + } \ + \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ + return op.operator BIN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } + +#define AF_REL_OP_WITH_AP_INT(REL_OP) \ + template \ + INLINE bool operator REL_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ + return op.operator REL_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator REL_OP(op); \ + } + +#define AF_ASSIGN_OP_WITH_AP_INT(ASSIGN_OP) \ + template \ + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ + operator ASSIGN_OP( \ + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ + return op.operator ASSIGN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + \ + template \ + INLINE ap_int_base<_AP_W2, _AP_S2>& operator ASSIGN_OP( \ + ap_int_base<_AP_W2, _AP_S2>& i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return i_op.operator ASSIGN_OP(op.to_ap_int_base()); \ + } + +AF_BIN_OP_WITH_AP_INT(+, plus) +AF_BIN_OP_WITH_AP_INT(-, minus) +AF_BIN_OP_WITH_AP_INT(*, mult) +AF_BIN_OP_WITH_AP_INT(/, div) +AF_BIN_OP_WITH_AP_INT(&, logic) +AF_BIN_OP_WITH_AP_INT(|, logic) +AF_BIN_OP_WITH_AP_INT(^, logic) + +#undef AF_BIN_OP_WITH_AP_INT + +AF_ASSIGN_OP_WITH_AP_INT(+=) +AF_ASSIGN_OP_WITH_AP_INT(-=) +AF_ASSIGN_OP_WITH_AP_INT(*=) +AF_ASSIGN_OP_WITH_AP_INT(/=) +AF_ASSIGN_OP_WITH_AP_INT(&=) +AF_ASSIGN_OP_WITH_AP_INT(|=) +AF_ASSIGN_OP_WITH_AP_INT(^=) + +#undef AF_ASSIGN_OP_WITH_AP_INT + +AF_REL_OP_WITH_AP_INT(==) +AF_REL_OP_WITH_AP_INT(!=) +AF_REL_OP_WITH_AP_INT(>) +AF_REL_OP_WITH_AP_INT(>=) +AF_REL_OP_WITH_AP_INT(<) +AF_REL_OP_WITH_AP_INT(<=) + +#undef AF_REL_OP_WITH_AP_INT + +// Relational Operators with double +template +INLINE bool operator==( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator==(op1); +} + +template +INLINE bool operator!=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator!=(op1); +} + +template +INLINE bool operator>( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator<(op1); +} + +template +INLINE bool operator>=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator<=(op1); +} + +template +INLINE bool operator<( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator>(op1); +} + +template +INLINE bool operator<=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, 
_AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator>=(op1); +} + +#endif // ifndef __cplusplus else + +#endif // ifndef __AP_FIXED_BASE_H__ else + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_fixed_ref.h b/hls4ml/templates/vivado/ap_types/ap_fixed_ref.h index aefda0a676..a1c2816c79 100644 --- a/hls4ml/templates/vivado/ap_types/ap_fixed_ref.h +++ b/hls4ml/templates/vivado/ap_types/ap_fixed_ref.h @@ -1,718 +1,718 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_FIXED_REF_H__ -#define __AP_FIXED_REF_H__ - -#ifndef __AP_FIXED_H__ -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." -#endif - -#ifndef __cplusplus -#error "C++ is required to include this header file" - -#else -#ifndef __SYNTHESIS__ -#include -#endif -/// Proxy class, which allows bit selection to be used as both rvalue (for -/// reading) and lvalue (for writing) -template -struct af_bit_ref { -#ifdef _MSC_VER -#pragma warning(disable : 4521 4522) -#endif - typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; - ref_type& d_bv; - int d_index; - - public: - INLINE af_bit_ref( - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) - : d_bv(ref.d_bv), d_index(ref.d_index) { -#ifndef __SYNTHESIS__ - _AP_WARNING(d_index < 0, "Index of bit vector (%d) cannot be negative.", - d_index); - _AP_WARNING(d_index >= _AP_W, "Index of bit vector (%d) out of range (%d).", - d_index, _AP_W); -#endif - } - - INLINE af_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} - - INLINE af_bit_ref(const ref_type* bv, int index = 0) - : d_bv(*const_cast(bv)), d_index(index) {} - - /// convert operators. - INLINE operator bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } - - /// @name assign operators - // @{ - INLINE af_bit_ref& operator=(bool val) { - d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); - return *this; - } - - // Be explicit to prevent it from being deleted, as field d_bv - // is of reference type. 
- INLINE af_bit_ref& operator=(const af_bit_ref& val) { - return operator=(bool(val)); - } - - template - INLINE af_bit_ref& operator=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=(bool(val)); - } - - template - INLINE af_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { - return operator=(bool(val)); - } - - template - INLINE af_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { - return operator=(val != 0); - } - - template - INLINE af_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { - return operator=(ap_int_base<_AP_W2, false>(val)); - } - - template - INLINE af_bit_ref& operator=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=(ap_int_base<_AP_W2, false>(val)); - } - - template - INLINE af_bit_ref& operator=( - const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { - return operator=(ap_int_base<_AP_W2 + _AP_W3, false>(val)); - } - // @} - - /// @name concatenate operators - // @{ - template - INLINE ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(ap_int_base<_AP_W2, _AP_S2> &op) { - return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( - *this, op); - } - - template - INLINE ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( - const ap_bit_ref<_AP_W2, _AP_S2> &op) { - return ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >(*this, - op); - } - - template - INLINE ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > - operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { - return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( - *this, op); - } - - template - INLINE ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > - operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { - return ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, - op); - } - - template - INLINE ap_concat_ref< - 1, af_bit_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { - return ap_concat_ref< - 1, af_bit_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, - op); - } - - template - INLINE ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, - _AP_Q2, _AP_O2, _AP_N2> > - operator,( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { - return ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, - _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast&>( - op)); - } - // @} - - /// @name comparison - // @{ - template - INLINE bool operator==( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - return get() == op.get(); - } - - template - INLINE bool operator!=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - return get() != op.get(); - } - // @} - - INLINE bool operator~() const { - bool bit = _AP_ROOT_op_get_bit(d_bv.V, d_index); - return bit ? false : true; - } - - INLINE bool get() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } - - INLINE int length() const { return 1; } - -#ifndef __SYNTHESIS__ - std::string to_string() const { return get() ? 
"1" : "0"; } -#else - // XXX HLS will delete this in synthesis - INLINE char* to_string() const { return 0; } -#endif -}; // struct af_bit_ref - -// XXX apcc cannot handle global std::ios_base::Init() brought in by -#ifndef AP_AUTOCC -#ifndef __SYNTHESIS__ -template -INLINE std::ostream& operator<<( - std::ostream& os, - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { - os << x.to_string(); - return os; -} -#endif // ifndef __SYNTHESIS__ -#endif // ifndef AP_AUTOCC - -/// Range (slice) reference. -template -struct af_range_ref { -#ifdef _MSC_VER -#pragma warning(disable : 4521 4522) -#endif - typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; - ref_type& d_bv; - int l_index; - int h_index; - - public: - /// copy ctor - INLINE af_range_ref( - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) - : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} - - /// ctor from ap_fixed_base, higher and lower bound. - /** if h is less than l, the bits selected will be returned in reverse order. - */ - INLINE af_range_ref(ref_type* bv, int h, int l) - : d_bv(*bv), l_index(l), h_index(h) { -#ifndef __SYNTHESIS__ - _AP_WARNING(h < 0 || l < 0, - "Higher bound(%d) and lower(%d) bound cannot be negative.", h, - l); - _AP_WARNING(h >= _AP_W || l >= _AP_W, - "Higher bound(%d) or lower(%d) bound out of range.", h, l); - _AP_WARNING(h < l, "The bits selected will be returned in reverse order."); -#endif - } - - INLINE af_range_ref(const ref_type* bv, int h, int l) - : d_bv(*const_cast(bv)), l_index(l), h_index(h) { -#ifndef __SYNTHESIS__ - _AP_WARNING(h < 0 || l < 0, - "Higher bound(%d) and lower(%d) bound cannot be negative.", h, - l); - _AP_WARNING(h >= _AP_W || l >= _AP_W, - "Higher bound(%d) or lower(%d) bound out of range.", h, l); - _AP_WARNING(h < l, "The bits selected will be returned in reverse order."); -#endif - } - - /// @name assign operators - // @{ - -#define ASSIGN_CTYPE_TO_AF_RANGE(DATA_TYPE) \ - INLINE af_range_ref& operator=(const DATA_TYPE val) { \ - ap_int_base<_AP_W, false> loc(val); \ - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, loc.V); \ - return *this; \ - } - - ASSIGN_CTYPE_TO_AF_RANGE(bool) - ASSIGN_CTYPE_TO_AF_RANGE(char) - ASSIGN_CTYPE_TO_AF_RANGE(signed char) - ASSIGN_CTYPE_TO_AF_RANGE(unsigned char) - ASSIGN_CTYPE_TO_AF_RANGE(short) - ASSIGN_CTYPE_TO_AF_RANGE(unsigned short) - ASSIGN_CTYPE_TO_AF_RANGE(int) - ASSIGN_CTYPE_TO_AF_RANGE(unsigned int) - ASSIGN_CTYPE_TO_AF_RANGE(long) - ASSIGN_CTYPE_TO_AF_RANGE(unsigned long) - ASSIGN_CTYPE_TO_AF_RANGE(ap_slong) - ASSIGN_CTYPE_TO_AF_RANGE(ap_ulong) -#if _AP_ENABLE_HALF_ == 1 - ASSIGN_CTYPE_TO_AF_RANGE(half) -#endif - ASSIGN_CTYPE_TO_AF_RANGE(float) - ASSIGN_CTYPE_TO_AF_RANGE(double) -#undef ASSIGN_CTYPE_TO_AF_RANGE - - /// assgin using a string. XXX crucial for cosim. - INLINE af_range_ref& operator=(const char* val) { - const ap_int_base<_AP_W, false> tmp(val); // XXX figure out radix - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); - return *this; - } - - /// assign from ap_int_base. - // NOTE Base of other assgin operators. - template - INLINE af_range_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); - return *this; - } - - /// assign from range reference to ap_int_base. 
- template - INLINE af_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { - const ap_int_base<_AP_W2, false> tmp(val); - return operator=(tmp); - } - - /// assign from bit reference to ap_int_base.. - template - INLINE af_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { - const ap_int_base<1, false> tmp((bool)val); - return operator=(tmp); - } - - /// assgin from ap_fixed_base. - template - INLINE af_range_ref& operator=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& - val) { - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); - return *this; - } - - /// copy assgin. - // XXX This has to be explicit, otherwise it will be deleted, as d_bv is - // of reference type. - INLINE af_range_ref& operator=(const af_range_ref& val) { - ap_int_base<_AP_W, false> tmp(val); - return operator=(tmp); - } - - /// assign from range reference to ap_fixed_base. - template - INLINE af_range_ref& operator=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - ap_int_base<_AP_W2, false> tmp(val); - return operator=(tmp); - } - - /// assign from bit reference to ap_fixed_base. - template - INLINE af_range_ref& operator=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - ap_int_base<1, false> tmp((bool)val); - return operator=(tmp); - } - - /// assign from compound reference. - template - INLINE af_range_ref& operator=( - const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { - const ap_int_base<_AP_W2 + _AP_W3, false> tmp(val); - return operator=(tmp); - } - // @} - - /// @name comparison operators with ap_range_ref. - // @{ - template - INLINE bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> rop(op2); - return lop == rop; - } - - template - INLINE bool operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return !(operator==(op2)); - } - - template - INLINE bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> rop(op2); - return lop < rop; - } - - template - INLINE bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> rop(op2); - return lop > rop; - } - - template - INLINE bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return !(operator>(op2)); - } - - template - INLINE bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return !(operator<(op2)); - } - // @} - - /// @name comparison operators with af_range_ref. 
- // @{ - template - INLINE bool operator==( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> rop(op2); - return lop == rop; - } - - template - INLINE bool operator!=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { - return !(operator==(op2)); - } - - template - INLINE bool operator<( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> rop(op2); - return lop < rop; - } - - template - INLINE bool operator>( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> rop(op2); - return lop > rop; - } - - template - INLINE bool operator<=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { - return !(operator>(op2)); - } - - template - INLINE bool operator>=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { - return !(operator<(op2)); - } - // @} - - /// @name concatenate operators. - /// @{ - /// concatenate with ap_int_base. - template - INLINE - ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(ap_int_base<_AP_W2, _AP_S2> &op) { - return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >(*this, op); - } - - /// concatenate with ap_bit_ref. - template - INLINE ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > - operator,(const ap_bit_ref<_AP_W2, _AP_S2> &op) { - return ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( - *this, const_cast&>(op)); - } - - /// concatenate with ap_bit_ref. - template - INLINE ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > - operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { - return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, - ap_range_ref<_AP_W2, _AP_S2> >( - *this, const_cast&>(op)); - } - - /// concatenate with ap_concat_ref. - template - INLINE ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > - operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { - return ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( - *this, const_cast&>(op)); - } - - /// concatenate with another af_range_ref. - template - INLINE - ap_concat_ref<_AP_W, af_range_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> - &op) { - return ap_concat_ref< - _AP_W, af_range_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast&>( - op)); - } - - /// concatenate with another af_bit_ref. 
- template - INLINE - ap_concat_ref<_AP_W, af_range_ref, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { - return ap_concat_ref< - _AP_W, af_range_ref, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast&>( - op)); - } - // @} - - INLINE operator ap_ulong() const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret.to_uint64(); - } - - INLINE operator ap_int_base<_AP_W, false>() const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret; - } - - INLINE ap_int_base<_AP_W, false> to_ap_int_base() const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret; - } - - // used in ap_fixed_base::to_string() - INLINE char to_char() const { - return (char)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE int to_int() const { - return (int)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE unsigned to_uint() const { - return (unsigned)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE long to_long() const { - return (long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE unsigned long to_ulong() const { - return (unsigned long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE ap_slong to_int64() const { - return (ap_slong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE ap_ulong to_uint64() const { - return (ap_ulong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE ap_int_base<_AP_W, false> get() const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret; - } - - template - INLINE void set(const ap_int_base<_AP_W2, false>& val) { - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); - } - - INLINE int length() const { - return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1; - } - -#ifndef __SYNTHESIS__ - std::string to_string(signed char rd = 2) const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret.to_string(rd); - } -#else - // XXX HLS will delete this in synthesis - INLINE char* to_string(signed char rd = 2) const { - return 0; - } -#endif -}; // struct af_range_ref - -// XXX apcc cannot handle global std::ios_base::Init() brought in by -#ifndef AP_AUTOCC -#ifndef __SYNTHESIS__ -template -INLINE std::ostream& operator<<( - std::ostream& os, - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { - os << x.to_string(); - return os; -} -#endif -#endif // ifndef AP_AUTOCC - -#define AF_REF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE bool operator REL_OP( \ - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE op2) { \ - return ap_int_base<_AP_W, false>(op) \ - REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ - } \ - \ - template \ - INLINE bool operator REL_OP( \ - C_TYPE op2, \ - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return ap_int_base<_AP_W2, _AP_S2>(op2) \ - REL_OP ap_int_base<_AP_W, false>(op); \ - } \ - \ - template \ - INLINE bool operator REL_OP( \ - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE op2) { \ - return bool(op) REL_OP op2; \ - } \ - \ - template \ - INLINE bool operator REL_OP( \ - C_TYPE op2, \ - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return op2 REL_OP bool(op); \ - } - -#define AF_REF_REL_OPS_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ - AF_REF_REL_OP_WITH_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ - AF_REF_REL_OP_WITH_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ - AF_REF_REL_OP_WITH_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ - AF_REF_REL_OP_WITH_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ - AF_REF_REL_OP_WITH_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ - AF_REF_REL_OP_WITH_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) - -AF_REF_REL_OPS_WITH_INT(bool, 1, false) -AF_REF_REL_OPS_WITH_INT(char, 8, CHAR_IS_SIGNED) -AF_REF_REL_OPS_WITH_INT(signed char, 8, true) -AF_REF_REL_OPS_WITH_INT(unsigned char, 8, false) -AF_REF_REL_OPS_WITH_INT(short, _AP_SIZE_short, true) -AF_REF_REL_OPS_WITH_INT(unsigned short, _AP_SIZE_short, false) -AF_REF_REL_OPS_WITH_INT(int, _AP_SIZE_int, true) -AF_REF_REL_OPS_WITH_INT(unsigned int, _AP_SIZE_int, false) -AF_REF_REL_OPS_WITH_INT(long, _AP_SIZE_long, true) -AF_REF_REL_OPS_WITH_INT(unsigned long, _AP_SIZE_long, false) -AF_REF_REL_OPS_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) -AF_REF_REL_OPS_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef AF_REF_REL_OP_INT -#undef AF_REF_REL_OPS_WITH_INT - -#define AF_REF_REL_OP_WITH_AP_INT(REL_OP) \ - template \ - INLINE bool operator REL_OP( \ - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - const ap_int_base<_AP_W2, _AP_S>& op2) { \ - return ap_int_base<_AP_W, false>(op) REL_OP op2; \ - } \ - template \ - INLINE bool operator REL_OP( \ - const ap_int_base<_AP_W2, _AP_S2>& op2, \ - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return op2 REL_OP ap_int_base<_AP_W, false>(op); \ - } \ - template \ - INLINE bool operator REL_OP( \ - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - return ap_int_base<1, false>(op) REL_OP op2; \ - } \ - template \ - INLINE bool operator REL_OP( \ - const ap_int_base<_AP_W2, _AP_S2>& op2, \ - const af_bit_ref<_AP_W, _AP_I, _AP_S, 
_AP_Q, _AP_O, _AP_N>& op) { \ - return op2 REL_OP ap_int_base<1, false>(op); \ - } - -AF_REF_REL_OP_WITH_AP_INT(>) -AF_REF_REL_OP_WITH_AP_INT(<) -AF_REF_REL_OP_WITH_AP_INT(>=) -AF_REF_REL_OP_WITH_AP_INT(<=) -AF_REF_REL_OP_WITH_AP_INT(==) -AF_REF_REL_OP_WITH_AP_INT(!=) - -#endif // ifndef __cplusplus - -#endif // ifndef __AP_FIXED_REF_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_REF_H__ +#define __AP_FIXED_REF_H__ + +#ifndef __AP_FIXED_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" + +#else +#ifndef __SYNTHESIS__ +#include +#endif +/// Proxy class, which allows bit selection to be used as both rvalue (for +/// reading) and lvalue (for writing) +template +struct af_bit_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; + ref_type& d_bv; + int d_index; + + public: + INLINE af_bit_ref( + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) { +#ifndef __SYNTHESIS__ + _AP_WARNING(d_index < 0, "Index of bit vector (%d) cannot be negative.", + d_index); + _AP_WARNING(d_index >= _AP_W, "Index of bit vector (%d) out of range (%d).", + d_index, _AP_W); +#endif + } + + INLINE af_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} + + INLINE af_bit_ref(const ref_type* bv, int index = 0) + : d_bv(*const_cast(bv)), d_index(index) {} + + /// convert operators. + INLINE operator bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + /// @name assign operators + // @{ + INLINE af_bit_ref& operator=(bool val) { + d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); + return *this; + } + + // Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. 
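The comment above points at a standard C++ rule worth spelling out: a class with a reference data member (here d_bv) has its copy-assignment operator implicitly deleted, so proxy types like af_bit_ref must define one explicitly, as the operator immediately below does. A minimal standalone illustration of the rule (illustrative names, not part of this patch):

    struct Proxy {
      int& target;  // reference member: implicit operator= would be deleted
      // Explicit definition restores assignment, copying the referred-to
      // value rather than rebinding the reference.
      Proxy& operator=(const Proxy& other) {
        target = other.target;
        return *this;
      }
    };

    int main() {
      int a = 1, b = 2;
      Proxy pa{a}, pb{b};
      pa = pb;  // writes 2 into a; ill-formed without the explicit operator=
      return a == 2 ? 0 : 1;
    }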
+ INLINE af_bit_ref& operator=(const af_bit_ref& val) { + return operator=(bool(val)); + } + + template + INLINE af_bit_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(bool(val)); + } + + template + INLINE af_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=(bool(val)); + } + + template + INLINE af_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + return operator=(val != 0); + } + + template + INLINE af_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=(ap_int_base<_AP_W2, false>(val)); + } + + template + INLINE af_bit_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(ap_int_base<_AP_W2, false>(val)); + } + + template + INLINE af_bit_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=(ap_int_base<_AP_W2 + _AP_W3, false>(val)); + } + // @} + + /// @name concatenate operators + // @{ + template + INLINE ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( + const ap_bit_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >(*this, + op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( + *this, op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, + op); + } + + template + INLINE ap_concat_ref< + 1, af_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref< + 1, af_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, + op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + // @} + + /// @name comparison + // @{ + template + INLINE bool operator==( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + return get() == op.get(); + } + + template + INLINE bool operator!=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + return get() != op.get(); + } + // @} + + INLINE bool operator~() const { + bool bit = _AP_ROOT_op_get_bit(d_bv.V, d_index); + return bit ? false : true; + } + + INLINE bool get() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + INLINE int length() const { return 1; } + +#ifndef __SYNTHESIS__ + std::string to_string() const { return get() ? 
"1" : "0"; } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string() const { return 0; } +#endif +}; // struct af_bit_ref + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<( + std::ostream& os, + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + os << x.to_string(); + return os; +} +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_AUTOCC + +/// Range (slice) reference. +template +struct af_range_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; + ref_type& d_bv; + int l_index; + int h_index; + + public: + /// copy ctor + INLINE af_range_ref( + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + /// ctor from ap_fixed_base, higher and lower bound. + /** if h is less than l, the bits selected will be returned in reverse order. + */ + INLINE af_range_ref(ref_type* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) { +#ifndef __SYNTHESIS__ + _AP_WARNING(h < 0 || l < 0, + "Higher bound(%d) and lower(%d) bound cannot be negative.", h, + l); + _AP_WARNING(h >= _AP_W || l >= _AP_W, + "Higher bound(%d) or lower(%d) bound out of range.", h, l); + _AP_WARNING(h < l, "The bits selected will be returned in reverse order."); +#endif + } + + INLINE af_range_ref(const ref_type* bv, int h, int l) + : d_bv(*const_cast(bv)), l_index(l), h_index(h) { +#ifndef __SYNTHESIS__ + _AP_WARNING(h < 0 || l < 0, + "Higher bound(%d) and lower(%d) bound cannot be negative.", h, + l); + _AP_WARNING(h >= _AP_W || l >= _AP_W, + "Higher bound(%d) or lower(%d) bound out of range.", h, l); + _AP_WARNING(h < l, "The bits selected will be returned in reverse order."); +#endif + } + + /// @name assign operators + // @{ + +#define ASSIGN_CTYPE_TO_AF_RANGE(DATA_TYPE) \ + INLINE af_range_ref& operator=(const DATA_TYPE val) { \ + ap_int_base<_AP_W, false> loc(val); \ + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, loc.V); \ + return *this; \ + } + + ASSIGN_CTYPE_TO_AF_RANGE(bool) + ASSIGN_CTYPE_TO_AF_RANGE(char) + ASSIGN_CTYPE_TO_AF_RANGE(signed char) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned char) + ASSIGN_CTYPE_TO_AF_RANGE(short) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned short) + ASSIGN_CTYPE_TO_AF_RANGE(int) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned int) + ASSIGN_CTYPE_TO_AF_RANGE(long) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned long) + ASSIGN_CTYPE_TO_AF_RANGE(ap_slong) + ASSIGN_CTYPE_TO_AF_RANGE(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_CTYPE_TO_AF_RANGE(half) +#endif + ASSIGN_CTYPE_TO_AF_RANGE(float) + ASSIGN_CTYPE_TO_AF_RANGE(double) +#undef ASSIGN_CTYPE_TO_AF_RANGE + + /// assgin using a string. XXX crucial for cosim. + INLINE af_range_ref& operator=(const char* val) { + const ap_int_base<_AP_W, false> tmp(val); // XXX figure out radix + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + return *this; + } + + /// assign from ap_int_base. + // NOTE Base of other assgin operators. + template + INLINE af_range_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + return *this; + } + + /// assign from range reference to ap_int_base. 
+ template + INLINE af_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + const ap_int_base<_AP_W2, false> tmp(val); + return operator=(tmp); + } + + /// assign from bit reference to ap_int_base.. + template + INLINE af_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + const ap_int_base<1, false> tmp((bool)val); + return operator=(tmp); + } + + /// assgin from ap_fixed_base. + template + INLINE af_range_ref& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + return *this; + } + + /// copy assgin. + // XXX This has to be explicit, otherwise it will be deleted, as d_bv is + // of reference type. + INLINE af_range_ref& operator=(const af_range_ref& val) { + ap_int_base<_AP_W, false> tmp(val); + return operator=(tmp); + } + + /// assign from range reference to ap_fixed_base. + template + INLINE af_range_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + ap_int_base<_AP_W2, false> tmp(val); + return operator=(tmp); + } + + /// assign from bit reference to ap_fixed_base. + template + INLINE af_range_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + ap_int_base<1, false> tmp((bool)val); + return operator=(tmp); + } + + /// assign from compound reference. + template + INLINE af_range_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + const ap_int_base<_AP_W2 + _AP_W3, false> tmp(val); + return operator=(tmp); + } + // @} + + /// @name comparison operators with ap_range_ref. + // @{ + template + INLINE bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop == rop; + } + + template + INLINE bool operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator==(op2)); + } + + template + INLINE bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop < rop; + } + + template + INLINE bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop > rop; + } + + template + INLINE bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator>(op2)); + } + + template + INLINE bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<(op2)); + } + // @} + + /// @name comparison operators with af_range_ref. 
+ // @{ + template + INLINE bool operator==( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop == rop; + } + + template + INLINE bool operator!=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator==(op2)); + } + + template + INLINE bool operator<( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop < rop; + } + + template + INLINE bool operator>( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop > rop; + } + + template + INLINE bool operator<=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator>(op2)); + } + + template + INLINE bool operator>=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator<(op2)); + } + // @} + + /// @name concatenate operators. + /// @{ + /// concatenate with ap_int_base. + template + INLINE + ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, op); + } + + /// concatenate with ap_bit_ref. + template + INLINE ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + /// concatenate with ap_bit_ref. + template + INLINE ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + /// concatenate with ap_concat_ref. + template + INLINE ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(op)); + } + + /// concatenate with another af_range_ref. + template + INLINE + ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &op) { + return ap_concat_ref< + _AP_W, af_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + + /// concatenate with another af_bit_ref. 
+ template + INLINE + ap_concat_ref<_AP_W, af_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref< + _AP_W, af_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + // @} + + INLINE operator ap_ulong() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret.to_uint64(); + } + + INLINE operator ap_int_base<_AP_W, false>() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + INLINE ap_int_base<_AP_W, false> to_ap_int_base() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + // used in ap_fixed_base::to_string() + INLINE char to_char() const { + return (char)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE int to_int() const { + return (int)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned to_uint() const { + return (unsigned)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE long to_long() const { + return (long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned long to_ulong() const { + return (unsigned long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_slong to_int64() const { + return (ap_slong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_ulong to_uint64() const { + return (ap_ulong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_int_base<_AP_W, false> get() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + template + INLINE void set(const ap_int_base<_AP_W2, false>& val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + } + + INLINE int length() const { + return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1; + } + +#ifndef __SYNTHESIS__ + std::string to_string(signed char rd = 2) const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret.to_string(rd); + } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string(signed char rd = 2) const { + return 0; + } +#endif +}; // struct af_range_ref + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<( + std::ostream& os, + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + os << x.to_string(); + return os; +} +#endif +#endif // ifndef AP_AUTOCC + +#define AF_REF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP( \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE op2) { \ + return ap_int_base<_AP_W, false>(op) \ + REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + C_TYPE op2, \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(op2) \ + REL_OP ap_int_base<_AP_W, false>(op); \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE op2) { \ + return bool(op) REL_OP op2; \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + C_TYPE op2, \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return op2 REL_OP bool(op); \ + } + +#define AF_REF_REL_OPS_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + AF_REF_REL_OP_WITH_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) + +AF_REF_REL_OPS_WITH_INT(bool, 1, false) +AF_REF_REL_OPS_WITH_INT(char, 8, CHAR_IS_SIGNED) +AF_REF_REL_OPS_WITH_INT(signed char, 8, true) +AF_REF_REL_OPS_WITH_INT(unsigned char, 8, false) +AF_REF_REL_OPS_WITH_INT(short, _AP_SIZE_short, true) +AF_REF_REL_OPS_WITH_INT(unsigned short, _AP_SIZE_short, false) +AF_REF_REL_OPS_WITH_INT(int, _AP_SIZE_int, true) +AF_REF_REL_OPS_WITH_INT(unsigned int, _AP_SIZE_int, false) +AF_REF_REL_OPS_WITH_INT(long, _AP_SIZE_long, true) +AF_REF_REL_OPS_WITH_INT(unsigned long, _AP_SIZE_long, false) +AF_REF_REL_OPS_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +AF_REF_REL_OPS_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef AF_REF_REL_OP_INT +#undef AF_REF_REL_OPS_WITH_INT + +#define AF_REF_REL_OP_WITH_AP_INT(REL_OP) \ + template \ + INLINE bool operator REL_OP( \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S>& op2) { \ + return ap_int_base<_AP_W, false>(op) REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& op2, \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return op2 REL_OP ap_int_base<_AP_W, false>(op); \ + } \ + template \ + INLINE bool operator REL_OP( \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<1, false>(op) REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& op2, \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, 
_AP_Q, _AP_O, _AP_N>& op) { \ + return op2 REL_OP ap_int_base<1, false>(op); \ + } + +AF_REF_REL_OP_WITH_AP_INT(>) +AF_REF_REL_OP_WITH_AP_INT(<) +AF_REF_REL_OP_WITH_AP_INT(>=) +AF_REF_REL_OP_WITH_AP_INT(<=) +AF_REF_REL_OP_WITH_AP_INT(==) +AF_REF_REL_OP_WITH_AP_INT(!=) + +#endif // ifndef __cplusplus + +#endif // ifndef __AP_FIXED_REF_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_fixed_special.h b/hls4ml/templates/vivado/ap_types/ap_fixed_special.h index 0f7a9f7eb3..5c09f247bd 100644 --- a/hls4ml/templates/vivado/ap_types/ap_fixed_special.h +++ b/hls4ml/templates/vivado/ap_types/ap_fixed_special.h @@ -1,230 +1,230 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_FIXED_SPECIAL_H__ -#define __AP_FIXED_SPECIAL_H__ - -#ifndef __AP_FIXED_H__ -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." -#endif - -#ifndef __SYNTHESIS__ -#include -#include -#endif -// FIXME AP_AUTOCC cannot handle many standard headers, so declare instead of -// include. -// #include -namespace std { -template class complex; -} - -/* - TODO: Modernize the code using C++11/C++14 - 1. constexpr http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0415r0.html - 2. move constructor -*/ - -namespace std { -/* - Specialize std::complex to zero initialization ap_fixed. - - To reduce the area cost, ap_fixed is not zero initialized, just like basic - types float or double. However, libstdc++ provides specialization for float, - double and long double, initializing image part to 0 when not specified. - - This has become a difficulty in switching legacy code from these C types to - ap_fixed. To ease the tranform of legacy code, we have to implement - specialization of std::complex<> for our type. - - As ap_fixed is a template, it is impossible to specialize only the methods - that causes default initialization of value type in std::complex<>. An - explicit full specialization of the template class has to be done, covering - all the member functions and operators of std::complex<> as specified - in standard 26.2.4 and 26.2.5. -*/ -template -class complex > { - public: - typedef ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> _Tp; - typedef _Tp value_type; - - // 26.2.4/1 - // Constructor without argument - // Default initialize, so that in dataflow, the variable is only written once. - complex() : _M_real(_Tp()), _M_imag(_Tp()) {} - // Constructor with ap_fixed. 
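For context on the file being rewritten here: the specialization exists so that legacy std::complex<float> code keeps its semantics when retargeted to ap_fixed, in particular zero-initialization of the imaginary part when only a real value is supplied (see the constructor comment below). A usage sketch, assuming the Vivado ap_fixed headers are on the include path:

    #include <complex>
    #include <ap_fixed.h>

    int main() {
      typedef ap_fixed<18, 8> T;
      std::complex<T> c(T(1));           // imag defaults to 0: c == complex<T>(1, 0)
      c = T(2);                          // scalar assignment also resets imag to 0
      c *= std::complex<T>(T(0), T(1));  // (2,0) * (0,1) == (0,2)
      return (c.real() == T(0) && c.imag() == T(2)) ? 0 : 1;
    }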
- // Zero initialize image part when not specified, so that `C(1) == C(1,0)` - complex(const _Tp &__r, const _Tp &__i = _Tp(0)) - : _M_real(__r), _M_imag(__i) {} - - // Constructor with another complex number - template - complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {} - -#if __cplusplus >= 201103L - const _Tp& real() const { return _M_real; } - const _Tp& imag() const { return _M_imag; } -#else - _Tp& real() { return _M_real; } - const _Tp& real() const { return _M_real; } - _Tp& imag() { return _M_imag; } - const _Tp& imag() const { return _M_imag; } -#endif - - void real(_Tp __val) { _M_real = __val; } - - void imag(_Tp __val) { _M_imag = __val; } - - // Assign this complex number with ap_fixed. - // Zero initialize image poarrt, so that `C c; c = 1; c == C(1,0);` - complex<_Tp> &operator=(const _Tp __t) { - _M_real = __t; - _M_imag = _Tp(0); - return *this; - } - - // 26.2.5/1 - // Add ap_fixed to this complex number. - complex<_Tp> &operator+=(const _Tp &__t) { - _M_real += __t; - return *this; - } - - // 26.2.5/3 - // Subtract ap_fixed from this complex number. - complex<_Tp> &operator-=(const _Tp &__t) { - _M_real -= __t; - return *this; - } - - // 26.2.5/5 - // Multiply this complex number by ap_fixed. - complex<_Tp> &operator*=(const _Tp &__t) { - _M_real *= __t; - _M_imag *= __t; - return *this; - } - - // 26.2.5/7 - // Divide this complex number by ap_fixed. - complex<_Tp> &operator/=(const _Tp &__t) { - _M_real /= __t; - _M_imag /= __t; - return *this; - } - - // Assign complex number to this complex number. - template - complex<_Tp> &operator=(const complex<_Up> &__z) { - _M_real = __z.real(); - _M_imag = __z.imag(); - return *this; - } - - // 26.2.5/9 - // Add complex number to this. - template - complex<_Tp> &operator+=(const complex<_Up> &__z) { - _M_real += __z.real(); - _M_imag += __z.imag(); - return *this; - } - - // 26.2.5/11 - // Subtract complex number from this. - template - complex<_Tp> &operator-=(const complex<_Up> &__z) { - _M_real -= __z.real(); - _M_imag -= __z.imag(); - return *this; - } - - // 26.2.5/13 - // Multiply this by complex number. - template - complex<_Tp> &operator*=(const complex<_Up> &__z) { - const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag(); - _M_imag = _M_real * __z.imag() + _M_imag * __z.real(); - _M_real = __r; - return *this; - } - - // 26.2.5/15 - // Divide this by complex number. - template - complex<_Tp> &operator/=(const complex<_Up> &__z) { - complex<_Tp> cj (__z.real(), -__z.imag()); - complex<_Tp> a = (*this) * cj; - complex<_Tp> b = cj * __z; - _M_real = a.real() / b.real(); - _M_imag = a.imag() / b.real(); - return *this; - } - - private: - _Tp _M_real; - _Tp _M_imag; - -}; // class complex > - -/* - Non-member operations - These operations are not required by standard in 26.2.6, but libstdc++ - defines them for - float, double or long double's specialization. -*/ -// Compare complex number with ap_fixed. -template -inline bool operator==( - const complex > &__x, - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { - return __x.real() == __y && - __x.imag() == 0; -} - -// Compare ap_fixed with complex number. -template -inline bool operator==( - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, - const complex > &__y) { - return __x == __y.real() && - 0 == __y.imag(); -} - -// Compare complex number with ap_fixed. 
-template -inline bool operator!=( - const complex > &__x, - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { - return __x.real() != __y || - __x.imag() != 0; -} - -// Compare ap_fixed with complex number. -template -inline bool operator!=( - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, - const complex > &__y) { - return __x != __y.real() || - 0 != __y.imag(); -} - -} // namespace std - -#endif // ifndef __AP_FIXED_SPECIAL_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_SPECIAL_H__ +#define __AP_FIXED_SPECIAL_H__ + +#ifndef __AP_FIXED_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __SYNTHESIS__ +#include +#include +#endif +// FIXME AP_AUTOCC cannot handle many standard headers, so declare instead of +// include. +// #include +namespace std { +template class complex; +} + +/* + TODO: Modernize the code using C++11/C++14 + 1. constexpr http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0415r0.html + 2. move constructor +*/ + +namespace std { +/* + Specialize std::complex to zero initialization ap_fixed. + + To reduce the area cost, ap_fixed is not zero initialized, just like basic + types float or double. However, libstdc++ provides specialization for float, + double and long double, initializing image part to 0 when not specified. + + This has become a difficulty in switching legacy code from these C types to + ap_fixed. To ease the tranform of legacy code, we have to implement + specialization of std::complex<> for our type. + + As ap_fixed is a template, it is impossible to specialize only the methods + that causes default initialization of value type in std::complex<>. An + explicit full specialization of the template class has to be done, covering + all the member functions and operators of std::complex<> as specified + in standard 26.2.4 and 26.2.5. +*/ +template +class complex > { + public: + typedef ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> _Tp; + typedef _Tp value_type; + + // 26.2.4/1 + // Constructor without argument + // Default initialize, so that in dataflow, the variable is only written once. + complex() : _M_real(_Tp()), _M_imag(_Tp()) {} + // Constructor with ap_fixed. + // Zero initialize image part when not specified, so that `C(1) == C(1,0)` + complex(const _Tp &__r, const _Tp &__i = _Tp(0)) + : _M_real(__r), _M_imag(__i) {} + + // Constructor with another complex number + template + complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {} + +#if __cplusplus >= 201103L + const _Tp& real() const { return _M_real; } + const _Tp& imag() const { return _M_imag; } +#else + _Tp& real() { return _M_real; } + const _Tp& real() const { return _M_real; } + _Tp& imag() { return _M_imag; } + const _Tp& imag() const { return _M_imag; } +#endif + + void real(_Tp __val) { _M_real = __val; } + + void imag(_Tp __val) { _M_imag = __val; } + + // Assign this complex number with ap_fixed. 
+ // Zero initialize image poarrt, so that `C c; c = 1; c == C(1,0);` + complex<_Tp> &operator=(const _Tp __t) { + _M_real = __t; + _M_imag = _Tp(0); + return *this; + } + + // 26.2.5/1 + // Add ap_fixed to this complex number. + complex<_Tp> &operator+=(const _Tp &__t) { + _M_real += __t; + return *this; + } + + // 26.2.5/3 + // Subtract ap_fixed from this complex number. + complex<_Tp> &operator-=(const _Tp &__t) { + _M_real -= __t; + return *this; + } + + // 26.2.5/5 + // Multiply this complex number by ap_fixed. + complex<_Tp> &operator*=(const _Tp &__t) { + _M_real *= __t; + _M_imag *= __t; + return *this; + } + + // 26.2.5/7 + // Divide this complex number by ap_fixed. + complex<_Tp> &operator/=(const _Tp &__t) { + _M_real /= __t; + _M_imag /= __t; + return *this; + } + + // Assign complex number to this complex number. + template + complex<_Tp> &operator=(const complex<_Up> &__z) { + _M_real = __z.real(); + _M_imag = __z.imag(); + return *this; + } + + // 26.2.5/9 + // Add complex number to this. + template + complex<_Tp> &operator+=(const complex<_Up> &__z) { + _M_real += __z.real(); + _M_imag += __z.imag(); + return *this; + } + + // 26.2.5/11 + // Subtract complex number from this. + template + complex<_Tp> &operator-=(const complex<_Up> &__z) { + _M_real -= __z.real(); + _M_imag -= __z.imag(); + return *this; + } + + // 26.2.5/13 + // Multiply this by complex number. + template + complex<_Tp> &operator*=(const complex<_Up> &__z) { + const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag(); + _M_imag = _M_real * __z.imag() + _M_imag * __z.real(); + _M_real = __r; + return *this; + } + + // 26.2.5/15 + // Divide this by complex number. + template + complex<_Tp> &operator/=(const complex<_Up> &__z) { + complex<_Tp> cj (__z.real(), -__z.imag()); + complex<_Tp> a = (*this) * cj; + complex<_Tp> b = cj * __z; + _M_real = a.real() / b.real(); + _M_imag = a.imag() / b.real(); + return *this; + } + + private: + _Tp _M_real; + _Tp _M_imag; + +}; // class complex > + +/* + Non-member operations + These operations are not required by standard in 26.2.6, but libstdc++ + defines them for + float, double or long double's specialization. +*/ +// Compare complex number with ap_fixed. +template +inline bool operator==( + const complex > &__x, + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { + return __x.real() == __y && + __x.imag() == 0; +} + +// Compare ap_fixed with complex number. +template +inline bool operator==( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, + const complex > &__y) { + return __x == __y.real() && + 0 == __y.imag(); +} + +// Compare complex number with ap_fixed. +template +inline bool operator!=( + const complex > &__x, + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { + return __x.real() != __y || + __x.imag() != 0; +} + +// Compare ap_fixed with complex number. +template +inline bool operator!=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, + const complex > &__y) { + return __x != __y.real() || + 0 != __y.imag(); +} + +} // namespace std + +#endif // ifndef __AP_FIXED_SPECIAL_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_int.h b/hls4ml/templates/vivado/ap_types/ap_int.h index db3044d48c..d103795b46 100644 --- a/hls4ml/templates/vivado/ap_types/ap_int.h +++ b/hls4ml/templates/vivado/ap_types/ap_int.h @@ -1,330 +1,330 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_INT_H__ -#define __AP_INT_H__ - -#include -#include -#include - -//--------------------------------------------------------------- - -/// Sign Arbitrary Precision Type. -template -struct ap_int : ap_int_base<_AP_W, true> { - typedef ap_int_base<_AP_W, true> Base; - // Constructor - INLINE ap_int() : Base() {} - - // Copy ctor - INLINE ap_int(const ap_int& op) { Base::V = op.V; } - - template - INLINE ap_int(const ap_int<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_int(const volatile ap_int<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_int(const ap_uint<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_int(const volatile ap_uint<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_int(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} - - template - INLINE ap_int(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} - - template - INLINE ap_int(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) - : Base(ref) {} - - template - INLINE ap_int(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} - - template - INLINE ap_int(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { - } - - template - INLINE ap_int( - const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} - - template - INLINE ap_int( - const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { - } - - template - INLINE ap_int(const ap_int_base<_AP_W2, _AP_S2>& op) { - Base::V = op.V; - } - - template - INLINE ap_int( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - - template - INLINE ap_int( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - - template - INLINE ap_int( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - -#define CTOR(TYPE) \ - INLINE ap_int(TYPE val) { Base::V = val; } - CTOR(bool) - CTOR(char) - CTOR(signed char) - CTOR(unsigned char) - CTOR(short) - CTOR(unsigned short) - CTOR(int) - CTOR(unsigned int) - CTOR(long) - CTOR(unsigned long) - CTOR(ap_slong) - CTOR(ap_ulong) -#undef CTOR - ap_int(double val) : Base(val) {} - ap_int(float val) : Base(val) {} -#if _AP_ENABLE_HALF_ == 1 - ap_int(half val) : Base(val) {} -#endif - - // ap_int_base will guess radix if radix is not provided. - INLINE ap_int(const char* s) : Base(s) {} - - INLINE ap_int(const char* s, signed char rd) : Base(s, rd) {} - - // Assignment - /* ctor will be used when right is not of proper type. */ - - INLINE ap_int& operator=(const ap_int<_AP_W>& op2) { - Base::V = op2.V; - return *this; - } - - /* cannot bind volatile reference to non-volatile type. 
*/ - INLINE ap_int& operator=(const volatile ap_int<_AP_W>& op2) { - Base::V = op2.V; - return *this; - } - - /* cannot return volatile *this. */ - INLINE void operator=(const ap_int<_AP_W>& op2) volatile { Base::V = op2.V; } - - INLINE void operator=(const volatile ap_int<_AP_W>& op2) volatile { - Base::V = op2.V; - } - -}; // struct ap_int. - -//--------------------------------------------------------------- - -/// Unsigned Arbitrary Precision Type. -template -struct ap_uint : ap_int_base<_AP_W, false> { - typedef ap_int_base<_AP_W, false> Base; - // Constructor - INLINE ap_uint() : Base() {} - - // Copy ctor - INLINE ap_uint(const ap_uint& op) { Base::V = op.V; } - - template - INLINE ap_uint(const ap_uint<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_uint(const ap_int<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_uint(const volatile ap_uint<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_uint(const volatile ap_int<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_uint(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} - - template - INLINE ap_uint(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} - - template - INLINE ap_uint(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) - : Base(ref) {} - - template - INLINE ap_uint(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} - - template - INLINE ap_uint(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { - } - - template - INLINE ap_uint( - const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} - - template - INLINE ap_uint( - const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { - } - - template - INLINE ap_uint(const ap_int_base<_AP_W2, _AP_S2>& op) { - Base::V = op.V; - } - - template - INLINE ap_uint( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - - template - INLINE ap_uint( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - - template - INLINE ap_uint( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - -#define CTOR(TYPE) \ - INLINE ap_uint(TYPE val) { Base::V = val; } - CTOR(bool) - CTOR(char) - CTOR(signed char) - CTOR(unsigned char) - CTOR(short) - CTOR(unsigned short) - CTOR(int) - CTOR(unsigned int) - CTOR(long) - CTOR(unsigned long) - CTOR(ap_slong) - CTOR(ap_ulong) -#undef CTOR - ap_uint(double val) : Base(val) {} - ap_uint(float val) : Base(val) {} -#if _AP_ENABLE_HALF_ == 1 - ap_uint(half val) : Base(val) {} -#endif - - // ap_int_base will guess radix if radix is not provided. - INLINE ap_uint(const char* s) : Base(s) {} - - INLINE ap_uint(const char* s, signed char rd) : Base(s, rd) {} - - // Assignment - /* XXX ctor will be used when right is not of proper type. */ - - INLINE ap_uint& operator=(const ap_uint<_AP_W>& op2) { - Base::V = op2.V; - return *this; - } - - /* cannot bind volatile reference to non-volatile type. */ - INLINE ap_uint& operator=(const volatile ap_uint<_AP_W>& op2) { - Base::V = op2.V; - return *this; - } - - /* cannot return volatile *this. 
 */
-  INLINE void operator=(const ap_uint<_AP_W>& op2) volatile { Base::V = op2.V; }
-
-  INLINE void operator=(const volatile ap_uint<_AP_W>& op2) volatile {
-    Base::V = op2.V;
-  }
-
-}; // struct ap_uint.
-
-#define ap_bigint ap_int
-#define ap_biguint ap_uint
-
-#if !defined(__SYNTHESIS__) && (defined(SYSTEMC_H) || defined(SYSTEMC_INCLUDED))
-// XXX sc_trace overload for ap_fixed is already included in
-// "ap_sysc/ap_sc_extras.h", so do not define in synthesis.
-template <int _AP_W>
-INLINE void sc_trace(sc_core::sc_trace_file* tf, const ap_int<_AP_W>& op,
-                     const std::string& name) {
-  if (tf) tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name);
-}
-
-template <int _AP_W>
-INLINE void sc_trace(sc_core::sc_trace_file* tf, const ap_uint<_AP_W>& op,
-                     const std::string& name) {
-  if (tf) tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name);
-}
-#endif // System C sim
-
-#include <ap_int_special.h>
-
-#endif // ifndef __AP_INT_H__ else
-
-// FIXME user should include ap_fixed.h when using ap_fixed.
-// to avoid circular inclusion, must check whether this is required by
-// ap_fixed.h
-#ifndef __AP_FIXED_H__
-#include <ap_fixed.h>
-#endif
-
-// -*- cpp -*-
+/*
+ * Copyright 2011-2019 Xilinx, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __AP_INT_H__
+#define __AP_INT_H__
+
+#include <ap_common.h>
+#include <ap_int_base.h>
+#include <ap_int_ref.h>
+
+//---------------------------------------------------------------
+
+/// Sign Arbitrary Precision Type.
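Before the class definition resumes below, a quick usage sketch of the two wrappers this header provides; widths are compile-time template parameters, and arithmetic results widen according to the RType rules defined in ap_int_base.h:

    #include <ap_int.h>

    int main() {
      ap_int<12> a = -1000;               // 12-bit signed, range [-2048, 2047]
      ap_uint<5> b = 17;                  // 5-bit unsigned, range [0, 31]
      ap_int<17> p = a * b;               // mult width = 12 + 5 bits, signed
      ap_uint<4> nibble = p.range(3, 0);  // bit slicing works as for ap_fixed
      return (p == -17000 && nibble == 8) ? 0 : 1;
    }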
+template +struct ap_int : ap_int_base<_AP_W, true> { + typedef ap_int_base<_AP_W, true> Base; + // Constructor + INLINE ap_int() : Base() {} + + // Copy ctor + INLINE ap_int(const ap_int& op) { Base::V = op.V; } + + template + INLINE ap_int(const ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const volatile ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const volatile ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_int(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_int(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) + : Base(ref) {} + + template + INLINE ap_int(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_int(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_int( + const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_int( + const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_int(const ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_int( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_int( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +#define CTOR(TYPE) \ + INLINE ap_int(TYPE val) { Base::V = val; } + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#undef CTOR + ap_int(double val) : Base(val) {} + ap_int(float val) : Base(val) {} +#if _AP_ENABLE_HALF_ == 1 + ap_int(half val) : Base(val) {} +#endif + + // ap_int_base will guess radix if radix is not provided. + INLINE ap_int(const char* s) : Base(s) {} + + INLINE ap_int(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + /* ctor will be used when right is not of proper type. */ + + INLINE ap_int& operator=(const ap_int<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot bind volatile reference to non-volatile type. */ + INLINE ap_int& operator=(const volatile ap_int<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot return volatile *this. */ + INLINE void operator=(const ap_int<_AP_W>& op2) volatile { Base::V = op2.V; } + + INLINE void operator=(const volatile ap_int<_AP_W>& op2) volatile { + Base::V = op2.V; + } + +}; // struct ap_int. + +//--------------------------------------------------------------- + +/// Unsigned Arbitrary Precision Type. 
+template +struct ap_uint : ap_int_base<_AP_W, false> { + typedef ap_int_base<_AP_W, false> Base; + // Constructor + INLINE ap_uint() : Base() {} + + // Copy ctor + INLINE ap_uint(const ap_uint& op) { Base::V = op.V; } + + template + INLINE ap_uint(const ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const volatile ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const volatile ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_uint(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_uint(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) + : Base(ref) {} + + template + INLINE ap_uint(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_uint(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_uint( + const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_uint( + const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_uint(const ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_uint( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_uint( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +#define CTOR(TYPE) \ + INLINE ap_uint(TYPE val) { Base::V = val; } + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#undef CTOR + ap_uint(double val) : Base(val) {} + ap_uint(float val) : Base(val) {} +#if _AP_ENABLE_HALF_ == 1 + ap_uint(half val) : Base(val) {} +#endif + + // ap_int_base will guess radix if radix is not provided. + INLINE ap_uint(const char* s) : Base(s) {} + + INLINE ap_uint(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + /* XXX ctor will be used when right is not of proper type. */ + + INLINE ap_uint& operator=(const ap_uint<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot bind volatile reference to non-volatile type. */ + INLINE ap_uint& operator=(const volatile ap_uint<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot return volatile *this. */ + INLINE void operator=(const ap_uint<_AP_W>& op2) volatile { Base::V = op2.V; } + + INLINE void operator=(const volatile ap_uint<_AP_W>& op2) volatile { + Base::V = op2.V; + } + +}; // struct ap_uint. + +#define ap_bigint ap_int +#define ap_biguint ap_uint + +#if !defined(__SYNTHESIS__) && (defined(SYSTEMC_H) || defined(SYSTEMC_INCLUDED)) +// XXX sc_trace overload for ap_fixed is already included in +// "ap_sysc/ap_sc_extras.h", so do not define in synthesis. 
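The sc_trace overloads defined just below make ap_int/ap_uint values visible in SystemC waveform traces during co-simulation. A hedged sketch of typical use (requires a SystemC installation; SYSTEMC_INCLUDED must be defined before ap_int.h, which including systemc.h first takes care of):

    #include <systemc.h>   // first, so the guard above enables the overloads
    #include <ap_int.h>

    int sc_main(int, char*[]) {
      sc_core::sc_trace_file* tf = sc_core::sc_create_vcd_trace_file("waves");
      ap_uint<8> counter = 42;
      sc_trace(tf, counter, "counter");   // resolves to the ap_uint overload
      sc_core::sc_start(1, sc_core::SC_NS);
      sc_core::sc_close_vcd_trace_file(tf);
      return 0;
    }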
+template +INLINE void sc_trace(sc_core::sc_trace_file* tf, const ap_int<_AP_W>& op, + const std::string& name) { + if (tf) tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} + +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, const ap_uint<_AP_W>& op, + const std::string& name) { + if (tf) tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} +#endif // System C sim + +#include + +#endif // ifndef __AP_INT_H__ else + +// FIXME user should include ap_fixed.h when using ap_fixed. +// to avoid circular inclusion, must check whether this is required by +// ap_fixed.h +#ifndef __AP_FIXED_H__ +#include +#endif + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_int_base.h b/hls4ml/templates/vivado/ap_types/ap_int_base.h index 091552a881..bb7e286ab6 100644 --- a/hls4ml/templates/vivado/ap_types/ap_int_base.h +++ b/hls4ml/templates/vivado/ap_types/ap_int_base.h @@ -1,1885 +1,1885 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_INT_BASE_H__ -#define __AP_INT_BASE_H__ - -#ifndef __AP_INT_H__ -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." -#endif - -#ifndef __cplusplus -#error "C++ is required to include this header file" -#else - -#include -#ifndef __SYNTHESIS__ -#if _AP_ENABLE_HALF_ == 1 -#include -#endif -#include -#include -#endif - -/* ---------------------------------------------------------------- - * ap_int_base: AutoPilot integer/Arbitrary precision integer. - * ---------------------------------------------------------------- - */ - -/* helper trait. Selecting the smallest C type that can hold the value, - * return 64 bit C type if not possible. - */ -template -struct retval; - -// at least 64 bit -template -struct retval<_AP_N, true> { - typedef ap_slong Type; -}; - -template -struct retval<_AP_N, false> { - typedef ap_ulong Type; -}; - -// at least 8 bit -template <> -struct retval<1, true> { - typedef signed char Type; -}; - -template <> -struct retval<1, false> { - typedef unsigned char Type; -}; - -// at least 16 bit -template <> -struct retval<2, true> { - typedef short Type; -}; - -template <> -struct retval<2, false> { - typedef unsigned short Type; -}; - -// at least 32 bit -template <> -struct retval<3, true> { - typedef long Type; -}; - -template <> -struct retval<3, false> { - typedef unsigned long Type; -}; - -template <> -struct retval<4, true> { - typedef long Type; -}; - -template <> -struct retval<4, false> { - typedef unsigned long Type; -}; - -// trait for letting base class to return derived class. -// Notice that derived class template is incomplete, and we cannot use -// the member of the derived class. 
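The _ap_int_factory trait whose deleted copy follows is a common idiom: a base class wants its operators to return the derived wrapper types (ap_int or ap_uint, chosen by signedness), but cannot name them directly while they are still incomplete, so the result type is computed through a trait specialized against the forward declarations. The idiom in miniature (illustrative names only):

    template <int W, bool S> struct number;   // forward declaration only

    template <int W, bool S> struct factory;  // primary template left undefined
    template <int W> struct factory<W, true>  { typedef number<W, true>  type; };
    template <int W> struct factory<W, false> { typedef number<W, false> type; };

    template <int W, bool S>
    struct number {
      long long v;
      // The result type is named via the trait; number<2*W, S> need not be
      // complete until this operator is actually instantiated.
      typename factory<2 * W, S>::type operator*(const number& o) const {
        typename factory<2 * W, S>::type r;
        r.v = v * o.v;
        return r;
      }
    };

    int main() {
      number<8, true> a{6}, b{7};
      number<16, true> p = a * b;   // product width doubles, like RType::mult
      return p.v == 42 ? 0 : 1;
    }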
-template <int _AP_W2, bool _AP_S2>
-struct _ap_int_factory;
-template <int _AP_W2>
-struct _ap_int_factory<_AP_W2, true> { typedef ap_int<_AP_W2> type; };
-template <int _AP_W2>
-struct _ap_int_factory<_AP_W2, false> { typedef ap_uint<_AP_W2> type; };
-
-template <int _AP_W, bool _AP_S>
-struct ap_int_base : public _AP_ROOT_TYPE<_AP_W, _AP_S> {
- public:
-  typedef _AP_ROOT_TYPE<_AP_W, _AP_S> Base;
-
-  /* ap_int_base<_AP_W, _AP_S, true>
-   * typedef typename retval<(_AP_W + 7) / 8, _AP_S>::Type RetType;
-   *
-   * ap_int_base<_AP_W, _AP_S, false>
-   * typedef typename retval<8, _AP_S>::Type RetType;
-   */
-  typedef typename retval<(_AP_W + 7) / 8, _AP_S>::Type RetType;
-
-  static const int width = _AP_W;
-
-  template <int _AP_W2, bool _AP_S2>
-  struct RType {
-    enum {
-      mult_w = _AP_W + _AP_W2,
-      mult_s = _AP_S || _AP_S2,
-      plus_w =
-          AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1,
-      plus_s = _AP_S || _AP_S2,
-      minus_w =
-          AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1,
-      minus_s = true,
-      div_w = _AP_W + _AP_S2,
-      div_s = _AP_S || _AP_S2,
-      mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)),
-      mod_s = _AP_S,
-      logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)),
-      logic_s = _AP_S || _AP_S2
-    };
-
-
-    typedef ap_int_base<mult_w, mult_s> mult_base;
-    typedef ap_int_base<plus_w, plus_s> plus_base;
-    typedef ap_int_base<minus_w, minus_s> minus_base;
-    typedef ap_int_base<logic_w, logic_s> logic_base;
-    typedef ap_int_base<div_w, div_s> div_base;
-    typedef ap_int_base<mod_w, mod_s> mod_base;
-    typedef ap_int_base<_AP_W, _AP_S> arg1_base;
-
-    typedef typename _ap_int_factory<mult_w, mult_s>::type mult;
-    typedef typename _ap_int_factory<plus_w, plus_s>::type plus;
-    typedef typename _ap_int_factory<minus_w, minus_s>::type minus;
-    typedef typename _ap_int_factory<logic_w, logic_s>::type logic;
-    typedef typename _ap_int_factory<div_w, div_s>::type div;
-    typedef typename _ap_int_factory<mod_w, mod_s>::type mod;
-    typedef typename _ap_int_factory<_AP_W, _AP_S>::type arg1;
-    typedef bool reduce;
-  };
-
-  /* Constructors.
-   * ----------------------------------------------------------------
-   */
-  /// default ctor
-  INLINE ap_int_base() {
-    /*
-    #ifdef __SC_COMPATIBLE__
-       Base::V = 0;
-    #endif
-    */
-  }
-
-  /// copy ctor
-  template <int _AP_W2, bool _AP_S2>
-  INLINE ap_int_base(const ap_int_base<_AP_W2, _AP_S2>& op) {
-    Base::V = op.V;
-  }
-
-  /// volatile copy ctor
-  template <int _AP_W2, bool _AP_S2>
-  INLINE ap_int_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) {
-    Base::V = op.V;
-  }
-
-// XXX C++11 feature.
-// The explicit specifier specifies that a constructor or conversion function
-// (since C++11) doesn't allow implicit conversions or copy-initialization.
-// ap_int_base x = 1;
-// ap_int_base foo() { return 1; }
-// but allows
-// ap_int_base x(1);
-// ap_int_base y {1};
-
-/// from all c types.
-#define CTOR_FROM_INT(Type, Size, Signed) \
-  INLINE ap_int_base(const Type op) { Base::V = op; }
-
-  CTOR_FROM_INT(bool, 1, false)
-  CTOR_FROM_INT(char, 8, CHAR_IS_SIGNED)
-  CTOR_FROM_INT(signed char, 8, true)
-  CTOR_FROM_INT(unsigned char, 8, false)
-  CTOR_FROM_INT(short, _AP_SIZE_short, true)
-  CTOR_FROM_INT(unsigned short, _AP_SIZE_short, false)
-  CTOR_FROM_INT(int, _AP_SIZE_int, true)
-  CTOR_FROM_INT(unsigned int, _AP_SIZE_int, false)
-  CTOR_FROM_INT(long, _AP_SIZE_long, true)
-  CTOR_FROM_INT(unsigned long, _AP_SIZE_long, false)
-  CTOR_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true)
-  CTOR_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false)
-#undef CTOR_FROM_INT
-
-#if _AP_ENABLE_HALF_ == 1
-  /// ctor from half.
-  // TODO optimize
-  INLINE ap_int_base(half op) {
-    ap_int_base<_AP_W, _AP_S> t((float)op);
-    Base::V = t.V;
-  }
-#endif
-
-  /// ctor from float.
- INLINE ap_int_base(float op) { - const int BITS = FLOAT_MAN + FLOAT_EXP + 1; - ap_int_base reg; - reg.V = floatToRawBits(op); - bool is_neg = _AP_ROOT_op_get_bit(reg.V, BITS - 1); - - ap_int_base exp = 0; - exp.V = _AP_ROOT_op_get_range(reg.V, FLOAT_MAN, BITS - 2); - exp = exp - FLOAT_BIAS; - - ap_int_base man; - man.V = _AP_ROOT_op_get_range(reg.V, 0, FLOAT_MAN - 1); - // check for NaN - _AP_WARNING(exp == ((unsigned char)(FLOAT_BIAS + 1)) && man.V != 0, - "assign NaN to ap integer value"); - // set leading 1. - man.V = _AP_ROOT_op_set_bit(man.V, FLOAT_MAN, 1); - //if (is_neg) man = -man; - - if ((reg.V & 0x7ffffffful) == 0) { - Base::V = 0; - } else { - int sh_amt = FLOAT_MAN - exp.V; - if (sh_amt == 0) { - Base::V = man.V; - } else if (sh_amt > 0) { - if (sh_amt < FLOAT_MAN + 2) { - Base::V = man.V >> sh_amt; - } else { - if (is_neg) - Base::V = -1; - else - Base::V = 0; - } - } else { - sh_amt = -sh_amt; - if (sh_amt < _AP_W) { - Base::V = man.V; - Base::V <<= sh_amt; - } else { - Base::V = 0; - } - } - } - if (is_neg) *this = -(*this); - } - - /// ctor from double. - INLINE ap_int_base(double op) { - const int BITS = DOUBLE_MAN + DOUBLE_EXP + 1; - ap_int_base reg; - reg.V = doubleToRawBits(op); - bool is_neg = _AP_ROOT_op_get_bit(reg.V, BITS - 1); - - ap_int_base exp = 0; - exp.V = _AP_ROOT_op_get_range(reg.V, DOUBLE_MAN, BITS - 2); - exp = exp - DOUBLE_BIAS; - - ap_int_base man; - man.V = _AP_ROOT_op_get_range(reg.V, 0, DOUBLE_MAN - 1); - // check for NaN - _AP_WARNING(exp == ((unsigned char)(DOUBLE_BIAS + 1)) && man.V != 0, - "assign NaN to ap integer value"); - // set leading 1. - man.V = _AP_ROOT_op_set_bit(man.V, DOUBLE_MAN, 1); - //if (is_neg) man = -man; - - if ((reg.V & 0x7fffffffffffffffull) == 0) { - Base::V = 0; - } else { - int sh_amt = DOUBLE_MAN - exp.V; - if (sh_amt == 0) { - Base::V = man.V; - } else if (sh_amt > 0) { - if (sh_amt < DOUBLE_MAN + 2) { - Base::V = man.V >> sh_amt; - } else { - if (is_neg) - Base::V = -1; - else - Base::V = 0; - } - } else { - sh_amt = -sh_amt; - if (sh_amt < _AP_W) { - Base::V = man.V; - Base::V <<= sh_amt; - } else { - Base::V = 0; - } - } - } - if (is_neg) *this = -(*this); - } - - /// from higer rank type. - template - INLINE ap_int_base( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - Base::V = op.to_ap_int_base().V; - } - - template - INLINE ap_int_base(const ap_range_ref<_AP_W2, _AP_S2>& ref) { - Base::V = (ref.get()).V; - } - - template - INLINE ap_int_base(const ap_bit_ref<_AP_W2, _AP_S2>& ref) { - Base::V = ref.operator bool(); - } - - template - INLINE ap_int_base(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { - const ap_int_base::_AP_WR, - false> - tmp = ref.get(); - Base::V = tmp.V; - } - - /* radix has default value in set */ - -#ifndef __SYNTHESIS__ - INLINE ap_int_base(const char* s, signed char rd = 0) { - if (rd == 0) - rd = guess_radix(s); - unsigned int length = strlen(s); - Base::V.fromString(s, length, rd); - } -#else - // XXX __builtin_bit_from_string(...) requires const C string and radix. 
- INLINE ap_int_base(const char* s) { - typeof(Base::V) t; - _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_W, _AP_S, - AP_TRN, AP_WRAP, 0, _AP_C99); - Base::V = t; - } - INLINE ap_int_base(const char* s, signed char rd) { - typeof(Base::V) t; - _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_W, _AP_S, - AP_TRN, AP_WRAP, 0, _AP_C99); - Base::V = t; - } -#endif - - template - INLINE ap_int_base( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - Base::V = (val.get()).V; - } - - template - INLINE ap_int_base( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - Base::V = val.operator bool(); - } - - INLINE ap_int_base read() volatile { - /*AP_DEBUG(printf("call read %d\n", Base::V););*/ - ap_int_base ret; - ret.V = Base::V; - return ret; - } - - INLINE void write(const ap_int_base<_AP_W, _AP_S>& op2) volatile { - /*AP_DEBUG(printf("call write %d\n", op2.V););*/ - Base::V = op2.V; - } - - /* Another form of "write".*/ - template - INLINE void operator=( - const volatile ap_int_base<_AP_W2, _AP_S2>& op2) volatile { - Base::V = op2.V; - } - - INLINE void operator=( - const volatile ap_int_base<_AP_W, _AP_S>& op2) volatile { - Base::V = op2.V; - } - - template - INLINE void operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) volatile { - Base::V = op2.V; - } - - INLINE void operator=(const ap_int_base<_AP_W, _AP_S>& op2) volatile { - Base::V = op2.V; - } - - template - INLINE ap_int_base& operator=( - const volatile ap_int_base<_AP_W2, _AP_S2>& op2) { - Base::V = op2.V; - return *this; - } - - template - INLINE ap_int_base& operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) { - Base::V = op2.V; - return *this; - } - - INLINE ap_int_base& operator=(const volatile ap_int_base<_AP_W, _AP_S>& op2) { - Base::V = op2.V; - return *this; - } - - INLINE ap_int_base& operator=(const ap_int_base<_AP_W, _AP_S>& op2) { - Base::V = op2.V; - return *this; - } - - -#define ASSIGN_OP_FROM_INT(Type, Size, Signed) \ - INLINE ap_int_base& operator=(Type op) { \ - Base::V = op; \ - return *this; \ - } - - ASSIGN_OP_FROM_INT(bool, 1, false) - ASSIGN_OP_FROM_INT(char, 8, CHAR_IS_SIGNED) - ASSIGN_OP_FROM_INT(signed char, 8, true) - ASSIGN_OP_FROM_INT(unsigned char, 8, false) - ASSIGN_OP_FROM_INT(short, _AP_SIZE_short, true) - ASSIGN_OP_FROM_INT(unsigned short, _AP_SIZE_short, false) - ASSIGN_OP_FROM_INT(int, _AP_SIZE_int, true) - ASSIGN_OP_FROM_INT(unsigned int, _AP_SIZE_int, false) - ASSIGN_OP_FROM_INT(long, _AP_SIZE_long, true) - ASSIGN_OP_FROM_INT(unsigned long, _AP_SIZE_long, false) - ASSIGN_OP_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) - ASSIGN_OP_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef ASSIGN_OP_FROM_INT - - template - INLINE ap_int_base& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& op2) { - Base::V = (bool)op2; - return *this; - } - - template - INLINE ap_int_base& operator=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - Base::V = (ap_int_base<_AP_W2, false>(op2)).V; - return *this; - } - - template - INLINE ap_int_base& operator=( - const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op2) { - Base::V = op2.get().V; - return *this; - } - - template - INLINE ap_int_base& operator=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - Base::V = op.to_ap_int_base().V; - return *this; - } - - template - INLINE ap_int_base& operator=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - Base::V = (bool)op; - return *this; - } - - template - INLINE ap_int_base& operator=( - const 
af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - Base::V = ((const ap_int_base<_AP_W2, false>)(op)).V; - return *this; - } - - // FIXME: UG902 has clearly required user to use to_int() to convert to built-in - // types, but this implicit conversion is relied on in hls_cordic.h and hls_rsr.h. - // For example: - // int d_exp = fps_x.exp - fps_y.exp; - INLINE operator RetType() const { return (RetType)(Base::V); } - - /* Explicit conversions to C types. - * ---------------------------------------------------------------- - */ - INLINE bool to_bool() const { return (bool)(Base::V); } - INLINE char to_char() const { return (char)(Base::V); } - INLINE signed char to_schar() const { return (signed char)(Base::V); } - INLINE unsigned char to_uchar() const { return (unsigned char)(Base::V); } - INLINE short to_short() const { return (short)(Base::V); } - INLINE unsigned short to_ushort() const { return (unsigned short)(Base::V); } - INLINE int to_int() const { return (int)(Base::V); } - INLINE unsigned to_uint() const { return (unsigned)(Base::V); } - INLINE long to_long() const { return (long)(Base::V); } - INLINE unsigned long to_ulong() const { return (unsigned long)(Base::V); } - INLINE ap_slong to_int64() const { return (ap_slong)(Base::V); } - INLINE ap_ulong to_uint64() const { return (ap_ulong)(Base::V); } - INLINE float to_float() const { return (float)(Base::V); } - INLINE double to_double() const { return (double)(Base::V); } - - // TODO decide if user-defined conversion should be provided. -#if 0 - INLINE operator char() const { return (char)(Base::V); } - INLINE operator signed char() const { return (signed char)(Base::V); } - INLINE operator unsigned char() const { return (unsigned char)(Base::V); } - INLINE operator short() const { return (short)(Base::V); } - INLINE operator unsigned short() const { return (unsigned short)(Base::V); } - INLINE operator int() const { return (int)(Base::V); } - INLINE operator unsigned int () const { return (unsigned)(Base::V); } - INLINE operator long () const { return (long)(Base::V); } - INLINE operator unsigned long () const { return (unsigned long)(Base::V); } - INLINE operator ap_slong () { return (ap_slong)(Base::V); } - INLINE operator ap_ulong () { return (ap_ulong)(Base::V); } -#endif - - /* Helper methods. - ---------------------------------------------------------------- - */ - /* we cannot call a non-volatile function on a volatile instance. - * but calling a volatile function is ok. - * XXX deleted non-volatile version. 
- */ - INLINE int length() const volatile { return _AP_W; } - - /*Return true if the value of ap_int_base instance is zero*/ - INLINE bool iszero() const { return Base::V == 0; } - - /*Return true if the value of ap_int_base instance is zero*/ - INLINE bool is_zero() const { return Base::V == 0; } - - /* x < 0 */ - INLINE bool sign() const { - if (_AP_S && - _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)) - return true; - else - return false; - } - - /* x[i] = 0 */ - INLINE void clear(int i) { - AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); - Base::V = _AP_ROOT_op_set_bit(Base::V, i, 0); - } - - /* x[i] = !x[i]*/ - INLINE void invert(int i) { - AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); - bool val = _AP_ROOT_op_get_bit(Base::V, i); - if (val) - Base::V = _AP_ROOT_op_set_bit(Base::V, i, 0); - else - Base::V = _AP_ROOT_op_set_bit(Base::V, i, 1); - } - - INLINE bool test(int i) const { - AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); - return _AP_ROOT_op_get_bit(Base::V, i); - } - - // Get self. For ap_concat_ref expansion. - INLINE ap_int_base& get() { return *this; } - - // Set the ith bit into 1 - INLINE void set(int i) { - AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); - Base::V = _AP_ROOT_op_set_bit(Base::V, i, 1); - } - - // Set the ith bit into v - INLINE void set(int i, bool v) { - AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); - Base::V = _AP_ROOT_op_set_bit(Base::V, i, v); - } - - // This is used for sc_lv and sc_bv, which is implemented by sc_uint - // Rotate an ap_int_base object n places to the left - INLINE ap_int_base& lrotate(int n) { - AP_ASSERT(n >= 0 && n < _AP_W, "shift value out of range"); - // TODO unify this. -#ifdef __SYNTHESIS__ - typeof(Base::V) l_p = Base::V << n; - typeof(Base::V) r_p = Base::V >> (_AP_W - n); - Base::V = l_p | r_p; -#else - Base::V.lrotate(n); -#endif - return *this; - } - - // This is used for sc_lv and sc_bv, which is implemented by sc_uint - // Rotate an ap_int_base object n places to the right - INLINE ap_int_base& rrotate(int n) { - AP_ASSERT(n >= 0 && n < _AP_W, "shift value out of range"); - // TODO unify this. -#ifdef __SYNTHESIS__ - typeof(Base::V) l_p = Base::V << (_AP_W - n); - typeof(Base::V) r_p = Base::V >> n; - Base::V = l_p | r_p; -#else - Base::V.rrotate(n); -#endif - return *this; - } - - // Reverse the contents of ap_int_base instance. - // I.e. LSB becomes MSB and vise versa. - INLINE ap_int_base& reverse() { - Base::V = _AP_ROOT_op_get_range(Base::V, _AP_W - 1, 0); - return *this; - } - - // Set the ith bit into v - INLINE void set_bit(int i, bool v) { - Base::V = _AP_ROOT_op_set_bit(Base::V, i, v); - } - - // Get the value of ith bit - INLINE bool get_bit(int i) const { - return (bool)_AP_ROOT_op_get_bit(Base::V, i); - } - - // complements every bit - INLINE void b_not() { Base::V = ~Base::V; } - -#define OP_ASSIGN_AP(Sym) \ - template \ - INLINE ap_int_base& operator Sym(const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - Base::V Sym op2.V; \ - return *this; \ - } - - /* Arithmetic assign. - * ---------------------------------------------------------------- - */ - OP_ASSIGN_AP(*=) - OP_ASSIGN_AP(+=) - OP_ASSIGN_AP(-=) - OP_ASSIGN_AP(/=) - OP_ASSIGN_AP(%=) -#undef OP_ASSIGN_AP - - /* Bitwise assign: and, or, xor. 
- * ---------------------------------------------------------------- - */ -#define OP_ASSIGN_AP_CHK(Sym) \ - template \ - INLINE ap_int_base& operator Sym(const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - _AP_WARNING((_AP_W != _AP_W2), \ - "Bitsize mismatch for ap_[u]int" #Sym "ap_[u]int."); \ - Base::V Sym op2.V; \ - return *this; \ - } - OP_ASSIGN_AP_CHK(&=) - OP_ASSIGN_AP_CHK(|=) - OP_ASSIGN_AP_CHK(^=) -#undef OP_ASSIGN_AP_CHK - - /* Prefix increment, decrement. - * ---------------------------------------------------------------- - */ - INLINE ap_int_base& operator++() { - operator+=((ap_int_base<1, false>)1); - return *this; - } - INLINE ap_int_base& operator--() { - operator-=((ap_int_base<1, false>)1); - return *this; - } - - /* Postfix increment, decrement - * ---------------------------------------------------------------- - */ - INLINE const typename RType<_AP_W,_AP_S>::arg1 operator++(int) { - ap_int_base t = *this; - operator+=((ap_int_base<1, false>)1); - return t; - } - INLINE const typename RType<_AP_W,_AP_S>::arg1 operator--(int) { - ap_int_base t = *this; - operator-=((ap_int_base<1, false>)1); - return t; - } - - /* Unary arithmetic. - * ---------------------------------------------------------------- - */ - INLINE typename RType<_AP_W,_AP_S>::arg1 operator+() const { return *this; } - - // TODO used to be W>64 only... need check. - INLINE typename RType<1, false>::minus operator-() const { - return ap_int_base<1, false>(0) - *this; - } - - /* Not (!) - * ---------------------------------------------------------------- - */ - INLINE bool operator!() const { return Base::V == 0; } - - /* Bitwise (arithmetic) unary: complement - ---------------------------------------------------------------- - */ - // XXX different from Mentor's ac_int! - INLINE typename RType<_AP_W,_AP_S>::arg1 operator~() const { - ap_int_base<_AP_W, _AP_S> r; - r.V = ~Base::V; - return r; - } - - /* Shift (result constrained by left operand). - * ---------------------------------------------------------------- - */ - template - INLINE typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, true>& op2) const { - bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); - ap_int_base<_AP_W2, false> sh = op2; - if (isNeg) { - sh = -op2; - return operator>>(sh); - } else - return operator<<(sh); - } - - template - INLINE typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, false>& op2) const { - ap_int_base r; - r.V = Base::V << op2.to_uint(); - return r; - } - - template - INLINE typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, true>& op2) const { - bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); - ap_int_base<_AP_W2, false> sh = op2; - if (isNeg) { - sh = -op2; - return operator<<(sh); - } - return operator>>(sh); - } - - template - INLINE typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, false>& op2) const { - ap_int_base r; - r.V = Base::V >> op2.to_uint(); - return r; - } - - // FIXME we standalone operator>> for ap_int_base and ap_range_ref. 
-#if 0 - template - INLINE ap_int_base operator<<(const ap_range_ref<_AP_W2, _AP_S2>& op2) const { - return *this << (op2.operator ap_int_base<_AP_W2, false>()); - } - - template - INLINE ap_int_base operator>>(const ap_range_ref<_AP_W2, _AP_S2>& op2) const { - return *this >> (op2.operator ap_int_base<_AP_W2, false>()); - } -#endif - - /* Shift assign - * ---------------------------------------------------------------- - */ - template - INLINE ap_int_base& operator<<=(const ap_int_base<_AP_W2, true>& op2) { - bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); - ap_int_base<_AP_W2, false> sh = op2; - if (isNeg) { - sh = -op2; - return operator>>=(sh); - } else - return operator<<=(sh); - } - - template - INLINE ap_int_base& operator<<=(const ap_int_base<_AP_W2, false>& op2) { - Base::V <<= op2.to_uint(); - return *this; - } - - template - INLINE ap_int_base& operator>>=(const ap_int_base<_AP_W2, true>& op2) { - bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); - ap_int_base<_AP_W2, false> sh = op2; - if (isNeg) { - sh = -op2; - return operator<<=(sh); - } - return operator>>=(sh); - } - - template - INLINE ap_int_base& operator>>=(const ap_int_base<_AP_W2, false>& op2) { - Base::V >>= op2.to_uint(); - return *this; - } - - // FIXME we standalone operator>> for ap_int_base and ap_range_ref. -#if 0 - template - INLINE ap_int_base& operator<<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return *this <<= (op2.operator ap_int_base<_AP_W2, false>()); - } - template - INLINE ap_int_base& operator>>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return *this >>= (op2.operator ap_int_base<_AP_W2, false>()); - } -#endif - - /* Equality and Relational. - * ---------------------------------------------------------------- - */ - template - INLINE bool operator==(const ap_int_base<_AP_W2, _AP_S2>& op2) const { - return Base::V == op2.V; - } - template - INLINE bool operator!=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { - return !(Base::V == op2.V); - } - template - INLINE bool operator<(const ap_int_base<_AP_W2, _AP_S2>& op2) const { - return Base::V < op2.V; - } - template - INLINE bool operator>=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { - return Base::V >= op2.V; - } - template - INLINE bool operator>(const ap_int_base<_AP_W2, _AP_S2>& op2) const { - return Base::V > op2.V; - } - template - INLINE bool operator<=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { - return Base::V <= op2.V; - } - - /* Bit and Part Select - * ---------------------------------------------------------------- - */ - INLINE ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { - _AP_ERROR(Hi >= _AP_W, "Hi(%d)out of bound(%d) in range()", Hi, _AP_W); - _AP_ERROR(Lo >= _AP_W, "Lo(%d)out of bound(%d) in range()", Lo, _AP_W); - return ap_range_ref<_AP_W, _AP_S>(this, Hi, Lo); - } - - // This is a must to strip constness to produce reference type. 
- INLINE ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { - _AP_ERROR(Hi >= _AP_W, "Hi(%d)out of bound(%d) in range()", Hi, _AP_W); - _AP_ERROR(Lo >= _AP_W, "Lo(%d)out of bound(%d) in range()", Lo, _AP_W); - return ap_range_ref<_AP_W, _AP_S>(const_cast(this), Hi, Lo); - } - - template - INLINE ap_range_ref<_AP_W, _AP_S> range( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - template - INLINE ap_range_ref<_AP_W, _AP_S> range( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - INLINE ap_range_ref<_AP_W, _AP_S> range() { - return this->range(_AP_W - 1, 0); - } - - INLINE ap_range_ref<_AP_W, _AP_S> range() const { - return this->range(_AP_W - 1, 0); - } - - INLINE ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { - return this->range(Hi, Lo); - } - - INLINE ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { - return this->range(Hi, Lo); - } - - template - INLINE ap_range_ref<_AP_W, _AP_S> operator()( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - template - INLINE ap_range_ref<_AP_W, _AP_S> operator()( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - -#if 0 - template - INLINE ap_int_base slice() const { - AP_ASSERT(Hi >= Lo && Hi < _AP_W && Lo < _AP_W, "Out of bounds in slice()"); - ap_int_base tmp ; - tmp.V = _AP_ROOT_op_get_range(Base::V, Lo, Hi); - return tmp; - } - - INLINE ap_bit_ref<_AP_W,_AP_S> operator [] ( unsigned int uindex) { - AP_ASSERT(uindex < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W,_AP_S> bvh( this, uindex ); - return bvh; - } -#endif - - INLINE ap_bit_ref<_AP_W, _AP_S> operator[](int index) { - AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> bvh(this, index); - return bvh; - } - - template - INLINE ap_bit_ref<_AP_W, _AP_S> operator[]( - const ap_int_base<_AP_W2, _AP_S2>& index) { - AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); - return bvh; - } - - INLINE bool operator[](int index) const { - AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> br(this, index); - return br.to_bool(); - } - template - INLINE bool operator[](const ap_int_base<_AP_W2, _AP_S2>& index) const { - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> br(this, index.to_int()); - return br.to_bool(); - } - - INLINE ap_bit_ref<_AP_W, _AP_S> bit(int index) { - AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> bvh(this, index); - return bvh; - } - template - INLINE ap_bit_ref<_AP_W, _AP_S> bit( - const ap_int_base<_AP_W2, _AP_S2>& index) { - AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); - 
AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); - return bvh; - } - - INLINE bool bit(int index) const { - AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> br(this, index); - return br.to_bool(); - } - - template - INLINE bool bit(const ap_int_base<_AP_W2, _AP_S2>& index) const { - return bit(index.to_int()); - } - -#if 0 - template - INLINE bool operator[](_AP_T index) const { - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W,_AP_S> br = operator[](index); - return br.to_bool(); - } -#endif - - // Count the number of zeros from the most significant bit - // to the first one bit. - INLINE int countLeadingZeros() { -#ifdef __SYNTHESIS__ - if (_AP_W <= 32) { - ap_int_base<32, false> t(-1UL), x; - x.V = _AP_ROOT_op_get_range(this->V, _AP_W - 1, 0); // reverse - t.V = _AP_ROOT_op_set_range(t.V, 0, _AP_W - 1, x.V); - return __builtin_ctz(t.V); // count trailing zeros. - } else if (_AP_W <= 64) { - ap_int_base<64, false> t(-1ULL); - ap_int_base<64, false> x; - x.V = _AP_ROOT_op_get_range(this->V, _AP_W - 1, 0); // reverse - t.V = _AP_ROOT_op_set_range(t.V, 0, _AP_W - 1, x.V); - return __builtin_ctzll(t.V); // count trailing zeros. - } else { - enum { __N = (_AP_W + 63) / 64 }; - int NZeros = 0; - int i = 0; - bool hitNonZero = false; - for (i = 0; i < __N - 1; ++i) { - ap_int_base<64, false> t; - t.V = _AP_ROOT_op_get_range(this->V, _AP_W - i * 64 - 64, _AP_W - i * 64 - 1); - NZeros += hitNonZero ? 0 : __builtin_clzll(t.V); // count leading zeros. - hitNonZero |= (t.V != 0); - } - if (!hitNonZero) { - ap_int_base<64, false> t(-1ULL); - enum { REST = (_AP_W - 1) % 64 }; - ap_int_base<64, false> x; - x.V = _AP_ROOT_op_get_range(this->V, 0, REST); - t.V = _AP_ROOT_op_set_range(t.V, 63 - REST, 63, x.V); - NZeros += __builtin_clzll(t.V); - } - return NZeros; - } -#else - return (Base::V).countLeadingZeros(); -#endif - } // countLeadingZeros - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - concat(const ap_int_base<_AP_W2, _AP_S2>& a2) const { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - const_cast&>(*this), - const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - concat(ap_int_base<_AP_W2, _AP_S2>& a2) { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >(*this, a2); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > - operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) const { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_range_ref<_AP_W2, _AP_S2> >( - const_cast&>(*this), - const_cast&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > - operator,(ap_range_ref<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_range_ref<_AP_W2, _AP_S2> >(*this, a2); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(ap_int_base<_AP_W2, _AP_S2> &a2) const { - return 
ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - const_cast&>(*this), a2); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) const { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - const_cast&>(*this), - const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >(*this, a2); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > - operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) const { - return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( - const_cast&>(*this), - const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > - operator,(ap_bit_ref<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( - *this, a2); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > - operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( - const_cast&>(*this), - const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > - operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, - a2); - } - - template - INLINE ap_concat_ref< - _AP_W, ap_int_base, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> - &a2) const { - return ap_concat_ref< - _AP_W, ap_int_base, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - const_cast&>(*this), - const_cast< - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); - } - - template - INLINE ap_concat_ref< - _AP_W, ap_int_base, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { - return ap_concat_ref< - _AP_W, ap_int_base, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, - a2); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_int_base, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> - &a2) const { - return ap_concat_ref< - _AP_W, ap_int_base, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - const_cast&>(*this), - const_cast&>( - a2)); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_int_base, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,( - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { - return ap_concat_ref< - _AP_W, ap_int_base, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); - } - - template - INLINE ap_int_base operator&( - const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { - return *this & a2.get(); - } - - template - INLINE ap_int_base operator|( - const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, 
_AP_T3>& a2) { - return *this | a2.get(); - } - - template - INLINE ap_int_base operator^( - const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { - return *this ^ a2.get(); - } - - template - INLINE void set(const ap_int_base<_AP_W3, false>& val) { - Base::V = val.V; - } - - /* Reduce operations. - * ---------------------------------------------------------------- - */ - // XXX non-const version deleted. - INLINE bool and_reduce() const { return _AP_ROOT_op_reduce(and, Base::V); } - INLINE bool nand_reduce() const { return _AP_ROOT_op_reduce(nand, Base::V); } - INLINE bool or_reduce() const { return _AP_ROOT_op_reduce(or, Base::V); } - INLINE bool nor_reduce() const { return !(_AP_ROOT_op_reduce(or, Base::V)); } - INLINE bool xor_reduce() const { return _AP_ROOT_op_reduce (xor, Base::V); } - INLINE bool xnor_reduce() const { - return !(_AP_ROOT_op_reduce (xor, Base::V)); - } - - /* Output as a string. - * ---------------------------------------------------------------- - */ -#ifndef __SYNTHESIS__ - std::string to_string(signed char rd = 2, bool sign = _AP_S) const { - // XXX in autosim/autowrap.tcl "(${name}).to_string(2).c_str()" is used to - // initialize sc_lv, which seems incapable of handling format "-0b". - if (rd == 2) sign = false; - return (Base::V).to_string(rd, sign); - } -#else - INLINE char* to_string(signed char rd = 2, bool sign = _AP_S) const { - return 0; - } -#endif -}; // struct ap_int_base - -// XXX apcc cannot handle global std::ios_base::Init() brought in by -#ifndef AP_AUTOCC -#ifndef __SYNTHESIS__ -template -INLINE std::ostream& operator<<(std::ostream& os, - const ap_int_base<_AP_W, _AP_S>& x) { - std::ios_base::fmtflags ff = std::cout.flags(); - if (ff & std::cout.hex) { - os << x.to_string(16); // don't print sign - } else if (ff & std::cout.oct) { - os << x.to_string(8); // don't print sign - } else { - os << x.to_string(10); - } - return os; -} -#endif // ifndef __SYNTHESIS__ - -#ifndef __SYNTHESIS__ -template -INLINE std::istream& operator>>(std::istream& in, - ap_int_base<_AP_W, _AP_S>& op) { - std::string str; - in >> str; - const std::ios_base::fmtflags basefield = in.flags() & std::ios_base::basefield; - unsigned radix = (basefield == std::ios_base::dec) ? 0 : ( - (basefield == std::ios_base::oct) ? 8 : ( - (basefield == std::ios_base::hex) ? 16 : 0)); - op = ap_int_base<_AP_W, _AP_S>(str.c_str(), radix); - return in; -} -#endif // ifndef __SYNTHESIS__ -#endif // ifndef AP_AUTOCC - -/* Operators with another ap_int_base. 
- * ---------------------------------------------------------------- - */ -#define OP_BIN_AP(Sym, Rty) \ - template \ - INLINE \ - typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::Rty \ - operator Sym(const ap_int_base<_AP_W, _AP_S>& op, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - typename ap_int_base<_AP_W, _AP_S>::template RType< \ - _AP_W2, _AP_S2>::Rty##_base lhs(op); \ - typename ap_int_base<_AP_W, _AP_S>::template RType< \ - _AP_W2, _AP_S2>::Rty##_base rhs(op2); \ - typename ap_int_base<_AP_W, _AP_S>::template RType< \ - _AP_W2, _AP_S2>::Rty##_base ret; \ - ret.V = lhs.V Sym rhs.V; \ - return ret; \ - } - -OP_BIN_AP(*, mult) -OP_BIN_AP(+, plus) -OP_BIN_AP(-, minus) -OP_BIN_AP(&, logic) -OP_BIN_AP(|, logic) -OP_BIN_AP(^, logic) - -#define OP_BIN_AP2(Sym, Rty) \ - template \ - INLINE \ - typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::Rty \ - operator Sym(const ap_int_base<_AP_W, _AP_S>& op, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - typename ap_int_base<_AP_W, _AP_S>::template RType< \ - _AP_W2, _AP_S2>::Rty##_base ret; \ - ret.V = op.V Sym op2.V; \ - return ret; \ - } - -OP_BIN_AP2(/, div) -OP_BIN_AP2(%, mod) - -// shift operators are defined inside class. -// compound assignment operators are defined inside class. - -/* Operators with a pointer type. - * ---------------------------------------------------------------- - * char a[100]; - * char* ptr = a; - * ap_int<2> n = 3; - * char* ptr2 = ptr + n*2; - * avoid ambiguous errors. - */ -#define OP_BIN_WITH_PTR(BIN_OP) \ - template \ - INLINE PTR_TYPE* operator BIN_OP(PTR_TYPE* i_op, \ - const ap_int_base<_AP_W, _AP_S>& op) { \ - ap_slong op2 = op.to_int64(); /* Not all implementation */ \ - return i_op BIN_OP op2; \ - } \ - template \ - INLINE PTR_TYPE* operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, \ - PTR_TYPE* i_op) { \ - ap_slong op2 = op.to_int64(); /* Not all implementation */ \ - return op2 BIN_OP i_op; \ - } - -OP_BIN_WITH_PTR(+) -OP_BIN_WITH_PTR(-) - -/* Operators with a native floating point types. - * ---------------------------------------------------------------- - */ -// float OP ap_int -// when ap_int's width > 64, then trunc ap_int to ap_int<64> -#define OP_BIN_WITH_FLOAT(BIN_OP, C_TYPE) \ - template \ - INLINE C_TYPE operator BIN_OP(C_TYPE i_op, \ - const ap_int_base<_AP_W, _AP_S>& op) { \ - typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; \ - return i_op BIN_OP op2; \ - } \ - template \ - INLINE C_TYPE operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, \ - C_TYPE i_op) { \ - typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; \ - return op2 BIN_OP i_op; \ - } - -#define ALL_OP_WITH_FLOAT(C_TYPE) \ - OP_BIN_WITH_FLOAT(*, C_TYPE) \ - OP_BIN_WITH_FLOAT(/, C_TYPE) \ - OP_BIN_WITH_FLOAT(+, C_TYPE) \ - OP_BIN_WITH_FLOAT(-, C_TYPE) - -#if _AP_ENABLE_HALF_ == 1 -ALL_OP_WITH_FLOAT(half) -#endif -ALL_OP_WITH_FLOAT(float) -ALL_OP_WITH_FLOAT(double) - -// TODO no shift? - -/* Operators with a native integral types. - * ---------------------------------------------------------------- - */ -// arithmetic and bitwise operators. 
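For readers tracing these overloads, a small sketch of the mixed ap_int/native-integer arithmetic they enable (illustrative only; result widths follow the RType rules defined earlier in this header, with native ints promoted to their C sizes):

#include <ap_int.h>

int main() {
  ap_int<8> a = 100;
  ap_int<40> p = a * 3;  // int promotes to 32 bits, so mult_w = 8 + 32 = 40
  ap_uint<4> x = 15, y = 1;
  ap_uint<5> s = x + y;  // plus_w = max(4, 4) + 1 = 5, so 16 is representable
  return (p == 300 && s == 16) ? 0 : 1;
}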
-#define OP_BIN_WITH_INT(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ - template \ - INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(C_TYPE i_op, const ap_int_base<_AP_W, _AP_S>& op) { \ - return ap_int_base<_AP_W2, _AP_S2>(i_op) BIN_OP(op); \ - } \ - template \ - INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, C_TYPE i_op) { \ - return op BIN_OP ap_int_base<_AP_W2, _AP_S2>(i_op); \ - } - -#define ALL_OP_BIN_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ - OP_BIN_WITH_INT(*, C_TYPE, _AP_W2, _AP_S2, mult) \ - OP_BIN_WITH_INT(+, C_TYPE, _AP_W2, _AP_S2, plus) \ - OP_BIN_WITH_INT(-, C_TYPE, _AP_W2, _AP_S2, minus) \ - OP_BIN_WITH_INT(/, C_TYPE, _AP_W2, _AP_S2, div) \ - OP_BIN_WITH_INT(%, C_TYPE, _AP_W2, _AP_S2, mod) \ - OP_BIN_WITH_INT(&, C_TYPE, _AP_W2, _AP_S2, logic) \ - OP_BIN_WITH_INT(|, C_TYPE, _AP_W2, _AP_S2, logic) \ - OP_BIN_WITH_INT(^, C_TYPE, _AP_W2, _AP_S2, logic) - -ALL_OP_BIN_WITH_INT(bool, 1, false) -ALL_OP_BIN_WITH_INT(char, 8, CHAR_IS_SIGNED) -ALL_OP_BIN_WITH_INT(signed char, 8, true) -ALL_OP_BIN_WITH_INT(unsigned char, 8, false) -ALL_OP_BIN_WITH_INT(short, _AP_SIZE_short, true) -ALL_OP_BIN_WITH_INT(unsigned short, _AP_SIZE_short, false) -ALL_OP_BIN_WITH_INT(int, _AP_SIZE_int, true) -ALL_OP_BIN_WITH_INT(unsigned int, _AP_SIZE_int, false) -ALL_OP_BIN_WITH_INT(long, _AP_SIZE_long, true) -ALL_OP_BIN_WITH_INT(unsigned long, _AP_SIZE_long, false) -ALL_OP_BIN_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) -ALL_OP_BIN_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef OP_BIN_WITH_INT -#undef ALL_OP_BIN_WITH_INT - -// shift operators. -#define ALL_OP_SHIFT_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( \ - const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ - ap_int_base<_AP_W, _AP_S> r; \ - if (_AP_S2) \ - r.V = op2 >= 0 ? (op.V << op2) : (op.V >> (-op2)); \ - else \ - r.V = op.V << op2; \ - return r; \ - } \ - template \ - INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( \ - const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ - ap_int_base<_AP_W, _AP_S> r; \ - if (_AP_S2) \ - r.V = op2 >= 0 ? 
(op.V >> op2) : (op.V << (-op2)); \ - else \ - r.V = op.V >> op2; \ - return r; \ - } - -ALL_OP_SHIFT_WITH_INT(char, 8, CHAR_IS_SIGNED) -ALL_OP_SHIFT_WITH_INT(signed char, 8, true) -ALL_OP_SHIFT_WITH_INT(short, _AP_SIZE_short, true) -ALL_OP_SHIFT_WITH_INT(int, _AP_SIZE_int, true) -ALL_OP_SHIFT_WITH_INT(long, _AP_SIZE_long, true) -ALL_OP_SHIFT_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) - -#undef ALL_OP_SHIFT_WITH_INT - -#define ALL_OP_SHIFT_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( \ - const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ - ap_int_base<_AP_W, _AP_S> r; \ - r.V = op.V << op2; \ - return r; \ - } \ - template \ - INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( \ - const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ - ap_int_base<_AP_W, _AP_S> r; \ - r.V = op.V >> op2; \ - return r; \ - } -ALL_OP_SHIFT_WITH_INT(bool, 1, false) -ALL_OP_SHIFT_WITH_INT(unsigned char, 8, false) -ALL_OP_SHIFT_WITH_INT(unsigned short, _AP_SIZE_short, false) -ALL_OP_SHIFT_WITH_INT(unsigned int, _AP_SIZE_int, false) -ALL_OP_SHIFT_WITH_INT(unsigned long, _AP_SIZE_long, false) -ALL_OP_SHIFT_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef ALL_OP_SHIFT_WITH_INT - -// compound assign operators. -#define OP_ASSIGN_WITH_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE ap_int_base<_AP_W, _AP_S>& operator ASSIGN_OP( \ - ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ - return op ASSIGN_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ - } - -// TODO int a; ap_int<16> b; a += b; - -#define ALL_OP_ASSIGN_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(+=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(-=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(*=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(/=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(%=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(&=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(|=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(^=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(>>=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(<<=, C_TYPE, _AP_W2, _AP_S2) - -ALL_OP_ASSIGN_WITH_INT(bool, 1, false) -ALL_OP_ASSIGN_WITH_INT(char, 8, CHAR_IS_SIGNED) -ALL_OP_ASSIGN_WITH_INT(signed char, 8, true) -ALL_OP_ASSIGN_WITH_INT(unsigned char, 8, false) -ALL_OP_ASSIGN_WITH_INT(short, _AP_SIZE_short, true) -ALL_OP_ASSIGN_WITH_INT(unsigned short, _AP_SIZE_short, false) -ALL_OP_ASSIGN_WITH_INT(int, _AP_SIZE_int, true) -ALL_OP_ASSIGN_WITH_INT(unsigned int, _AP_SIZE_int, false) -ALL_OP_ASSIGN_WITH_INT(long, _AP_SIZE_long, true) -ALL_OP_ASSIGN_WITH_INT(unsigned long, _AP_SIZE_long, false) -ALL_OP_ASSIGN_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) -ALL_OP_ASSIGN_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef OP_ASSIGN_WITH_INT -#undef ALL_OP_ASSIGN_WITH_INT - -// equality and relational operators. 
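One behavior worth illustrating before the relational overloads below: comparisons against double or float route through to_double(), so operands wider than a double's 53-bit mantissa can compare equal after rounding (illustrative sketch):

#include <ap_int.h>

int main() {
  ap_uint<64> big = ~ap_uint<64>(0);          // 2^64 - 1
  // big.to_double() rounds to 2^64, so this compares equal even
  // though the exact integer values differ by 1
  bool eq = (big == 18446744073709551616.0);  // 2^64 as a double
  bool lt = (ap_int<8>(-5) < 0);              // exact: 0 promotes to ap_int_base<32, true>
  return (eq && lt) ? 0 : 1;
}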
-#define OP_REL_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE bool operator REL_OP(C_TYPE i_op, \ - const ap_int_base<_AP_W, _AP_S>& op) { \ - return ap_int_base<_AP_W2, _AP_S2>(i_op) REL_OP op; \ - } \ - template \ - INLINE bool operator REL_OP(const ap_int_base<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return op REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ - } - -#define ALL_OP_REL_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ - OP_REL_WITH_INT(>, C_TYPE, _AP_W2, _AP_S2) \ - OP_REL_WITH_INT(<, C_TYPE, _AP_W2, _AP_S2) \ - OP_REL_WITH_INT(>=, C_TYPE, _AP_W2, _AP_S2) \ - OP_REL_WITH_INT(<=, C_TYPE, _AP_W2, _AP_S2) \ - OP_REL_WITH_INT(==, C_TYPE, _AP_W2, _AP_S2) \ - OP_REL_WITH_INT(!=, C_TYPE, _AP_W2, _AP_S2) - -ALL_OP_REL_WITH_INT(bool, 1, false) -ALL_OP_REL_WITH_INT(char, 8, CHAR_IS_SIGNED) -ALL_OP_REL_WITH_INT(signed char, 8, true) -ALL_OP_REL_WITH_INT(unsigned char, 8, false) -ALL_OP_REL_WITH_INT(short, _AP_SIZE_short, true) -ALL_OP_REL_WITH_INT(unsigned short, _AP_SIZE_short, false) -ALL_OP_REL_WITH_INT(int, _AP_SIZE_int, true) -ALL_OP_REL_WITH_INT(unsigned int, _AP_SIZE_int, false) -ALL_OP_REL_WITH_INT(long, _AP_SIZE_long, true) -ALL_OP_REL_WITH_INT(unsigned long, _AP_SIZE_long, false) -ALL_OP_REL_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) -ALL_OP_REL_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef OP_REL_WITH_INT -#undef ALL_OP_BIN_WITH_INT - -#define OP_REL_WITH_DOUBLE_OR_FLOAT(Sym) \ - template \ - INLINE bool operator Sym(const ap_int_base<_AP_W, _AP_S>& op1, \ - double op2) { \ - return op1.to_double() Sym op2 ; \ - } \ - template \ - INLINE bool operator Sym(double op1, \ - const ap_int_base<_AP_W, _AP_S>& op2) { \ - return op1 Sym op2.to_double() ; \ - } \ - template \ - INLINE bool operator Sym(const ap_int_base<_AP_W, _AP_S>& op1, \ - float op2) { \ - return op1.to_double() Sym op2 ; \ - } \ - template \ - INLINE bool operator Sym(float op1, \ - const ap_int_base<_AP_W, _AP_S>& op2) { \ - return op1 Sym op2.to_double() ; \ - } - OP_REL_WITH_DOUBLE_OR_FLOAT(>) - OP_REL_WITH_DOUBLE_OR_FLOAT(<) - OP_REL_WITH_DOUBLE_OR_FLOAT(>=) - OP_REL_WITH_DOUBLE_OR_FLOAT(<=) - OP_REL_WITH_DOUBLE_OR_FLOAT(==) - OP_REL_WITH_DOUBLE_OR_FLOAT(!=) - -#undef OP_REL_WITH_DOUBLE_OR_FLOAT - - -/* Operators with ap_bit_ref. - * ------------------------------------------------------------ - */ -// arithmetic, bitwise and shift operators. -#define OP_BIN_WITH_RANGE(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(const ap_range_ref<_AP_W1, _AP_S1>& op1, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - return ap_int_base<_AP_W1, false>(op1) BIN_OP op2; \ - } \ - template \ - INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ - const ap_range_ref<_AP_W2, _AP_S2>& op2) { \ - return op1 BIN_OP ap_int_base<_AP_W2, false>(op2); \ - } - -OP_BIN_WITH_RANGE(+, plus) -OP_BIN_WITH_RANGE(-, minus) -OP_BIN_WITH_RANGE(*, mult) -OP_BIN_WITH_RANGE(/, div) -OP_BIN_WITH_RANGE(%, mod) -OP_BIN_WITH_RANGE(&, logic) -OP_BIN_WITH_RANGE(|, logic) -OP_BIN_WITH_RANGE(^, logic) -OP_BIN_WITH_RANGE(>>, arg1) -OP_BIN_WITH_RANGE(<<, arg1) - -#undef OP_BIN_WITH_RANGE - -// compound assignment operators. 
-#define OP_ASSIGN_WITH_RANGE(ASSIGN_OP) \ - template \ - INLINE ap_int_base<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - ap_int_base<_AP_W1, _AP_S1>& op1, ap_range_ref<_AP_W2, _AP_S2>& op2) { \ - return op1 ASSIGN_OP ap_int_base<_AP_W2, false>(op2); \ - } \ - template \ - INLINE ap_range_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - ap_range_ref<_AP_W1, _AP_S1>& op1, ap_int_base<_AP_W2, _AP_S2>& op2) { \ - ap_int_base<_AP_W1, false> tmp(op1); \ - tmp ASSIGN_OP op2; \ - op1 = tmp; \ - return op1; \ - } - -OP_ASSIGN_WITH_RANGE(+=) -OP_ASSIGN_WITH_RANGE(-=) -OP_ASSIGN_WITH_RANGE(*=) -OP_ASSIGN_WITH_RANGE(/=) -OP_ASSIGN_WITH_RANGE(%=) -OP_ASSIGN_WITH_RANGE(&=) -OP_ASSIGN_WITH_RANGE(|=) -OP_ASSIGN_WITH_RANGE(^=) -OP_ASSIGN_WITH_RANGE(>>=) -OP_ASSIGN_WITH_RANGE(<<=) - -#undef OP_ASSIGN_WITH_RANGE - -// equality and relational operators -#define OP_REL_WITH_RANGE(REL_OP) \ - template \ - INLINE bool operator REL_OP(const ap_range_ref<_AP_W1, _AP_S1>& op1, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - return ap_int_base<_AP_W1, false>(op1).operator REL_OP(op2); \ - } \ - template \ - INLINE bool operator REL_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ - const ap_range_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator REL_OP(op2.operator ap_int_base<_AP_W2, false>()); \ - } - -OP_REL_WITH_RANGE(==) -OP_REL_WITH_RANGE(!=) -OP_REL_WITH_RANGE(>) -OP_REL_WITH_RANGE(>=) -OP_REL_WITH_RANGE(<) -OP_REL_WITH_RANGE(<=) - -#undef OP_REL_WITH_RANGE - -/* Operators with ap_bit_ref. - * ------------------------------------------------------------ - */ -// arithmetic, bitwise and shift operators. -#define OP_BIN_WITH_BIT(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::RTYPE \ - operator BIN_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ - const ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ - return op1 BIN_OP ap_int_base<1, false>(op2); \ - } \ - template \ - INLINE typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP(const ap_bit_ref<_AP_W1, _AP_S1>& op1, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - return ap_int_base<1, false>(op1) BIN_OP op2; \ - } - -OP_BIN_WITH_BIT(+, plus) -OP_BIN_WITH_BIT(-, minus) -OP_BIN_WITH_BIT(*, mult) -OP_BIN_WITH_BIT(/, div) -OP_BIN_WITH_BIT(%, mod) -OP_BIN_WITH_BIT(&, logic) -OP_BIN_WITH_BIT(|, logic) -OP_BIN_WITH_BIT(^, logic) -OP_BIN_WITH_BIT(>>, arg1) -OP_BIN_WITH_BIT(<<, arg1) - -#undef OP_BIN_WITH_BIT - -// compound assignment operators. -#define OP_ASSIGN_WITH_BIT(ASSIGN_OP) \ - template \ - INLINE ap_int_base<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - ap_int_base<_AP_W1, _AP_S1>& op1, ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ - return op1 ASSIGN_OP ap_int_base<1, false>(op2); \ - } \ - template \ - INLINE ap_bit_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - ap_bit_ref<_AP_W1, _AP_S1>& op1, ap_int_base<_AP_W2, _AP_S2>& op2) { \ - ap_int_base<1, false> tmp(op1); \ - tmp ASSIGN_OP op2; \ - op1 = tmp; \ - return op1; \ - } - -OP_ASSIGN_WITH_BIT(+=) -OP_ASSIGN_WITH_BIT(-=) -OP_ASSIGN_WITH_BIT(*=) -OP_ASSIGN_WITH_BIT(/=) -OP_ASSIGN_WITH_BIT(%=) -OP_ASSIGN_WITH_BIT(&=) -OP_ASSIGN_WITH_BIT(|=) -OP_ASSIGN_WITH_BIT(^=) -OP_ASSIGN_WITH_BIT(>>=) -OP_ASSIGN_WITH_BIT(<<=) - -#undef OP_ASSIGN_WITH_BIT - -// equality and relational operators. 
-#define OP_REL_WITH_BIT(REL_OP) \ - template \ - INLINE bool operator REL_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ - const ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ - return op1 REL_OP ap_int_base<1, false>(op2); \ - } \ - template \ - INLINE bool operator REL_OP(const ap_bit_ref<_AP_W1, _AP_S1>& op1, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - return ap_int_base<1, false>(op1) REL_OP op2; \ - } - -OP_REL_WITH_BIT(==) -OP_REL_WITH_BIT(!=) -OP_REL_WITH_BIT(>) -OP_REL_WITH_BIT(>=) -OP_REL_WITH_BIT(<) -OP_REL_WITH_BIT(<=) - -#undef OP_REL_WITH_BIT - - -/* Operators with ap_concat_ref. - * ------------------------------------------------------------ - */ -// arithmetic, bitwise and shift operators. -// bitwise operators are defined in struct. -// TODO specify whether to define arithmetic and bitwise operators. -#if 0 -#define OP_BIN_WITH_CONCAT(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_int_base<_AP_W3, _AP_S3>::template RType<_AP_W1 + _AP_W2, \ - false>::RTYPE \ - operator BIN_OP(const ap_int_base<_AP_W3, _AP_S3>& op1, \ - const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ - /* convert ap_concat_ref to ap_int_base */ \ - return op1 BIN_OP op2.get(); \ - } \ - template \ - INLINE typename ap_int_base<_AP_W1 + _AP_W2, \ - false>::template RType<_AP_W3, _AP_S3>::RTYPE \ - operator BIN_OP(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ - const ap_int_base<_AP_W3, _AP_S3>& op2) { \ - /* convert ap_concat_ref to ap_int_base */ \ - return op1.get() BIN_OP op2; \ - } - -OP_BIN_WITH_CONCAT(+, plus) -OP_BIN_WITH_CONCAT(-, minus) -OP_BIN_WITH_CONCAT(*, mult) -OP_BIN_WITH_CONCAT(/, div) -OP_BIN_WITH_CONCAT(%, mod) -OP_BIN_WITH_CONCAT(&, logic) -OP_BIN_WITH_CONCAT(|, logic) -OP_BIN_WITH_CONCAT(^, logic) -OP_BIN_WITH_CONCAT(>>, arg1) -OP_BIN_WITH_CONCAT(<<, arg1) - -#undef OP_BIN_WITH_CONCAT - -// compound assignment operators. -#define OP_ASSIGN_WITH_CONCAT(ASSIGN_OP) \ - template \ - INLINE typename ap_int_base<_AP_W3, _AP_S3>::template RType<_AP_W1 + _AP_W2, \ - false>::RTYPE \ - operator ASSIGN_OP( \ - const ap_int_base<_AP_W3, _AP_S3>& op1, \ - const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ - /* convert ap_concat_ref to ap_int_base */ \ - return op1 ASSIGN_OP op2.get(); \ - } \ - template \ - INLINE typename ap_int_base<_AP_W1 + _AP_W2, \ - false>::template RType<_AP_W3, _AP_S3>::RTYPE \ - operator ASSIGN_OP(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ - const ap_int_base<_AP_W3, _AP_S3>& op2) { \ - /* convert ap_concat_ref to ap_int_base */ \ - ap_int_base<_AP_W1 + _AP_W2, false> tmp = op1.get(); \ - tmp ASSIGN_OP op2; \ - op1 = tmp; \ - return op1; \ - } - -OP_ASSIGN_WITH_CONCAT(+=) -OP_ASSIGN_WITH_CONCAT(-=) -OP_ASSIGN_WITH_CONCAT(*=) -OP_ASSIGN_WITH_CONCAT(/=) -OP_ASSIGN_WITH_CONCAT(%=) -OP_ASSIGN_WITH_CONCAT(&=) -OP_ASSIGN_WITH_CONCAT(|=) -OP_ASSIGN_WITH_CONCAT(^=) -OP_ASSIGN_WITH_CONCAT(>>=) -OP_ASSIGN_WITH_CONCAT(<<=) - -#undef OP_ASSIGN_WITH_CONCAT -#endif - -// equality and relational operators. 
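The concatenation comparisons below convert an ap_concat_ref to a plain value via get(); a short sketch of the pattern they support (illustrative, using the (a, b) concatenation syntax from the overloaded comma operator):

#include <ap_int.h>

int main() {
  ap_uint<4> hi = 0xA, lo = 0x5;
  ap_uint<8> v = (hi, lo);                      // concatenation: v == 0xA5
  bool match = ((hi, lo) == ap_uint<8>(0xA5));  // uses the REL overloads below
  return (v == 0xA5 && match) ? 0 : 1;
}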
-#define OP_REL_WITH_CONCAT(REL_OP) \ - template \ - INLINE bool operator REL_OP( \ - const ap_int_base<_AP_W3, _AP_S3>& op1, \ - const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ - /* convert ap_concat_ref to ap_int_base */ \ - return op1 REL_OP op2.get(); \ - } \ - template \ - INLINE bool operator REL_OP( \ - const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ - const ap_int_base<_AP_W3, _AP_S3>& op2) { \ - /* convert ap_concat_ref to ap_int_base */ \ - return op1.get() REL_OP op2; \ - } - -OP_REL_WITH_CONCAT(==) -OP_REL_WITH_CONCAT(!=) -OP_REL_WITH_CONCAT(>) -OP_REL_WITH_CONCAT(>=) -OP_REL_WITH_CONCAT(<) -OP_REL_WITH_CONCAT(<=) - -#undef OP_REL_WITH_CONCAT - -#endif // ifndef __cplusplus -#endif // ifndef __AP_INT_BASE_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_INT_BASE_H__ +#define __AP_INT_BASE_H__ + +#ifndef __AP_INT_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" +#else + +#include +#ifndef __SYNTHESIS__ +#if _AP_ENABLE_HALF_ == 1 +#include +#endif +#include +#include +#endif + +/* ---------------------------------------------------------------- + * ap_int_base: AutoPilot integer/Arbitrary precision integer. + * ---------------------------------------------------------------- + */ + +/* helper trait. Selecting the smallest C type that can hold the value, + * return 64 bit C type if not possible. + */ +template +struct retval; + +// at least 64 bit +template +struct retval<_AP_N, true> { + typedef ap_slong Type; +}; + +template +struct retval<_AP_N, false> { + typedef ap_ulong Type; +}; + +// at least 8 bit +template <> +struct retval<1, true> { + typedef signed char Type; +}; + +template <> +struct retval<1, false> { + typedef unsigned char Type; +}; + +// at least 16 bit +template <> +struct retval<2, true> { + typedef short Type; +}; + +template <> +struct retval<2, false> { + typedef unsigned short Type; +}; + +// at least 32 bit +template <> +struct retval<3, true> { + typedef long Type; +}; + +template <> +struct retval<3, false> { + typedef unsigned long Type; +}; + +template <> +struct retval<4, true> { + typedef long Type; +}; + +template <> +struct retval<4, false> { + typedef unsigned long Type; +}; + +// trait for letting base class to return derived class. +// Notice that derived class template is incomplete, and we cannot use +// the member of the derived class. 
+template +struct _ap_int_factory; +template +struct _ap_int_factory<_AP_W2,true> { typedef ap_int<_AP_W2> type; }; +template +struct _ap_int_factory<_AP_W2,false> { typedef ap_uint<_AP_W2> type; }; + +template +struct ap_int_base : public _AP_ROOT_TYPE<_AP_W, _AP_S> { + public: + typedef _AP_ROOT_TYPE<_AP_W, _AP_S> Base; + + /* ap_int_base<_AP_W, _AP_S, true> + * typedef typename retval<(_AP_W + 7) / 8, _AP_S>::Type RetType; + * + * ap_int_base<_AP_W, _AP_S, false> + * typedef typename retval<8, _AP_S>::Type RetType; + */ + typedef typename retval::Type RetType; + + static const int width = _AP_W; + + template + struct RType { + enum { + mult_w = _AP_W + _AP_W2, + mult_s = _AP_S || _AP_S2, + plus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + plus_s = _AP_S || _AP_S2, + minus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + minus_s = true, + div_w = _AP_W + _AP_S2, + div_s = _AP_S || _AP_S2, + mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)), + mod_s = _AP_S, + logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)), + logic_s = _AP_S || _AP_S2 + }; + + + typedef ap_int_base mult_base; + typedef ap_int_base plus_base; + typedef ap_int_base minus_base; + typedef ap_int_base logic_base; + typedef ap_int_base div_base; + typedef ap_int_base mod_base; + typedef ap_int_base<_AP_W, _AP_S> arg1_base; + + typedef typename _ap_int_factory::type mult; + typedef typename _ap_int_factory::type plus; + typedef typename _ap_int_factory::type minus; + typedef typename _ap_int_factory::type logic; + typedef typename _ap_int_factory::type div; + typedef typename _ap_int_factory::type mod; + typedef typename _ap_int_factory<_AP_W, _AP_S>::type arg1; + typedef bool reduce; + }; + + /* Constructors. + * ---------------------------------------------------------------- + */ + /// default ctor + INLINE ap_int_base() { + /* + #ifdef __SC_COMPATIBLE__ + Base::V = 0; + #endif + */ + } + + /// copy ctor + template + INLINE ap_int_base(const ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + + /// volatile copy ctor + template + INLINE ap_int_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + +// XXX C++11 feature. +// The explicit specifier specifies that a constructor or conversion function +// (since C++11) doesn't allow implicit conversions or copy-initialization. +// ap_int_base x = 1; +// ap_int_base foo() { return 1; } +// but allows +// ap_int_base x(1); +// ap_int_base y {1}; + +/// from all c types. +#define CTOR_FROM_INT(Type, Size, Signed) \ + INLINE ap_int_base(const Type op) { Base::V = op; } + + CTOR_FROM_INT(bool, 1, false) + CTOR_FROM_INT(char, 8, CHAR_IS_SIGNED) + CTOR_FROM_INT(signed char, 8, true) + CTOR_FROM_INT(unsigned char, 8, false) + CTOR_FROM_INT(short, _AP_SIZE_short, true) + CTOR_FROM_INT(unsigned short, _AP_SIZE_short, false) + CTOR_FROM_INT(int, _AP_SIZE_int, true) + CTOR_FROM_INT(unsigned int, _AP_SIZE_int, false) + CTOR_FROM_INT(long, _AP_SIZE_long, true) + CTOR_FROM_INT(unsigned long, _AP_SIZE_long, false) + CTOR_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) + CTOR_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) +#undef CTOR_FROM_INT + +#if _AP_ENABLE_HALF_ == 1 + /// ctor from half. + // TODO optimize + INLINE ap_int_base(half op) { + ap_int_base<_AP_W, _AP_S> t((float)op); + Base::V = t.V; + } +#endif + + /// ctor from float. 
+ INLINE ap_int_base(float op) { + const int BITS = FLOAT_MAN + FLOAT_EXP + 1; + ap_int_base reg; + reg.V = floatToRawBits(op); + bool is_neg = _AP_ROOT_op_get_bit(reg.V, BITS - 1); + + ap_int_base exp = 0; + exp.V = _AP_ROOT_op_get_range(reg.V, FLOAT_MAN, BITS - 2); + exp = exp - FLOAT_BIAS; + + ap_int_base man; + man.V = _AP_ROOT_op_get_range(reg.V, 0, FLOAT_MAN - 1); + // check for NaN + _AP_WARNING(exp == ((unsigned char)(FLOAT_BIAS + 1)) && man.V != 0, + "assign NaN to ap integer value"); + // set leading 1. + man.V = _AP_ROOT_op_set_bit(man.V, FLOAT_MAN, 1); + //if (is_neg) man = -man; + + if ((reg.V & 0x7ffffffful) == 0) { + Base::V = 0; + } else { + int sh_amt = FLOAT_MAN - exp.V; + if (sh_amt == 0) { + Base::V = man.V; + } else if (sh_amt > 0) { + if (sh_amt < FLOAT_MAN + 2) { + Base::V = man.V >> sh_amt; + } else { + if (is_neg) + Base::V = -1; + else + Base::V = 0; + } + } else { + sh_amt = -sh_amt; + if (sh_amt < _AP_W) { + Base::V = man.V; + Base::V <<= sh_amt; + } else { + Base::V = 0; + } + } + } + if (is_neg) *this = -(*this); + } + + /// ctor from double. + INLINE ap_int_base(double op) { + const int BITS = DOUBLE_MAN + DOUBLE_EXP + 1; + ap_int_base reg; + reg.V = doubleToRawBits(op); + bool is_neg = _AP_ROOT_op_get_bit(reg.V, BITS - 1); + + ap_int_base exp = 0; + exp.V = _AP_ROOT_op_get_range(reg.V, DOUBLE_MAN, BITS - 2); + exp = exp - DOUBLE_BIAS; + + ap_int_base man; + man.V = _AP_ROOT_op_get_range(reg.V, 0, DOUBLE_MAN - 1); + // check for NaN + _AP_WARNING(exp == ((unsigned char)(DOUBLE_BIAS + 1)) && man.V != 0, + "assign NaN to ap integer value"); + // set leading 1. + man.V = _AP_ROOT_op_set_bit(man.V, DOUBLE_MAN, 1); + //if (is_neg) man = -man; + + if ((reg.V & 0x7fffffffffffffffull) == 0) { + Base::V = 0; + } else { + int sh_amt = DOUBLE_MAN - exp.V; + if (sh_amt == 0) { + Base::V = man.V; + } else if (sh_amt > 0) { + if (sh_amt < DOUBLE_MAN + 2) { + Base::V = man.V >> sh_amt; + } else { + if (is_neg) + Base::V = -1; + else + Base::V = 0; + } + } else { + sh_amt = -sh_amt; + if (sh_amt < _AP_W) { + Base::V = man.V; + Base::V <<= sh_amt; + } else { + Base::V = 0; + } + } + } + if (is_neg) *this = -(*this); + } + + /// from higer rank type. + template + INLINE ap_int_base( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = op.to_ap_int_base().V; + } + + template + INLINE ap_int_base(const ap_range_ref<_AP_W2, _AP_S2>& ref) { + Base::V = (ref.get()).V; + } + + template + INLINE ap_int_base(const ap_bit_ref<_AP_W2, _AP_S2>& ref) { + Base::V = ref.operator bool(); + } + + template + INLINE ap_int_base(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { + const ap_int_base::_AP_WR, + false> + tmp = ref.get(); + Base::V = tmp.V; + } + + /* radix has default value in set */ + +#ifndef __SYNTHESIS__ + INLINE ap_int_base(const char* s, signed char rd = 0) { + if (rd == 0) + rd = guess_radix(s); + unsigned int length = strlen(s); + Base::V.fromString(s, length, rd); + } +#else + // XXX __builtin_bit_from_string(...) requires const C string and radix. 
+ INLINE ap_int_base(const char* s) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_W, _AP_S, + AP_TRN, AP_WRAP, 0, _AP_C99); + Base::V = t; + } + INLINE ap_int_base(const char* s, signed char rd) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_W, _AP_S, + AP_TRN, AP_WRAP, 0, _AP_C99); + Base::V = t; + } +#endif + + template + INLINE ap_int_base( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + Base::V = (val.get()).V; + } + + template + INLINE ap_int_base( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + Base::V = val.operator bool(); + } + + INLINE ap_int_base read() volatile { + /*AP_DEBUG(printf("call read %d\n", Base::V););*/ + ap_int_base ret; + ret.V = Base::V; + return ret; + } + + INLINE void write(const ap_int_base<_AP_W, _AP_S>& op2) volatile { + /*AP_DEBUG(printf("call write %d\n", op2.V););*/ + Base::V = op2.V; + } + + /* Another form of "write".*/ + template + INLINE void operator=( + const volatile ap_int_base<_AP_W2, _AP_S2>& op2) volatile { + Base::V = op2.V; + } + + INLINE void operator=( + const volatile ap_int_base<_AP_W, _AP_S>& op2) volatile { + Base::V = op2.V; + } + + template + INLINE void operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) volatile { + Base::V = op2.V; + } + + INLINE void operator=(const ap_int_base<_AP_W, _AP_S>& op2) volatile { + Base::V = op2.V; + } + + template + INLINE ap_int_base& operator=( + const volatile ap_int_base<_AP_W2, _AP_S2>& op2) { + Base::V = op2.V; + return *this; + } + + template + INLINE ap_int_base& operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) { + Base::V = op2.V; + return *this; + } + + INLINE ap_int_base& operator=(const volatile ap_int_base<_AP_W, _AP_S>& op2) { + Base::V = op2.V; + return *this; + } + + INLINE ap_int_base& operator=(const ap_int_base<_AP_W, _AP_S>& op2) { + Base::V = op2.V; + return *this; + } + + +#define ASSIGN_OP_FROM_INT(Type, Size, Signed) \ + INLINE ap_int_base& operator=(Type op) { \ + Base::V = op; \ + return *this; \ + } + + ASSIGN_OP_FROM_INT(bool, 1, false) + ASSIGN_OP_FROM_INT(char, 8, CHAR_IS_SIGNED) + ASSIGN_OP_FROM_INT(signed char, 8, true) + ASSIGN_OP_FROM_INT(unsigned char, 8, false) + ASSIGN_OP_FROM_INT(short, _AP_SIZE_short, true) + ASSIGN_OP_FROM_INT(unsigned short, _AP_SIZE_short, false) + ASSIGN_OP_FROM_INT(int, _AP_SIZE_int, true) + ASSIGN_OP_FROM_INT(unsigned int, _AP_SIZE_int, false) + ASSIGN_OP_FROM_INT(long, _AP_SIZE_long, true) + ASSIGN_OP_FROM_INT(unsigned long, _AP_SIZE_long, false) + ASSIGN_OP_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) + ASSIGN_OP_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef ASSIGN_OP_FROM_INT + + template + INLINE ap_int_base& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& op2) { + Base::V = (bool)op2; + return *this; + } + + template + INLINE ap_int_base& operator=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + Base::V = (ap_int_base<_AP_W2, false>(op2)).V; + return *this; + } + + template + INLINE ap_int_base& operator=( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op2) { + Base::V = op2.get().V; + return *this; + } + + template + INLINE ap_int_base& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = op.to_ap_int_base().V; + return *this; + } + + template + INLINE ap_int_base& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = (bool)op; + return *this; + } + + template + INLINE ap_int_base& operator=( + const 
af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = ((const ap_int_base<_AP_W2, false>)(op)).V; + return *this; + } + + // FIXME: UG902 has clearly required user to use to_int() to convert to built-in + // types, but this implicit conversion is relied on in hls_cordic.h and hls_rsr.h. + // For example: + // int d_exp = fps_x.exp - fps_y.exp; + INLINE operator RetType() const { return (RetType)(Base::V); } + + /* Explicit conversions to C types. + * ---------------------------------------------------------------- + */ + INLINE bool to_bool() const { return (bool)(Base::V); } + INLINE char to_char() const { return (char)(Base::V); } + INLINE signed char to_schar() const { return (signed char)(Base::V); } + INLINE unsigned char to_uchar() const { return (unsigned char)(Base::V); } + INLINE short to_short() const { return (short)(Base::V); } + INLINE unsigned short to_ushort() const { return (unsigned short)(Base::V); } + INLINE int to_int() const { return (int)(Base::V); } + INLINE unsigned to_uint() const { return (unsigned)(Base::V); } + INLINE long to_long() const { return (long)(Base::V); } + INLINE unsigned long to_ulong() const { return (unsigned long)(Base::V); } + INLINE ap_slong to_int64() const { return (ap_slong)(Base::V); } + INLINE ap_ulong to_uint64() const { return (ap_ulong)(Base::V); } + INLINE float to_float() const { return (float)(Base::V); } + INLINE double to_double() const { return (double)(Base::V); } + + // TODO decide if user-defined conversion should be provided. +#if 0 + INLINE operator char() const { return (char)(Base::V); } + INLINE operator signed char() const { return (signed char)(Base::V); } + INLINE operator unsigned char() const { return (unsigned char)(Base::V); } + INLINE operator short() const { return (short)(Base::V); } + INLINE operator unsigned short() const { return (unsigned short)(Base::V); } + INLINE operator int() const { return (int)(Base::V); } + INLINE operator unsigned int () const { return (unsigned)(Base::V); } + INLINE operator long () const { return (long)(Base::V); } + INLINE operator unsigned long () const { return (unsigned long)(Base::V); } + INLINE operator ap_slong () { return (ap_slong)(Base::V); } + INLINE operator ap_ulong () { return (ap_ulong)(Base::V); } +#endif + + /* Helper methods. + ---------------------------------------------------------------- + */ + /* we cannot call a non-volatile function on a volatile instance. + * but calling a volatile function is ok. + * XXX deleted non-volatile version. 
+ */ + INLINE int length() const volatile { return _AP_W; } + + /*Return true if the value of ap_int_base instance is zero*/ + INLINE bool iszero() const { return Base::V == 0; } + + /*Return true if the value of ap_int_base instance is zero*/ + INLINE bool is_zero() const { return Base::V == 0; } + + /* x < 0 */ + INLINE bool sign() const { + if (_AP_S && + _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)) + return true; + else + return false; + } + + /* x[i] = 0 */ + INLINE void clear(int i) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 0); + } + + /* x[i] = !x[i]*/ + INLINE void invert(int i) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + bool val = _AP_ROOT_op_get_bit(Base::V, i); + if (val) + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 0); + else + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 1); + } + + INLINE bool test(int i) const { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + return _AP_ROOT_op_get_bit(Base::V, i); + } + + // Get self. For ap_concat_ref expansion. + INLINE ap_int_base& get() { return *this; } + + // Set the ith bit into 1 + INLINE void set(int i) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 1); + } + + // Set the ith bit into v + INLINE void set(int i, bool v) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + Base::V = _AP_ROOT_op_set_bit(Base::V, i, v); + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_int_base object n places to the left + INLINE ap_int_base& lrotate(int n) { + AP_ASSERT(n >= 0 && n < _AP_W, "shift value out of range"); + // TODO unify this. +#ifdef __SYNTHESIS__ + typeof(Base::V) l_p = Base::V << n; + typeof(Base::V) r_p = Base::V >> (_AP_W - n); + Base::V = l_p | r_p; +#else + Base::V.lrotate(n); +#endif + return *this; + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_int_base object n places to the right + INLINE ap_int_base& rrotate(int n) { + AP_ASSERT(n >= 0 && n < _AP_W, "shift value out of range"); + // TODO unify this. +#ifdef __SYNTHESIS__ + typeof(Base::V) l_p = Base::V << (_AP_W - n); + typeof(Base::V) r_p = Base::V >> n; + Base::V = l_p | r_p; +#else + Base::V.rrotate(n); +#endif + return *this; + } + + // Reverse the contents of ap_int_base instance. + // I.e. LSB becomes MSB and vise versa. + INLINE ap_int_base& reverse() { + Base::V = _AP_ROOT_op_get_range(Base::V, _AP_W - 1, 0); + return *this; + } + + // Set the ith bit into v + INLINE void set_bit(int i, bool v) { + Base::V = _AP_ROOT_op_set_bit(Base::V, i, v); + } + + // Get the value of ith bit + INLINE bool get_bit(int i) const { + return (bool)_AP_ROOT_op_get_bit(Base::V, i); + } + + // complements every bit + INLINE void b_not() { Base::V = ~Base::V; } + +#define OP_ASSIGN_AP(Sym) \ + template \ + INLINE ap_int_base& operator Sym(const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + Base::V Sym op2.V; \ + return *this; \ + } + + /* Arithmetic assign. + * ---------------------------------------------------------------- + */ + OP_ASSIGN_AP(*=) + OP_ASSIGN_AP(+=) + OP_ASSIGN_AP(-=) + OP_ASSIGN_AP(/=) + OP_ASSIGN_AP(%=) +#undef OP_ASSIGN_AP + + /* Bitwise assign: and, or, xor. 
+ * ---------------------------------------------------------------- + */ +#define OP_ASSIGN_AP_CHK(Sym) \ + template \ + INLINE ap_int_base& operator Sym(const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + _AP_WARNING((_AP_W != _AP_W2), \ + "Bitsize mismatch for ap_[u]int" #Sym "ap_[u]int."); \ + Base::V Sym op2.V; \ + return *this; \ + } + OP_ASSIGN_AP_CHK(&=) + OP_ASSIGN_AP_CHK(|=) + OP_ASSIGN_AP_CHK(^=) +#undef OP_ASSIGN_AP_CHK + + /* Prefix increment, decrement. + * ---------------------------------------------------------------- + */ + INLINE ap_int_base& operator++() { + operator+=((ap_int_base<1, false>)1); + return *this; + } + INLINE ap_int_base& operator--() { + operator-=((ap_int_base<1, false>)1); + return *this; + } + + /* Postfix increment, decrement + * ---------------------------------------------------------------- + */ + INLINE const typename RType<_AP_W,_AP_S>::arg1 operator++(int) { + ap_int_base t = *this; + operator+=((ap_int_base<1, false>)1); + return t; + } + INLINE const typename RType<_AP_W,_AP_S>::arg1 operator--(int) { + ap_int_base t = *this; + operator-=((ap_int_base<1, false>)1); + return t; + } + + /* Unary arithmetic. + * ---------------------------------------------------------------- + */ + INLINE typename RType<_AP_W,_AP_S>::arg1 operator+() const { return *this; } + + // TODO used to be W>64 only... need check. + INLINE typename RType<1, false>::minus operator-() const { + return ap_int_base<1, false>(0) - *this; + } + + /* Not (!) + * ---------------------------------------------------------------- + */ + INLINE bool operator!() const { return Base::V == 0; } + + /* Bitwise (arithmetic) unary: complement + ---------------------------------------------------------------- + */ + // XXX different from Mentor's ac_int! + INLINE typename RType<_AP_W,_AP_S>::arg1 operator~() const { + ap_int_base<_AP_W, _AP_S> r; + r.V = ~Base::V; + return r; + } + + /* Shift (result constrained by left operand). + * ---------------------------------------------------------------- + */ + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, true>& op2) const { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator>>(sh); + } else + return operator<<(sh); + } + + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, false>& op2) const { + ap_int_base r; + r.V = Base::V << op2.to_uint(); + return r; + } + + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, true>& op2) const { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator<<(sh); + } + return operator>>(sh); + } + + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, false>& op2) const { + ap_int_base r; + r.V = Base::V >> op2.to_uint(); + return r; + } + + // FIXME we standalone operator>> for ap_int_base and ap_range_ref. 
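[Editorial aside, not part of the patch.] The shift operators above treat a negative signed shift amount as a shift in the opposite direction rather than C-style undefined behaviour. A usage sketch, assuming ap_int.h from these headers is on the include path:

#include <ap_int.h>
#include <cassert>

int main() {
  ap_uint<8> x = 0x30;
  ap_int<4> n = -2;              // negative, signed shift amount
  assert((x << n) == (x >> 2));  // left shift by -2 becomes right shift by 2
  assert((x >> n) == (x << 2));  // and vice versa
  return 0;
}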
+#if 0 + template + INLINE ap_int_base operator<<(const ap_range_ref<_AP_W2, _AP_S2>& op2) const { + return *this << (op2.operator ap_int_base<_AP_W2, false>()); + } + + template + INLINE ap_int_base operator>>(const ap_range_ref<_AP_W2, _AP_S2>& op2) const { + return *this >> (op2.operator ap_int_base<_AP_W2, false>()); + } +#endif + + /* Shift assign + * ---------------------------------------------------------------- + */ + template + INLINE ap_int_base& operator<<=(const ap_int_base<_AP_W2, true>& op2) { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator>>=(sh); + } else + return operator<<=(sh); + } + + template + INLINE ap_int_base& operator<<=(const ap_int_base<_AP_W2, false>& op2) { + Base::V <<= op2.to_uint(); + return *this; + } + + template + INLINE ap_int_base& operator>>=(const ap_int_base<_AP_W2, true>& op2) { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator<<=(sh); + } + return operator>>=(sh); + } + + template + INLINE ap_int_base& operator>>=(const ap_int_base<_AP_W2, false>& op2) { + Base::V >>= op2.to_uint(); + return *this; + } + + // FIXME we standalone operator>> for ap_int_base and ap_range_ref. +#if 0 + template + INLINE ap_int_base& operator<<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return *this <<= (op2.operator ap_int_base<_AP_W2, false>()); + } + template + INLINE ap_int_base& operator>>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return *this >>= (op2.operator ap_int_base<_AP_W2, false>()); + } +#endif + + /* Equality and Relational. + * ---------------------------------------------------------------- + */ + template + INLINE bool operator==(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V == op2.V; + } + template + INLINE bool operator!=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return !(Base::V == op2.V); + } + template + INLINE bool operator<(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V < op2.V; + } + template + INLINE bool operator>=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V >= op2.V; + } + template + INLINE bool operator>(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V > op2.V; + } + template + INLINE bool operator<=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V <= op2.V; + } + + /* Bit and Part Select + * ---------------------------------------------------------------- + */ + INLINE ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { + _AP_ERROR(Hi >= _AP_W, "Hi(%d)out of bound(%d) in range()", Hi, _AP_W); + _AP_ERROR(Lo >= _AP_W, "Lo(%d)out of bound(%d) in range()", Lo, _AP_W); + return ap_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + // This is a must to strip constness to produce reference type. 
+ INLINE ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { + _AP_ERROR(Hi >= _AP_W, "Hi(%d)out of bound(%d) in range()", Hi, _AP_W); + _AP_ERROR(Lo >= _AP_W, "Lo(%d)out of bound(%d) in range()", Lo, _AP_W); + return ap_range_ref<_AP_W, _AP_S>(const_cast(this), Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE ap_range_ref<_AP_W, _AP_S> range() { + return this->range(_AP_W - 1, 0); + } + + INLINE ap_range_ref<_AP_W, _AP_S> range() const { + return this->range(_AP_W - 1, 0); + } + + INLINE ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { + return this->range(Hi, Lo); + } + + INLINE ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { + return this->range(Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + +#if 0 + template + INLINE ap_int_base slice() const { + AP_ASSERT(Hi >= Lo && Hi < _AP_W && Lo < _AP_W, "Out of bounds in slice()"); + ap_int_base tmp ; + tmp.V = _AP_ROOT_op_get_range(Base::V, Lo, Hi); + return tmp; + } + + INLINE ap_bit_ref<_AP_W,_AP_S> operator [] ( unsigned int uindex) { + AP_ASSERT(uindex < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W,_AP_S> bvh( this, uindex ); + return bvh; + } +#endif + + INLINE ap_bit_ref<_AP_W, _AP_S> operator[](int index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index); + return bvh; + } + + template + INLINE ap_bit_ref<_AP_W, _AP_S> operator[]( + const ap_int_base<_AP_W2, _AP_S2>& index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); + return bvh; + } + + INLINE bool operator[](int index) const { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> br(this, index); + return br.to_bool(); + } + template + INLINE bool operator[](const ap_int_base<_AP_W2, _AP_S2>& index) const { + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> br(this, index.to_int()); + return br.to_bool(); + } + + INLINE ap_bit_ref<_AP_W, _AP_S> bit(int index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index); + return bvh; + } + template + INLINE ap_bit_ref<_AP_W, _AP_S> bit( + const ap_int_base<_AP_W2, _AP_S2>& index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + 
AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); + return bvh; + } + + INLINE bool bit(int index) const { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> br(this, index); + return br.to_bool(); + } + + template + INLINE bool bit(const ap_int_base<_AP_W2, _AP_S2>& index) const { + return bit(index.to_int()); + } + +#if 0 + template + INLINE bool operator[](_AP_T index) const { + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W,_AP_S> br = operator[](index); + return br.to_bool(); + } +#endif + + // Count the number of zeros from the most significant bit + // to the first one bit. + INLINE int countLeadingZeros() { +#ifdef __SYNTHESIS__ + if (_AP_W <= 32) { + ap_int_base<32, false> t(-1UL), x; + x.V = _AP_ROOT_op_get_range(this->V, _AP_W - 1, 0); // reverse + t.V = _AP_ROOT_op_set_range(t.V, 0, _AP_W - 1, x.V); + return __builtin_ctz(t.V); // count trailing zeros. + } else if (_AP_W <= 64) { + ap_int_base<64, false> t(-1ULL); + ap_int_base<64, false> x; + x.V = _AP_ROOT_op_get_range(this->V, _AP_W - 1, 0); // reverse + t.V = _AP_ROOT_op_set_range(t.V, 0, _AP_W - 1, x.V); + return __builtin_ctzll(t.V); // count trailing zeros. + } else { + enum { __N = (_AP_W + 63) / 64 }; + int NZeros = 0; + int i = 0; + bool hitNonZero = false; + for (i = 0; i < __N - 1; ++i) { + ap_int_base<64, false> t; + t.V = _AP_ROOT_op_get_range(this->V, _AP_W - i * 64 - 64, _AP_W - i * 64 - 1); + NZeros += hitNonZero ? 0 : __builtin_clzll(t.V); // count leading zeros. + hitNonZero |= (t.V != 0); + } + if (!hitNonZero) { + ap_int_base<64, false> t(-1ULL); + enum { REST = (_AP_W - 1) % 64 }; + ap_int_base<64, false> x; + x.V = _AP_ROOT_op_get_range(this->V, 0, REST); + t.V = _AP_ROOT_op_set_range(t.V, 63 - REST, 63, x.V); + NZeros += __builtin_clzll(t.V); + } + return NZeros; + } +#else + return (Base::V).countLeadingZeros(); +#endif + } // countLeadingZeros + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + concat(const ap_int_base<_AP_W2, _AP_S2>& a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + concat(ap_int_base<_AP_W2, _AP_S2>& a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >(*this, a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) const { + return 
ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, + a2); + } + + template + INLINE ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) const { + return ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + const_cast&>(*this), + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + INLINE ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, + a2); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) const { + return ap_concat_ref< + _AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + const_cast&>(*this), + const_cast&>( + a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + _AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); + } + + template + INLINE ap_int_base operator&( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { + return *this & a2.get(); + } + + template + INLINE ap_int_base operator|( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, 
_AP_T3>& a2) { + return *this | a2.get(); + } + + template + INLINE ap_int_base operator^( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { + return *this ^ a2.get(); + } + + template + INLINE void set(const ap_int_base<_AP_W3, false>& val) { + Base::V = val.V; + } + + /* Reduce operations. + * ---------------------------------------------------------------- + */ + // XXX non-const version deleted. + INLINE bool and_reduce() const { return _AP_ROOT_op_reduce(and, Base::V); } + INLINE bool nand_reduce() const { return _AP_ROOT_op_reduce(nand, Base::V); } + INLINE bool or_reduce() const { return _AP_ROOT_op_reduce(or, Base::V); } + INLINE bool nor_reduce() const { return !(_AP_ROOT_op_reduce(or, Base::V)); } + INLINE bool xor_reduce() const { return _AP_ROOT_op_reduce (xor, Base::V); } + INLINE bool xnor_reduce() const { + return !(_AP_ROOT_op_reduce (xor, Base::V)); + } + + /* Output as a string. + * ---------------------------------------------------------------- + */ +#ifndef __SYNTHESIS__ + std::string to_string(signed char rd = 2, bool sign = _AP_S) const { + // XXX in autosim/autowrap.tcl "(${name}).to_string(2).c_str()" is used to + // initialize sc_lv, which seems incapable of handling format "-0b". + if (rd == 2) sign = false; + return (Base::V).to_string(rd, sign); + } +#else + INLINE char* to_string(signed char rd = 2, bool sign = _AP_S) const { + return 0; + } +#endif +}; // struct ap_int_base + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<(std::ostream& os, + const ap_int_base<_AP_W, _AP_S>& x) { + std::ios_base::fmtflags ff = std::cout.flags(); + if (ff & std::cout.hex) { + os << x.to_string(16); // don't print sign + } else if (ff & std::cout.oct) { + os << x.to_string(8); // don't print sign + } else { + os << x.to_string(10); + } + return os; +} +#endif // ifndef __SYNTHESIS__ + +#ifndef __SYNTHESIS__ +template +INLINE std::istream& operator>>(std::istream& in, + ap_int_base<_AP_W, _AP_S>& op) { + std::string str; + in >> str; + const std::ios_base::fmtflags basefield = in.flags() & std::ios_base::basefield; + unsigned radix = (basefield == std::ios_base::dec) ? 0 : ( + (basefield == std::ios_base::oct) ? 8 : ( + (basefield == std::ios_base::hex) ? 16 : 0)); + op = ap_int_base<_AP_W, _AP_S>(str.c_str(), radix); + return in; +} +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_AUTOCC + +/* Operators with another ap_int_base. 
+ * ---------------------------------------------------------------- + */ +#define OP_BIN_AP(Sym, Rty) \ + template \ + INLINE \ + typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::Rty \ + operator Sym(const ap_int_base<_AP_W, _AP_S>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base lhs(op); \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base rhs(op2); \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base ret; \ + ret.V = lhs.V Sym rhs.V; \ + return ret; \ + } + +OP_BIN_AP(*, mult) +OP_BIN_AP(+, plus) +OP_BIN_AP(-, minus) +OP_BIN_AP(&, logic) +OP_BIN_AP(|, logic) +OP_BIN_AP(^, logic) + +#define OP_BIN_AP2(Sym, Rty) \ + template \ + INLINE \ + typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::Rty \ + operator Sym(const ap_int_base<_AP_W, _AP_S>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base ret; \ + ret.V = op.V Sym op2.V; \ + return ret; \ + } + +OP_BIN_AP2(/, div) +OP_BIN_AP2(%, mod) + +// shift operators are defined inside class. +// compound assignment operators are defined inside class. + +/* Operators with a pointer type. + * ---------------------------------------------------------------- + * char a[100]; + * char* ptr = a; + * ap_int<2> n = 3; + * char* ptr2 = ptr + n*2; + * avoid ambiguous errors. + */ +#define OP_BIN_WITH_PTR(BIN_OP) \ + template \ + INLINE PTR_TYPE* operator BIN_OP(PTR_TYPE* i_op, \ + const ap_int_base<_AP_W, _AP_S>& op) { \ + ap_slong op2 = op.to_int64(); /* Not all implementation */ \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE PTR_TYPE* operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, \ + PTR_TYPE* i_op) { \ + ap_slong op2 = op.to_int64(); /* Not all implementation */ \ + return op2 BIN_OP i_op; \ + } + +OP_BIN_WITH_PTR(+) +OP_BIN_WITH_PTR(-) + +/* Operators with a native floating point types. + * ---------------------------------------------------------------- + */ +// float OP ap_int +// when ap_int's width > 64, then trunc ap_int to ap_int<64> +#define OP_BIN_WITH_FLOAT(BIN_OP, C_TYPE) \ + template \ + INLINE C_TYPE operator BIN_OP(C_TYPE i_op, \ + const ap_int_base<_AP_W, _AP_S>& op) { \ + typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE C_TYPE operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, \ + C_TYPE i_op) { \ + typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; \ + return op2 BIN_OP i_op; \ + } + +#define ALL_OP_WITH_FLOAT(C_TYPE) \ + OP_BIN_WITH_FLOAT(*, C_TYPE) \ + OP_BIN_WITH_FLOAT(/, C_TYPE) \ + OP_BIN_WITH_FLOAT(+, C_TYPE) \ + OP_BIN_WITH_FLOAT(-, C_TYPE) + +#if _AP_ENABLE_HALF_ == 1 +ALL_OP_WITH_FLOAT(half) +#endif +ALL_OP_WITH_FLOAT(float) +ALL_OP_WITH_FLOAT(double) + +// TODO no shift? + +/* Operators with a native integral types. + * ---------------------------------------------------------------- + */ +// arithmetic and bitwise operators. 
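[Editorial aside, not part of the patch.] The OP_BIN_AP operators above materialize the RType promotion rules: a product is W1 + W2 bits wide, a sum is max(W1, W2) + 1. A sketch checking this directly, assuming these headers and a C++11 host compiler:

#include <ap_int.h>

int main() {
  ap_int<8>  a = 100;
  ap_int<12> b = 2000;
  auto p = a * b;  // RType<12, true>::mult, i.e. ap_int<20>: 8 + 12 bits
  auto s = a + b;  // RType<12, true>::plus, i.e. ap_int<13>: max(8, 12) + 1
  static_assert(decltype(p)::width == 20, "product is W1 + W2 bits");
  static_assert(decltype(s)::width == 13, "sum is max(W1, W2) + 1 bits");
  return 0;
}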
+#define OP_BIN_WITH_INT(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(C_TYPE i_op, const ap_int_base<_AP_W, _AP_S>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(i_op) BIN_OP(op); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, C_TYPE i_op) { \ + return op BIN_OP ap_int_base<_AP_W2, _AP_S2>(i_op); \ + } + +#define ALL_OP_BIN_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + OP_BIN_WITH_INT(*, C_TYPE, _AP_W2, _AP_S2, mult) \ + OP_BIN_WITH_INT(+, C_TYPE, _AP_W2, _AP_S2, plus) \ + OP_BIN_WITH_INT(-, C_TYPE, _AP_W2, _AP_S2, minus) \ + OP_BIN_WITH_INT(/, C_TYPE, _AP_W2, _AP_S2, div) \ + OP_BIN_WITH_INT(%, C_TYPE, _AP_W2, _AP_S2, mod) \ + OP_BIN_WITH_INT(&, C_TYPE, _AP_W2, _AP_S2, logic) \ + OP_BIN_WITH_INT(|, C_TYPE, _AP_W2, _AP_S2, logic) \ + OP_BIN_WITH_INT(^, C_TYPE, _AP_W2, _AP_S2, logic) + +ALL_OP_BIN_WITH_INT(bool, 1, false) +ALL_OP_BIN_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_OP_BIN_WITH_INT(signed char, 8, true) +ALL_OP_BIN_WITH_INT(unsigned char, 8, false) +ALL_OP_BIN_WITH_INT(short, _AP_SIZE_short, true) +ALL_OP_BIN_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_OP_BIN_WITH_INT(int, _AP_SIZE_int, true) +ALL_OP_BIN_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_OP_BIN_WITH_INT(long, _AP_SIZE_long, true) +ALL_OP_BIN_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_OP_BIN_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +ALL_OP_BIN_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef OP_BIN_WITH_INT +#undef ALL_OP_BIN_WITH_INT + +// shift operators. +#define ALL_OP_SHIFT_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( \ + const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + ap_int_base<_AP_W, _AP_S> r; \ + if (_AP_S2) \ + r.V = op2 >= 0 ? (op.V << op2) : (op.V >> (-op2)); \ + else \ + r.V = op.V << op2; \ + return r; \ + } \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( \ + const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + ap_int_base<_AP_W, _AP_S> r; \ + if (_AP_S2) \ + r.V = op2 >= 0 ? 
(op.V >> op2) : (op.V << (-op2)); \ + else \ + r.V = op.V >> op2; \ + return r; \ + } + +ALL_OP_SHIFT_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_OP_SHIFT_WITH_INT(signed char, 8, true) +ALL_OP_SHIFT_WITH_INT(short, _AP_SIZE_short, true) +ALL_OP_SHIFT_WITH_INT(int, _AP_SIZE_int, true) +ALL_OP_SHIFT_WITH_INT(long, _AP_SIZE_long, true) +ALL_OP_SHIFT_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) + +#undef ALL_OP_SHIFT_WITH_INT + +#define ALL_OP_SHIFT_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( \ + const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + ap_int_base<_AP_W, _AP_S> r; \ + r.V = op.V << op2; \ + return r; \ + } \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( \ + const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + ap_int_base<_AP_W, _AP_S> r; \ + r.V = op.V >> op2; \ + return r; \ + } +ALL_OP_SHIFT_WITH_INT(bool, 1, false) +ALL_OP_SHIFT_WITH_INT(unsigned char, 8, false) +ALL_OP_SHIFT_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_OP_SHIFT_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_OP_SHIFT_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_OP_SHIFT_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef ALL_OP_SHIFT_WITH_INT + +// compound assign operators. +#define OP_ASSIGN_WITH_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_int_base<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + return op ASSIGN_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } + +// TODO int a; ap_int<16> b; a += b; + +#define ALL_OP_ASSIGN_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(+=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(-=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(*=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(/=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(%=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(&=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(|=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(^=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(>>=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(<<=, C_TYPE, _AP_W2, _AP_S2) + +ALL_OP_ASSIGN_WITH_INT(bool, 1, false) +ALL_OP_ASSIGN_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_OP_ASSIGN_WITH_INT(signed char, 8, true) +ALL_OP_ASSIGN_WITH_INT(unsigned char, 8, false) +ALL_OP_ASSIGN_WITH_INT(short, _AP_SIZE_short, true) +ALL_OP_ASSIGN_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_OP_ASSIGN_WITH_INT(int, _AP_SIZE_int, true) +ALL_OP_ASSIGN_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_OP_ASSIGN_WITH_INT(long, _AP_SIZE_long, true) +ALL_OP_ASSIGN_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_OP_ASSIGN_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +ALL_OP_ASSIGN_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef OP_ASSIGN_WITH_INT +#undef ALL_OP_ASSIGN_WITH_INT + +// equality and relational operators. 
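[Editorial aside, not part of the patch.] In the relational macros that follow, the native operand is first lifted to an ap_int_base of its natural width and signedness, so mixed comparisons are evaluated on values. A sketch, assuming these headers:

#include <ap_int.h>
#include <cassert>

int main() {
  ap_int<4> v = -3;  // 4-bit signed, range [-8, 7]
  assert(v < 0);     // the int literal becomes ap_int_base<32, true>(0)
  assert(v == -3);
  assert(v != 5);
  return 0;
}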
+#define OP_REL_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(C_TYPE i_op, \ + const ap_int_base<_AP_W, _AP_S>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(i_op) REL_OP op; \ + } \ + template \ + INLINE bool operator REL_OP(const ap_int_base<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return op REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } + +#define ALL_OP_REL_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + OP_REL_WITH_INT(>, C_TYPE, _AP_W2, _AP_S2) \ + OP_REL_WITH_INT(<, C_TYPE, _AP_W2, _AP_S2) \ + OP_REL_WITH_INT(>=, C_TYPE, _AP_W2, _AP_S2) \ + OP_REL_WITH_INT(<=, C_TYPE, _AP_W2, _AP_S2) \ + OP_REL_WITH_INT(==, C_TYPE, _AP_W2, _AP_S2) \ + OP_REL_WITH_INT(!=, C_TYPE, _AP_W2, _AP_S2) + +ALL_OP_REL_WITH_INT(bool, 1, false) +ALL_OP_REL_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_OP_REL_WITH_INT(signed char, 8, true) +ALL_OP_REL_WITH_INT(unsigned char, 8, false) +ALL_OP_REL_WITH_INT(short, _AP_SIZE_short, true) +ALL_OP_REL_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_OP_REL_WITH_INT(int, _AP_SIZE_int, true) +ALL_OP_REL_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_OP_REL_WITH_INT(long, _AP_SIZE_long, true) +ALL_OP_REL_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_OP_REL_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +ALL_OP_REL_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef OP_REL_WITH_INT +#undef ALL_OP_BIN_WITH_INT + +#define OP_REL_WITH_DOUBLE_OR_FLOAT(Sym) \ + template \ + INLINE bool operator Sym(const ap_int_base<_AP_W, _AP_S>& op1, \ + double op2) { \ + return op1.to_double() Sym op2 ; \ + } \ + template \ + INLINE bool operator Sym(double op1, \ + const ap_int_base<_AP_W, _AP_S>& op2) { \ + return op1 Sym op2.to_double() ; \ + } \ + template \ + INLINE bool operator Sym(const ap_int_base<_AP_W, _AP_S>& op1, \ + float op2) { \ + return op1.to_double() Sym op2 ; \ + } \ + template \ + INLINE bool operator Sym(float op1, \ + const ap_int_base<_AP_W, _AP_S>& op2) { \ + return op1 Sym op2.to_double() ; \ + } + OP_REL_WITH_DOUBLE_OR_FLOAT(>) + OP_REL_WITH_DOUBLE_OR_FLOAT(<) + OP_REL_WITH_DOUBLE_OR_FLOAT(>=) + OP_REL_WITH_DOUBLE_OR_FLOAT(<=) + OP_REL_WITH_DOUBLE_OR_FLOAT(==) + OP_REL_WITH_DOUBLE_OR_FLOAT(!=) + +#undef OP_REL_WITH_DOUBLE_OR_FLOAT + + +/* Operators with ap_bit_ref. + * ------------------------------------------------------------ + */ +// arithmetic, bitwise and shift operators. +#define OP_BIN_WITH_RANGE(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<_AP_W1, false>(op1) BIN_OP op2; \ + } \ + template \ + INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 BIN_OP ap_int_base<_AP_W2, false>(op2); \ + } + +OP_BIN_WITH_RANGE(+, plus) +OP_BIN_WITH_RANGE(-, minus) +OP_BIN_WITH_RANGE(*, mult) +OP_BIN_WITH_RANGE(/, div) +OP_BIN_WITH_RANGE(%, mod) +OP_BIN_WITH_RANGE(&, logic) +OP_BIN_WITH_RANGE(|, logic) +OP_BIN_WITH_RANGE(^, logic) +OP_BIN_WITH_RANGE(>>, arg1) +OP_BIN_WITH_RANGE(<<, arg1) + +#undef OP_BIN_WITH_RANGE + +// compound assignment operators. 
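[Editorial aside, not part of the patch.] The range-reference operators above, and the compound assignments that follow, let a slice such as x(7, 4) act as an unsigned value of the slice width in arithmetic, while assignment through the slice writes back into the parent. A sketch, assuming these headers:

#include <ap_int.h>
#include <cassert>

int main() {
  ap_uint<8> x = 0xA5;
  ap_uint<4> hi = x(7, 4) + ap_uint<4>(1);  // slice 0xA plus 1
  assert(hi == 0xB);
  x(3, 0) = 0xF;                            // write back through the slice
  assert(x == 0xAF);
  return 0;
}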
+#define OP_ASSIGN_WITH_RANGE(ASSIGN_OP) \ + template \ + INLINE ap_int_base<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_int_base<_AP_W1, _AP_S1>& op1, ap_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 ASSIGN_OP ap_int_base<_AP_W2, false>(op2); \ + } \ + template \ + INLINE ap_range_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_range_ref<_AP_W1, _AP_S1>& op1, ap_int_base<_AP_W2, _AP_S2>& op2) { \ + ap_int_base<_AP_W1, false> tmp(op1); \ + tmp ASSIGN_OP op2; \ + op1 = tmp; \ + return op1; \ + } + +OP_ASSIGN_WITH_RANGE(+=) +OP_ASSIGN_WITH_RANGE(-=) +OP_ASSIGN_WITH_RANGE(*=) +OP_ASSIGN_WITH_RANGE(/=) +OP_ASSIGN_WITH_RANGE(%=) +OP_ASSIGN_WITH_RANGE(&=) +OP_ASSIGN_WITH_RANGE(|=) +OP_ASSIGN_WITH_RANGE(^=) +OP_ASSIGN_WITH_RANGE(>>=) +OP_ASSIGN_WITH_RANGE(<<=) + +#undef OP_ASSIGN_WITH_RANGE + +// equality and relational operators +#define OP_REL_WITH_RANGE(REL_OP) \ + template \ + INLINE bool operator REL_OP(const ap_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<_AP_W1, false>(op1).operator REL_OP(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator REL_OP(op2.operator ap_int_base<_AP_W2, false>()); \ + } + +OP_REL_WITH_RANGE(==) +OP_REL_WITH_RANGE(!=) +OP_REL_WITH_RANGE(>) +OP_REL_WITH_RANGE(>=) +OP_REL_WITH_RANGE(<) +OP_REL_WITH_RANGE(<=) + +#undef OP_REL_WITH_RANGE + +/* Operators with ap_bit_ref. + * ------------------------------------------------------------ + */ +// arithmetic, bitwise and shift operators. +#define OP_BIN_WITH_BIT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 BIN_OP ap_int_base<1, false>(op2); \ + } \ + template \ + INLINE typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP(const ap_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<1, false>(op1) BIN_OP op2; \ + } + +OP_BIN_WITH_BIT(+, plus) +OP_BIN_WITH_BIT(-, minus) +OP_BIN_WITH_BIT(*, mult) +OP_BIN_WITH_BIT(/, div) +OP_BIN_WITH_BIT(%, mod) +OP_BIN_WITH_BIT(&, logic) +OP_BIN_WITH_BIT(|, logic) +OP_BIN_WITH_BIT(^, logic) +OP_BIN_WITH_BIT(>>, arg1) +OP_BIN_WITH_BIT(<<, arg1) + +#undef OP_BIN_WITH_BIT + +// compound assignment operators. +#define OP_ASSIGN_WITH_BIT(ASSIGN_OP) \ + template \ + INLINE ap_int_base<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_int_base<_AP_W1, _AP_S1>& op1, ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 ASSIGN_OP ap_int_base<1, false>(op2); \ + } \ + template \ + INLINE ap_bit_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_bit_ref<_AP_W1, _AP_S1>& op1, ap_int_base<_AP_W2, _AP_S2>& op2) { \ + ap_int_base<1, false> tmp(op1); \ + tmp ASSIGN_OP op2; \ + op1 = tmp; \ + return op1; \ + } + +OP_ASSIGN_WITH_BIT(+=) +OP_ASSIGN_WITH_BIT(-=) +OP_ASSIGN_WITH_BIT(*=) +OP_ASSIGN_WITH_BIT(/=) +OP_ASSIGN_WITH_BIT(%=) +OP_ASSIGN_WITH_BIT(&=) +OP_ASSIGN_WITH_BIT(|=) +OP_ASSIGN_WITH_BIT(^=) +OP_ASSIGN_WITH_BIT(>>=) +OP_ASSIGN_WITH_BIT(<<=) + +#undef OP_ASSIGN_WITH_BIT + +// equality and relational operators. 
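[Editorial aside, not part of the patch.] The single-bit references handled here behave similarly: x[i] yields an ap_bit_ref that reads as a bool and can be assigned through. A sketch, assuming these headers:

#include <ap_int.h>
#include <cassert>

int main() {
  ap_uint<8> x = 0x81;
  assert(x[7] && x[0]);  // ap_bit_ref converts to bool
  x[0] = 0;              // write through the bit reference
  assert(x == 0x80);
  return 0;
}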
+#define OP_REL_WITH_BIT(REL_OP) \ + template \ + INLINE bool operator REL_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 REL_OP ap_int_base<1, false>(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<1, false>(op1) REL_OP op2; \ + } + +OP_REL_WITH_BIT(==) +OP_REL_WITH_BIT(!=) +OP_REL_WITH_BIT(>) +OP_REL_WITH_BIT(>=) +OP_REL_WITH_BIT(<) +OP_REL_WITH_BIT(<=) + +#undef OP_REL_WITH_BIT + + +/* Operators with ap_concat_ref. + * ------------------------------------------------------------ + */ +// arithmetic, bitwise and shift operators. +// bitwise operators are defined in struct. +// TODO specify whether to define arithmetic and bitwise operators. +#if 0 +#define OP_BIN_WITH_CONCAT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W3, _AP_S3>::template RType<_AP_W1 + _AP_W2, \ + false>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W3, _AP_S3>& op1, \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1 BIN_OP op2.get(); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W1 + _AP_W2, \ + false>::template RType<_AP_W3, _AP_S3>::RTYPE \ + operator BIN_OP(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ + const ap_int_base<_AP_W3, _AP_S3>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1.get() BIN_OP op2; \ + } + +OP_BIN_WITH_CONCAT(+, plus) +OP_BIN_WITH_CONCAT(-, minus) +OP_BIN_WITH_CONCAT(*, mult) +OP_BIN_WITH_CONCAT(/, div) +OP_BIN_WITH_CONCAT(%, mod) +OP_BIN_WITH_CONCAT(&, logic) +OP_BIN_WITH_CONCAT(|, logic) +OP_BIN_WITH_CONCAT(^, logic) +OP_BIN_WITH_CONCAT(>>, arg1) +OP_BIN_WITH_CONCAT(<<, arg1) + +#undef OP_BIN_WITH_CONCAT + +// compound assignment operators. +#define OP_ASSIGN_WITH_CONCAT(ASSIGN_OP) \ + template \ + INLINE typename ap_int_base<_AP_W3, _AP_S3>::template RType<_AP_W1 + _AP_W2, \ + false>::RTYPE \ + operator ASSIGN_OP( \ + const ap_int_base<_AP_W3, _AP_S3>& op1, \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1 ASSIGN_OP op2.get(); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W1 + _AP_W2, \ + false>::template RType<_AP_W3, _AP_S3>::RTYPE \ + operator ASSIGN_OP(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ + const ap_int_base<_AP_W3, _AP_S3>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + ap_int_base<_AP_W1 + _AP_W2, false> tmp = op1.get(); \ + tmp ASSIGN_OP op2; \ + op1 = tmp; \ + return op1; \ + } + +OP_ASSIGN_WITH_CONCAT(+=) +OP_ASSIGN_WITH_CONCAT(-=) +OP_ASSIGN_WITH_CONCAT(*=) +OP_ASSIGN_WITH_CONCAT(/=) +OP_ASSIGN_WITH_CONCAT(%=) +OP_ASSIGN_WITH_CONCAT(&=) +OP_ASSIGN_WITH_CONCAT(|=) +OP_ASSIGN_WITH_CONCAT(^=) +OP_ASSIGN_WITH_CONCAT(>>=) +OP_ASSIGN_WITH_CONCAT(<<=) + +#undef OP_ASSIGN_WITH_CONCAT +#endif + +// equality and relational operators. 
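[Editorial aside, not part of the patch.] The concatenation references compared by the macros below are built with the overloaded comma operator: (a, b) is a _AP_W1 + _AP_W2 bit ap_concat_ref with the first operand in the high bits. A sketch, assuming these headers:

#include <ap_int.h>
#include <cassert>

int main() {
  ap_uint<4> a = 0xA;
  ap_uint<4> b = 0x5;
  ap_uint<8> c = (a, b);               // concat: a is the high nibble
  assert(c == 0xA5);
  assert((a, b) == ap_uint<8>(0xA5));  // relational op on the concat ref
  return 0;
}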
+#define OP_REL_WITH_CONCAT(REL_OP) \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W3, _AP_S3>& op1, \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1 REL_OP op2.get(); \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ + const ap_int_base<_AP_W3, _AP_S3>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1.get() REL_OP op2; \ + } + +OP_REL_WITH_CONCAT(==) +OP_REL_WITH_CONCAT(!=) +OP_REL_WITH_CONCAT(>) +OP_REL_WITH_CONCAT(>=) +OP_REL_WITH_CONCAT(<) +OP_REL_WITH_CONCAT(<=) + +#undef OP_REL_WITH_CONCAT + +#endif // ifndef __cplusplus +#endif // ifndef __AP_INT_BASE_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_int_ref.h b/hls4ml/templates/vivado/ap_types/ap_int_ref.h index 421f09fda6..c675ddd4b6 100644 --- a/hls4ml/templates/vivado/ap_types/ap_int_ref.h +++ b/hls4ml/templates/vivado/ap_types/ap_int_ref.h @@ -1,1346 +1,1346 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_INT_REF_H__ -#define __AP_INT_REF_H__ - -#ifndef __AP_INT_H__ -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." -#endif - -#ifndef __cplusplus -#error "C++ is required to include this header file" - -#else - -#ifndef __SYNTHESIS__ -#include -#endif - -/* Concatination reference. - ---------------------------------------------------------------- -*/ -template -struct ap_concat_ref { - enum { - _AP_WR = _AP_W1 + _AP_W2, - }; - - _AP_T1& mbv1; - _AP_T2& mbv2; - - INLINE ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& ref) - : mbv1(ref.mbv1), mbv2(ref.mbv2) {} - - INLINE ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {} - - template - INLINE ap_concat_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { - ap_int_base<_AP_W1 + _AP_W2, false> vval(val); - int W_ref1 = mbv1.length(); - int W_ref2 = mbv2.length(); - ap_int_base<_AP_W1, false> Part1; - Part1.V = _AP_ROOT_op_get_range(vval.V, W_ref2, W_ref1 + W_ref2 - 1); - mbv1.set(Part1); - ap_int_base<_AP_W2, false> Part2; - Part2.V = _AP_ROOT_op_get_range(vval.V, 0, W_ref2 - 1); - mbv2.set(Part2); - return *this; - } - - // assign op from hls supported C integral types. 
- // FIXME disabled to support legacy code directly assign from sc_signal - //template - //INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, - // ap_concat_ref&>::type - //operator=(T val) { - // ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); - // return operator=(tmpVal); - //} -#define ASSIGN_WITH_CTYPE(_Tp) \ - INLINE ap_concat_ref& operator=(_Tp val) { \ - ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); \ - return operator=(tmpVal); \ - } - - ASSIGN_WITH_CTYPE(bool) - ASSIGN_WITH_CTYPE(char) - ASSIGN_WITH_CTYPE(signed char) - ASSIGN_WITH_CTYPE(unsigned char) - ASSIGN_WITH_CTYPE(short) - ASSIGN_WITH_CTYPE(unsigned short) - ASSIGN_WITH_CTYPE(int) - ASSIGN_WITH_CTYPE(unsigned int) - ASSIGN_WITH_CTYPE(long) - ASSIGN_WITH_CTYPE(unsigned long) - ASSIGN_WITH_CTYPE(ap_slong) - ASSIGN_WITH_CTYPE(ap_ulong) -#if _AP_ENABLE_HALF_ == 1 - ASSIGN_WITH_CTYPE(half) -#endif - ASSIGN_WITH_CTYPE(float) - ASSIGN_WITH_CTYPE(double) - -#undef ASSIGN_WITH_CTYPE - - // Be explicit to prevent it from being deleted, as field d_bv - // is of reference type. - INLINE ap_concat_ref& operator=( - const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) { - ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); - return operator=(tmpVal); - } - - template - INLINE ap_concat_ref& operator=( - const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { - ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); - return operator=(tmpVal); - } - - template - INLINE ap_concat_ref& operator=(const ap_bit_ref<_AP_W3, _AP_S3>& val) { - ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); - return operator=(tmpVal); - } - template - INLINE ap_concat_ref& operator=(const ap_range_ref<_AP_W3, _AP_S3>& val) { - ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); - return operator=(tmpVal); - } - - template - INLINE ap_concat_ref& operator=( - const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { - return operator=((const ap_int_base<_AP_W3, false>)(val)); - } - - template - INLINE ap_concat_ref& operator=( - const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& - val) { - return operator=(val.to_ap_int_base()); - } - - template - INLINE ap_concat_ref& operator=( - const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { - return operator=((ap_ulong)(bool)(val)); - } - - INLINE operator ap_int_base<_AP_WR, false>() const { return get(); } - - INLINE operator ap_ulong() const { return get().to_uint64(); } - - template - INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, - ap_range_ref<_AP_W3, _AP_S3> > - operator,(const ap_range_ref<_AP_W3, _AP_S3> &a2) { - return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, - ap_range_ref<_AP_W3, _AP_S3> >( - *this, const_cast&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > - operator,(ap_int_base<_AP_W3, _AP_S3> &a2) { - return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, - ap_int_base<_AP_W3, _AP_S3> >(*this, a2); - } - - template - INLINE - ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > - operator,(volatile ap_int_base<_AP_W3, _AP_S3> &a2) { - return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, - ap_int_base<_AP_W3, _AP_S3> >( - *this, const_cast&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > - operator,(const ap_int_base<_AP_W3, _AP_S3> &a2) { - return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, - ap_int_base<_AP_W3, _AP_S3> >( - *this, const_cast&>(a2)); - } - - template - INLINE - 
ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > - operator,(const volatile ap_int_base<_AP_W3, _AP_S3> &a2) { - // FIXME op's life does not seem long enough - ap_int_base<_AP_W3, _AP_S3> op(a2); - return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, - ap_int_base<_AP_W3, _AP_S3> >( - *this, const_cast&>(op)); - } - - template - INLINE ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> > - operator,(const ap_bit_ref<_AP_W3, _AP_S3> &a2) { - return ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, - ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > - operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { - return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, - ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref< - _AP_WR, ap_concat_ref, _AP_W3, - af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > - operator,( - const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) { - return ap_concat_ref< - _AP_WR, ap_concat_ref, _AP_W3, - af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( - *this, - const_cast< - af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_WR, ap_concat_ref, 1, - af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > - operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> - &a2) { - return ap_concat_ref< - _AP_WR, ap_concat_ref, 1, - af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( - *this, - const_cast&>( - a2)); - } - - template - INLINE ap_int_base operator&( - const ap_int_base<_AP_W3, _AP_S3>& a2) { - return get() & a2; - } - - template - INLINE ap_int_base operator|( - const ap_int_base<_AP_W3, _AP_S3>& a2) { - return get() | a2; - } - - template - INLINE ap_int_base operator^( - const ap_int_base<_AP_W3, _AP_S3>& a2) { - return get() ^ a2; - } - -#if 0 - template - INLINE ap_int_base slice() { - ap_int_base<_AP_WR, false> bv = get(); - return bv.slice(); - } -#endif - - INLINE ap_int_base<_AP_WR, false> get() const { - ap_int_base<_AP_WR, false> tmpVal(0); - int W_ref1 = mbv1.length(); - int W_ref2 = mbv2.length(); - ap_int_base<_AP_W2, false> v2(mbv2); - ap_int_base<_AP_W1, false> v1(mbv1); - tmpVal.V = _AP_ROOT_op_set_range(tmpVal.V, 0, W_ref2 - 1, v2.V); - tmpVal.V = - _AP_ROOT_op_set_range(tmpVal.V, W_ref2, W_ref1 + W_ref2 - 1, v1.V); - return tmpVal; - } - - template - INLINE void set(const ap_int_base<_AP_W3, false>& val) { - ap_int_base<_AP_W1 + _AP_W2, false> vval(val); - int W_ref1 = mbv1.length(); - int W_ref2 = mbv2.length(); - ap_int_base<_AP_W1, false> tmpVal1; - tmpVal1.V = _AP_ROOT_op_get_range(vval.V, W_ref2, W_ref1 + W_ref2 - 1); - mbv1.set(tmpVal1); - ap_int_base<_AP_W2, false> tmpVal2; - tmpVal2.V = _AP_ROOT_op_get_range(vval.V, 0, W_ref2 - 1); - mbv2.set(tmpVal2); - } - - INLINE int length() const { return mbv1.length() + mbv2.length(); } -}; // struct ap_concat_ref - -/* Range (slice) reference. - ---------------------------------------------------------------- -*/ -template -struct ap_range_ref { - // struct ssdm_int or its sim model. - // TODO make it possible to reference to ap_fixed_base/ap_fixed/ap_ufixed - // and then we can retire af_range_ref. 
- typedef ap_int_base<_AP_W, _AP_S> ref_type; - ref_type& d_bv; - int l_index; - int h_index; - - public: - INLINE ap_range_ref(const ap_range_ref<_AP_W, _AP_S>& ref) - : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} - - INLINE ap_range_ref(ref_type* bv, int h, int l) - : d_bv(*bv), l_index(l), h_index(h) {} - - INLINE ap_range_ref(const ref_type* bv, int h, int l) - : d_bv(*const_cast(bv)), l_index(l), h_index(h) {} - - INLINE operator ap_int_base<_AP_W, false>() const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret; - } - - INLINE operator ap_ulong() const { return to_uint64(); } - - /// @name assign operators - // @{ - - // FIXME disabled to work-around lagacy code assigning from sc_signal, - // which dependes on implicit type conversion. - // - // /// assign from hls supported C integral types. - // template - // INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, - // ap_range_ref&>::type - // operator=(T val) { - // ap_int_base<_AP_W, false> tmp(val); - // d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); - // return *this; - // } -#define ASSIGN_WITH_CTYPE(_Tp) \ - INLINE ap_range_ref& operator=(_Tp val) { \ - ap_int_base<_AP_W, false> tmp(val); \ - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); \ - return *this; \ - } - - ASSIGN_WITH_CTYPE(bool) - ASSIGN_WITH_CTYPE(char) - ASSIGN_WITH_CTYPE(signed char) - ASSIGN_WITH_CTYPE(unsigned char) - ASSIGN_WITH_CTYPE(short) - ASSIGN_WITH_CTYPE(unsigned short) - ASSIGN_WITH_CTYPE(int) - ASSIGN_WITH_CTYPE(unsigned int) - ASSIGN_WITH_CTYPE(long) - ASSIGN_WITH_CTYPE(unsigned long) - ASSIGN_WITH_CTYPE(ap_slong) - ASSIGN_WITH_CTYPE(ap_ulong) -#if _AP_ENABLE_HALF_ == 1 - ASSIGN_WITH_CTYPE(half) -#endif - ASSIGN_WITH_CTYPE(float) - ASSIGN_WITH_CTYPE(double) - -#undef ASSIGN_WITH_CTYPE - - /// assign using string. XXX crucial for cosim. - INLINE ap_range_ref& operator=(const char* val) { - const ap_int_base<_AP_W, false> tmp(val); // XXX figure out radix - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); - return *this; - } - - /// assign from ap_int_base. - template - INLINE ap_range_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { - ap_int_base<_AP_W, false> tmp(val); - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); - return *this; - } - - /// copy assign operator - // XXX Be explicit to prevent it from being deleted, as field d_bv - // is of reference type. - INLINE ap_range_ref& operator=(const ap_range_ref& val) { - return operator=((const ap_int_base<_AP_W, false>)val); - } - - /// assign from range reference to ap_int_base. - template - INLINE ap_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { - return operator=((const ap_int_base<_AP_W2, false>)val); - } - - /// assign from bit reference to ap_int_base. - template - INLINE ap_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { - return operator=((ap_ulong)(bool)(val)); - } - - /// assign from ap_fixed_base. - template - INLINE ap_range_ref& operator=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& - val) { - return operator=(val.to_ap_int_base()); - } - - /// assign from range reference to ap_fixed_base. - template - INLINE ap_range_ref& operator=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=((const ap_int_base<_AP_W2, false>)val); - } - - /// assign from bit reference to ap_fixed_base. 
- template - INLINE ap_range_ref& operator=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=((ap_ulong)(bool)(val)); - } - - /// assign from compound reference. - template - INLINE ap_range_ref& operator=( - const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { - return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)(val)); - } - // @} - - template - INLINE - ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > - operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, - ap_range_ref<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >(*this, a2); - } - - INLINE - ap_concat_ref<_AP_W, ap_range_ref, _AP_W, ap_int_base<_AP_W, _AP_S> > - operator,(ap_int_base<_AP_W, _AP_S>& a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W, - ap_int_base<_AP_W, _AP_S> >(*this, a2); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > - operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > - operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref< - _AP_W, ap_range_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> a2) { - return ap_concat_ref< - _AP_W, ap_range_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast< - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_range_ref, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> - &a2) { - return ap_concat_ref< - _AP_W, ap_range_ref, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast&>( - a2)); - } - - template - INLINE bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> hop(op2); - return lop == hop; - } - - template - INLINE bool 
operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return !(operator==(op2)); - } - - template - INLINE bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> hop(op2); - return lop < hop; - } - - template - INLINE bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> hop(op2); - return lop <= hop; - } - - template - INLINE bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return !(operator<=(op2)); - } - - template - INLINE bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return !(operator<(op2)); - } - - template - INLINE ap_range_ref<_AP_W, _AP_S>& operator|=( - const ap_range_ref<_AP_W2, _AP_S2>& op2) { - (this->d_bv).V |= (op2.d_bv).V; - return *this; - }; - - template - INLINE ap_range_ref<_AP_W, _AP_S>& operator|=( - const ap_int_base<_AP_W2, _AP_S2>& op2) { - (this->d_bv).V |= op2.V; - return *this; - }; - - template - INLINE ap_range_ref<_AP_W, _AP_S>& operator&=( - const ap_range_ref<_AP_W2, _AP_S2>& op2) { - (this->d_bv).V &= (op2.d_bv).V; - return *this; - }; - - template - INLINE ap_range_ref<_AP_W, _AP_S>& operator&=( - const ap_int_base<_AP_W2, _AP_S2>& op2) { - (this->d_bv).V &= op2.V; - return *this; - }; - - template - INLINE ap_range_ref<_AP_W, _AP_S>& operator^=( - const ap_range_ref<_AP_W2, _AP_S2>& op2) { - (this->d_bv).V ^= (op2.d_bv).V; - return *this; - }; - - template - INLINE ap_range_ref<_AP_W, _AP_S>& operator^=( - const ap_int_base<_AP_W2, _AP_S2>& op2) { - (this->d_bv).V ^= op2.V; - return *this; - }; - - INLINE ap_int_base<_AP_W, false> get() const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret; - } - - template - INLINE void set(const ap_int_base<_AP_W2, false>& val) { - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); - } - - INLINE int length() const { - return h_index >= l_index ? h_index - l_index + 1 : l_index - h_index + 1; - } - - INLINE int to_int() const { - return (int)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE unsigned to_uint() const { - return (unsigned)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE long to_long() const { - return (long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE unsigned long to_ulong() const { - return (unsigned long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE ap_slong to_int64() const { - return (ap_slong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE ap_ulong to_uint64() const { - return (ap_ulong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE bool and_reduce() const { - bool ret = true; - bool reverse = l_index > h_index; - unsigned low = reverse ? h_index : l_index; - unsigned high = reverse ? l_index : h_index; - for (unsigned i = low; i != high; ++i) { -#ifdef __SYNTHESIS__ -#pragma HLS unroll -#endif - ret &= _AP_ROOT_op_get_bit(d_bv.V, i); - } - return ret; - } - - INLINE bool or_reduce() const { - bool ret = false; - bool reverse = l_index > h_index; - unsigned low = reverse ? h_index : l_index; - unsigned high = reverse ? l_index : h_index; - for (unsigned i = low; i != high; ++i) { -#ifdef __SYNTHESIS__ -#pragma HLS unroll -#endif - ret |= _AP_ROOT_op_get_bit(d_bv.V, i); - } - return ret; - } - - INLINE bool xor_reduce() const { - bool ret = false; - bool reverse = l_index > h_index; - unsigned low = reverse ? 
h_index : l_index; - unsigned high = reverse ? l_index : h_index; - for (unsigned i = low; i != high; ++i) { -#ifdef __SYNTHESIS__ -#pragma HLS unroll -#endif - ret ^= _AP_ROOT_op_get_bit(d_bv.V, i); - } - return ret; - } -#ifndef __SYNTHESIS__ - std::string to_string(signed char radix = 2) const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret.to_string(radix); - } -#else - // XXX HLS will delete this in synthesis - INLINE char* to_string(signed char radix = 2) const { - return 0; - } -#endif -}; // struct ap_range_ref - -// XXX apcc cannot handle global std::ios_base::Init() brought in by -#ifndef AP_AUTOCC -#ifndef __SYNTHESIS__ -template -INLINE std::ostream& operator<<(std::ostream& os, - const ap_range_ref<_AP_W, _AP_S>& x) { - std::ios_base::fmtflags ff = std::cout.flags(); - if (ff & std::cout.hex) { - os << x.to_string(16); // don't print sign - } else if (ff & std::cout.oct) { - os << x.to_string(8); // don't print sign - } else { - os << x.to_string(10); - } - return os; -} -#endif // ifndef __SYNTHESIS__ - -#ifndef __SYNTHESIS__ -template -INLINE std::istream& operator>>(std::istream& in, - ap_range_ref<_AP_W, _AP_S>& op) { - std::string str; - in >> str; - op = ap_int_base<_AP_W, _AP_S>(str.c_str()); - return in; -} -#endif // ifndef __SYNTHESIS__ -#endif // ifndef AP_AUTOCC - -/* Bit reference. - ---------------------------------------------------------------- -*/ -template -struct ap_bit_ref { - // struct ssdm_int or its sim model. - // TODO make it possible to reference to ap_fixed_base/ap_fixed/ap_ufixed - // and then we can retire af_bit_ref. - typedef ap_int_base<_AP_W, _AP_S> ref_type; - ref_type& d_bv; - int d_index; - - public: - // copy ctor - INLINE ap_bit_ref(const ap_bit_ref<_AP_W, _AP_S>& ref) - : d_bv(ref.d_bv), d_index(ref.d_index) {} - - INLINE ap_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} - - INLINE ap_bit_ref(const ref_type* bv, int index = 0) - : d_bv(*const_cast(bv)), d_index(index) {} - - INLINE operator bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } - INLINE bool to_bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } - - // assign op from hls supported C integral types. - // FIXME disabled to support sc_signal. - // NOTE this used to be unsigned long long. 
- //template - //INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, - // ap_bit_ref&>::type - //operator=(T val) { - // d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); - // return *this; - //} -#define ASSIGN_WITH_CTYPE(_Tp) \ - INLINE ap_bit_ref& operator=(_Tp val) { \ - d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); \ - return *this; \ - } - - ASSIGN_WITH_CTYPE(bool) - ASSIGN_WITH_CTYPE(char) - ASSIGN_WITH_CTYPE(signed char) - ASSIGN_WITH_CTYPE(unsigned char) - ASSIGN_WITH_CTYPE(short) - ASSIGN_WITH_CTYPE(unsigned short) - ASSIGN_WITH_CTYPE(int) - ASSIGN_WITH_CTYPE(unsigned int) - ASSIGN_WITH_CTYPE(long) - ASSIGN_WITH_CTYPE(unsigned long) - ASSIGN_WITH_CTYPE(ap_slong) - ASSIGN_WITH_CTYPE(ap_ulong) - -#undef ASSIGN_WITH_CTYPE - -#define ASSIGN_WITH_CTYPE_FP(_Tp) \ - INLINE ap_bit_ref& operator=(_Tp val) { \ - bool tmp_val = val; \ - d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index,tmp_val); \ - return *this; \ - } - -#if _AP_ENABLE_HALF_ == 1 - ASSIGN_WITH_CTYPE_FP(half) -#endif - ASSIGN_WITH_CTYPE_FP(float) - ASSIGN_WITH_CTYPE_FP(double) - -#undef ASSIGN_WITH_CTYPE_FP - - - template - INLINE ap_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { - return operator=((ap_ulong)(val.V != 0)); - } - - template - INLINE ap_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { - return operator=((ap_int_base<_AP_W2, false>)val); - } - - // Be explicit to prevent it from being deleted, as field d_bv - // is of reference type. - INLINE ap_bit_ref& operator=(const ap_bit_ref& val) { - return operator=((ap_ulong)(bool)val); - } - - template - INLINE ap_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { - return operator=((ap_ulong)(bool)val); - } - - template - INLINE ap_bit_ref& operator=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=((const ap_int_base<_AP_W2, false>)val); - } - - template - INLINE ap_bit_ref& operator=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=((ap_ulong)(bool)val); - } - - template - INLINE ap_bit_ref& operator=( - const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { - return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)val); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( - *this, a2); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { - ap_int_base<_AP_W2, _AP_S2> op(a2); - return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(op)); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { - ap_int_base<_AP_W2, _AP_S2> op(a2); - return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(op)); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > - operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( - *this, 
const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( - const ap_bit_ref<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > - operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { - return ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref< - 1, ap_bit_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { - return ap_concat_ref< - 1, ap_bit_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast< - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, - _AP_Q2, _AP_O2, _AP_N2> > - operator,( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { - return ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, - _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast&>( - a2)); - } - - template - INLINE bool operator==(const ap_bit_ref<_AP_W2, _AP_S2>& op) { - return get() == op.get(); - } - - template - INLINE bool operator!=(const ap_bit_ref<_AP_W2, _AP_S2>& op) { - return get() != op.get(); - } - - INLINE bool get() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } - - INLINE bool get() { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } - - template - INLINE void set(const ap_int_base<_AP_W3, false>& val) { - operator=(val); - } - - INLINE bool operator~() const { - bool bit = _AP_ROOT_op_get_bit(d_bv.V, d_index); - return bit ? false : true; - } - - INLINE int length() const { return 1; } - -#ifndef __SYNTHESIS__ - std::string to_string() const { return get() ? "1" : "0"; } -#else - // XXX HLS will delete this in synthesis - INLINE char* to_string() const { return 0; } -#endif -}; // struct ap_bit_ref - -/* ap_range_ref with int. - * ------------------------------------------------------------ - */ -// equality and relational operators. 
-#define REF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE bool operator REL_OP(const ap_range_ref<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return ap_int_base<_AP_W, false>(op) \ - REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ - } \ - template \ - INLINE bool operator REL_OP(const ap_bit_ref<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return bool(op) REL_OP op2; \ - } \ - template \ - INLINE bool operator REL_OP(C_TYPE op2, \ - const ap_bit_ref<_AP_W, _AP_S>& op) { \ - return op2 REL_OP bool(op); \ - } \ - template \ - INLINE bool operator REL_OP( \ - const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, C_TYPE op2) { \ - return ap_int_base<_AP_W + _AP_W1, false>(op) \ - REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ - } - -// Make the line shorter than 5000 chars -#define REF_REL_WITH_INT_1(C_TYPE, _AP_WI, _AP_SI) \ - REF_REL_OP_WITH_INT(>, C_TYPE, _AP_WI, _AP_SI) \ - REF_REL_OP_WITH_INT(<, C_TYPE, _AP_WI, _AP_SI) \ - REF_REL_OP_WITH_INT(>=, C_TYPE, _AP_WI, _AP_SI) \ - REF_REL_OP_WITH_INT(<=, C_TYPE, _AP_WI, _AP_SI) - -REF_REL_WITH_INT_1(bool, 1, false) -REF_REL_WITH_INT_1(char, 8, CHAR_IS_SIGNED) -REF_REL_WITH_INT_1(signed char, 8, true) -REF_REL_WITH_INT_1(unsigned char, 8, false) -REF_REL_WITH_INT_1(short, _AP_SIZE_short, true) -REF_REL_WITH_INT_1(unsigned short, _AP_SIZE_short, false) -REF_REL_WITH_INT_1(int, _AP_SIZE_int, true) -REF_REL_WITH_INT_1(unsigned int, _AP_SIZE_int, false) -REF_REL_WITH_INT_1(long, _AP_SIZE_long, true) -REF_REL_WITH_INT_1(unsigned long, _AP_SIZE_long, false) -REF_REL_WITH_INT_1(ap_slong, _AP_SIZE_ap_slong, true) -REF_REL_WITH_INT_1(ap_ulong, _AP_SIZE_ap_slong, false) - -// Make the line shorter than 5000 chars -#define REF_REL_WITH_INT_2(C_TYPE, _AP_WI, _AP_SI) \ - REF_REL_OP_WITH_INT(==, C_TYPE, _AP_WI, _AP_SI) \ - REF_REL_OP_WITH_INT(!=, C_TYPE, _AP_WI, _AP_SI) - -REF_REL_WITH_INT_2(bool, 1, false) -REF_REL_WITH_INT_2(char, 8, CHAR_IS_SIGNED) -REF_REL_WITH_INT_2(signed char, 8, true) -REF_REL_WITH_INT_2(unsigned char, 8, false) -REF_REL_WITH_INT_2(short, _AP_SIZE_short, true) -REF_REL_WITH_INT_2(unsigned short, _AP_SIZE_short, false) -REF_REL_WITH_INT_2(int, _AP_SIZE_int, true) -REF_REL_WITH_INT_2(unsigned int, _AP_SIZE_int, false) -REF_REL_WITH_INT_2(long, _AP_SIZE_long, true) -REF_REL_WITH_INT_2(unsigned long, _AP_SIZE_long, false) -REF_REL_WITH_INT_2(ap_slong, _AP_SIZE_ap_slong, true) -REF_REL_WITH_INT_2(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef REF_REL_OP_WITH_INT -#undef REF_REL_WITH_INT_1 -#undef REF_REL_WITH_INT_2 - -#define REF_BIN_OP_WITH_INT(BIN_OP, RTYPE, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE typename ap_int_base<_AP_W, false>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(const ap_range_ref<_AP_W, _AP_S>& op, C_TYPE op2) { \ - return ap_int_base<_AP_W, false>(op) \ - BIN_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ - } \ - template \ - INLINE typename ap_int_base<_AP_W2, _AP_S2>::template RType<_AP_W, \ - false>::RTYPE \ - operator BIN_OP(C_TYPE op2, const ap_range_ref<_AP_W, _AP_S>& op) { \ - return ap_int_base<_AP_W2, _AP_S2>(op2) \ - BIN_OP ap_int_base<_AP_W, false>(op); \ - } - -// arithmetic operators. 
-#define REF_BIN_OP_WITH_INT_ARITH(C_TYPE, _AP_W2, _AP_S2) \ - REF_BIN_OP_WITH_INT(+, plus, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(-, minus, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(*, mult, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(/, div, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(%, mod, C_TYPE, (_AP_W2), (_AP_S2)) - -REF_BIN_OP_WITH_INT_ARITH(bool, 1, false) -REF_BIN_OP_WITH_INT_ARITH(char, 8, CHAR_IS_SIGNED) -REF_BIN_OP_WITH_INT_ARITH(signed char, 8, true) -REF_BIN_OP_WITH_INT_ARITH(unsigned char, 8, false) -REF_BIN_OP_WITH_INT_ARITH(short, _AP_SIZE_short, true) -REF_BIN_OP_WITH_INT_ARITH(unsigned short, _AP_SIZE_short, false) -REF_BIN_OP_WITH_INT_ARITH(int, _AP_SIZE_int, true) -REF_BIN_OP_WITH_INT_ARITH(unsigned int, _AP_SIZE_int, false) -REF_BIN_OP_WITH_INT_ARITH(long, _AP_SIZE_long, true) -REF_BIN_OP_WITH_INT_ARITH(unsigned long, _AP_SIZE_long, false) -REF_BIN_OP_WITH_INT_ARITH(ap_slong, _AP_SIZE_ap_slong, true) -REF_BIN_OP_WITH_INT_ARITH(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef REF_BIN_OP_WITH_INT_ARITH - -// bitwise and shift operators -#define REF_BIN_OP_WITH_INT_BITS(C_TYPE, _AP_W2, _AP_S2) \ - REF_BIN_OP_WITH_INT(&, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(|, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(^, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(>>, arg1, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(<<, arg1, C_TYPE, (_AP_W2), (_AP_S2)) - -REF_BIN_OP_WITH_INT_BITS(bool, 1, false) -REF_BIN_OP_WITH_INT_BITS(char, 8, CHAR_IS_SIGNED) -REF_BIN_OP_WITH_INT_BITS(signed char, 8, true) -REF_BIN_OP_WITH_INT_BITS(unsigned char, 8, false) -REF_BIN_OP_WITH_INT_BITS(short, _AP_SIZE_short, true) -REF_BIN_OP_WITH_INT_BITS(unsigned short, _AP_SIZE_short, false) -REF_BIN_OP_WITH_INT_BITS(int, _AP_SIZE_int, true) -REF_BIN_OP_WITH_INT_BITS(unsigned int, _AP_SIZE_int, false) -REF_BIN_OP_WITH_INT_BITS(long, _AP_SIZE_long, true) -REF_BIN_OP_WITH_INT_BITS(unsigned long, _AP_SIZE_long, false) -REF_BIN_OP_WITH_INT_BITS(ap_slong, _AP_SIZE_ap_slong, true) -REF_BIN_OP_WITH_INT_BITS(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef REF_BIN_OP_WITH_INT_BITS - -/* ap_range_ref with ap_range_ref - * ------------------------------------------------------------ - */ -#define REF_BIN_OP(BIN_OP, RTYPE) \ - template \ - INLINE \ - typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::RTYPE \ - operator BIN_OP(const ap_range_ref<_AP_W, _AP_S>& lhs, \ - const ap_range_ref<_AP_W2, _AP_S2>& rhs) { \ - return (lhs.operator ap_int_base<_AP_W, false>())BIN_OP( \ - rhs.operator ap_int_base<_AP_W2, false>()); \ - } - -REF_BIN_OP(+, plus) -REF_BIN_OP(-, minus) -REF_BIN_OP(*, mult) -REF_BIN_OP(/, div) -REF_BIN_OP(%, mod) -REF_BIN_OP(&, logic) -REF_BIN_OP(|, logic) -REF_BIN_OP(^, logic) -REF_BIN_OP(>>, arg1) -REF_BIN_OP(<<, arg1) - -/* ap_concat_ref with ap_concat_ref. - * ------------------------------------------------------------ - */ - -//************************************************************************ -// Implement -// ap_int_base = ap_concat_ref OP ap_concat_ref -// for operators +, -, *, /, %, >>, <<, &, |, ^ -// Without these operators the operands are converted to int64 and -// larger results lose informations (higher order bits). 
-// -// operand OP -// / | -// left-concat right-concat -// / | / | -// -// -// _AP_LW1, _AP_LT1 (width and type of left-concat's left side) -// _AP_LW2, _AP_LT2 (width and type of left-concat's right side) -// Similarly for RHS of operand OP: _AP_RW1, AP_RW2, _AP_RT1, _AP_RT2 -// -// In Verilog 2001 result of concatenation is always unsigned even -// when both sides are signed. -//************************************************************************ - -#undef SYN_CONCAT_REF_BIN_OP - -#define SYN_CONCAT_REF_BIN_OP(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< \ - _AP_RW1 + _AP_RW2, false>::RTYPE \ - operator BIN_OP( \ - const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, \ - const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { \ - return lhs.get() BIN_OP rhs.get(); \ - } - -SYN_CONCAT_REF_BIN_OP(+, plus) -SYN_CONCAT_REF_BIN_OP(-, minus) -SYN_CONCAT_REF_BIN_OP(*, mult) -SYN_CONCAT_REF_BIN_OP(/, div) -SYN_CONCAT_REF_BIN_OP(%, mod) -SYN_CONCAT_REF_BIN_OP(&, logic) -SYN_CONCAT_REF_BIN_OP(|, logic) -SYN_CONCAT_REF_BIN_OP(^, logic) -SYN_CONCAT_REF_BIN_OP(>>, arg1) -SYN_CONCAT_REF_BIN_OP(<<, arg1) - -#undef SYN_CONCAT_REF_BIN_OP - -#define CONCAT_OP_WITH_INT(C_TYPE, _AP_WI, _AP_SI) \ - template \ - INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ - const ap_int_base<_AP_W, _AP_S> &op1, C_TYPE op2) { \ - ap_int_base<_AP_WI + _AP_W, false> val(op2); \ - ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ - ret <<= _AP_WI; \ - if (_AP_SI) { \ - val <<= _AP_W; \ - val >>= _AP_W; \ - } \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ - C_TYPE op1, const ap_int_base<_AP_W, _AP_S> &op2) { \ - ap_int_base<_AP_WI + _AP_W, false> val(op1); \ - ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ - if (_AP_S) { \ - ret <<= _AP_WI; \ - ret >>= _AP_WI; \ - } \ - ret |= val << _AP_W; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ - const ap_range_ref<_AP_W, _AP_S> &op1, C_TYPE op2) { \ - ap_int_base<_AP_WI + _AP_W, false> val(op2); \ - ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ - ret <<= _AP_WI; \ - if (_AP_SI) { \ - val <<= _AP_W; \ - val >>= _AP_W; \ - } \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ - C_TYPE op1, const ap_range_ref<_AP_W, _AP_S> &op2) { \ - ap_int_base<_AP_WI + _AP_W, false> val(op1); \ - ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ - int len = op2.length(); \ - val <<= len; \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_WI + 1, false> operator,( \ - const ap_bit_ref<_AP_W, _AP_S> &op1, C_TYPE op2) { \ - ap_int_base<_AP_WI + 1, false> val(op2); \ - val[_AP_WI] = op1; \ - return val; \ - } \ - template \ - INLINE ap_int_base<_AP_WI + 1, false> operator,( \ - C_TYPE op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { \ - ap_int_base<_AP_WI + 1, false> val(op1); \ - val <<= 1; \ - val[0] = op2; \ - return val; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_W2 + _AP_WI, false> operator,( \ - const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, C_TYPE op2) { \ - ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> val(op2); \ - ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> ret(op1); \ - if (_AP_SI) { \ - val <<= _AP_W + _AP_W2; \ - val >>= _AP_W + _AP_W2; \ - } \ - ret <<= _AP_WI; \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_W2 + _AP_WI, false> operator,( \ - C_TYPE op1, const 
ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { \ - ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> val(op1); \ - ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> ret(op2); \ - int len = op2.length(); \ - val <<= len; \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, \ - C_TYPE op2) { \ - ap_int_base<_AP_WI + _AP_W, false> val(op2); \ - ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ - if (_AP_SI) { \ - val <<= _AP_W; \ - val >>= _AP_W; \ - } \ - ret <<= _AP_WI; \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ - C_TYPE op1, \ - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { \ - ap_int_base<_AP_WI + _AP_W, false> val(op1); \ - ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ - int len = op2.length(); \ - val <<= len; \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<1 + _AP_WI, false> operator,( \ - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, \ - C_TYPE op2) { \ - ap_int_base<_AP_WI + 1, _AP_SI> val(op2); \ - val[_AP_WI] = op1; \ - return val; \ - } \ - template \ - INLINE ap_int_base<1 + _AP_WI, false> operator,( \ - C_TYPE op1, \ - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { \ - ap_int_base<_AP_WI + 1, _AP_SI> val(op1); \ - val <<= 1; \ - val[0] = op2; \ - return val; \ - } - -CONCAT_OP_WITH_INT(bool, 1, false) -CONCAT_OP_WITH_INT(char, 8, CHAR_IS_SIGNED) -CONCAT_OP_WITH_INT(signed char, 8, true) -CONCAT_OP_WITH_INT(unsigned char, 8, false) -CONCAT_OP_WITH_INT(short, _AP_SIZE_short, true) -CONCAT_OP_WITH_INT(unsigned short, _AP_SIZE_short, false) -CONCAT_OP_WITH_INT(int, _AP_SIZE_int, true) -CONCAT_OP_WITH_INT(unsigned int, _AP_SIZE_int, false) -CONCAT_OP_WITH_INT(long, _AP_SIZE_long, true) -CONCAT_OP_WITH_INT(unsigned long, _AP_SIZE_long, false) -CONCAT_OP_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) -CONCAT_OP_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef CONCAT_OP_WITH_INT - -#define CONCAT_SHIFT_WITH_INT(C_TYPE, OP) \ - template \ - INLINE ap_uint<_AP_W + _AP_W1> operator OP( \ - const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, C_TYPE rhs) { \ - return ap_uint<_AP_W + _AP_W1>(lhs).get() OP int(rhs); \ - } - -// FIXME int(rhs) may loose precision. - -CONCAT_SHIFT_WITH_INT(int, <<) -CONCAT_SHIFT_WITH_INT(unsigned int, <<) -CONCAT_SHIFT_WITH_INT(long, <<) -CONCAT_SHIFT_WITH_INT(unsigned long, <<) -CONCAT_SHIFT_WITH_INT(ap_slong, <<) -CONCAT_SHIFT_WITH_INT(ap_ulong, <<) - -CONCAT_SHIFT_WITH_INT(int, >>) -CONCAT_SHIFT_WITH_INT(unsigned int, >>) -CONCAT_SHIFT_WITH_INT(long, >>) -CONCAT_SHIFT_WITH_INT(unsigned long, >>) -CONCAT_SHIFT_WITH_INT(ap_slong, >>) -CONCAT_SHIFT_WITH_INT(ap_ulong, >>) - -#endif // ifndef __cplusplus -#endif // ifndef __AP_INT_REF_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __AP_INT_REF_H__ +#define __AP_INT_REF_H__ + +#ifndef __AP_INT_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" + +#else + +#ifndef __SYNTHESIS__ +#include +#endif + +/* Concatination reference. + ---------------------------------------------------------------- +*/ +template +struct ap_concat_ref { + enum { + _AP_WR = _AP_W1 + _AP_W2, + }; + + _AP_T1& mbv1; + _AP_T2& mbv2; + + INLINE ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& ref) + : mbv1(ref.mbv1), mbv2(ref.mbv2) {} + + INLINE ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {} + + template + INLINE ap_concat_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> vval(val); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W1, false> Part1; + Part1.V = _AP_ROOT_op_get_range(vval.V, W_ref2, W_ref1 + W_ref2 - 1); + mbv1.set(Part1); + ap_int_base<_AP_W2, false> Part2; + Part2.V = _AP_ROOT_op_get_range(vval.V, 0, W_ref2 - 1); + mbv2.set(Part2); + return *this; + } + + // assign op from hls supported C integral types. + // FIXME disabled to support legacy code directly assign from sc_signal + //template + //INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, + // ap_concat_ref&>::type + //operator=(T val) { + // ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + // return operator=(tmpVal); + //} +#define ASSIGN_WITH_CTYPE(_Tp) \ + INLINE ap_concat_ref& operator=(_Tp val) { \ + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); \ + return operator=(tmpVal); \ + } + + ASSIGN_WITH_CTYPE(bool) + ASSIGN_WITH_CTYPE(char) + ASSIGN_WITH_CTYPE(signed char) + ASSIGN_WITH_CTYPE(unsigned char) + ASSIGN_WITH_CTYPE(short) + ASSIGN_WITH_CTYPE(unsigned short) + ASSIGN_WITH_CTYPE(int) + ASSIGN_WITH_CTYPE(unsigned int) + ASSIGN_WITH_CTYPE(long) + ASSIGN_WITH_CTYPE(unsigned long) + ASSIGN_WITH_CTYPE(ap_slong) + ASSIGN_WITH_CTYPE(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_WITH_CTYPE(half) +#endif + ASSIGN_WITH_CTYPE(float) + ASSIGN_WITH_CTYPE(double) + +#undef ASSIGN_WITH_CTYPE + + // Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. 
+ INLINE ap_concat_ref& operator=( + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + INLINE ap_concat_ref& operator=( + const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + INLINE ap_concat_ref& operator=(const ap_bit_ref<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + template + INLINE ap_concat_ref& operator=(const ap_range_ref<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + INLINE ap_concat_ref& operator=( + const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { + return operator=((const ap_int_base<_AP_W3, false>)(val)); + } + + template + INLINE ap_concat_ref& operator=( + const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& + val) { + return operator=(val.to_ap_int_base()); + } + + template + INLINE ap_concat_ref& operator=( + const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + INLINE operator ap_int_base<_AP_WR, false>() const { return get(); } + + INLINE operator ap_ulong() const { return get().to_uint64(); } + + template + INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_range_ref<_AP_W3, _AP_S3> > + operator,(const ap_range_ref<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_range_ref<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >(*this, a2); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(volatile ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(const ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(const volatile ap_int_base<_AP_W3, _AP_S3> &a2) { + // FIXME op's life does not seem long enough + ap_int_base<_AP_W3, _AP_S3> op(a2); + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(op)); + } + + template + INLINE ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> > + operator,(const ap_bit_ref<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, + ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > + operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, + ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref< + _AP_WR, ap_concat_ref, _AP_W3, + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > + 
operator,( + const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) { + return ap_concat_ref< + _AP_WR, ap_concat_ref, _AP_W3, + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( + *this, + const_cast< + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, 1, + af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > + operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> + &a2) { + return ap_concat_ref< + _AP_WR, ap_concat_ref, 1, + af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( + *this, + const_cast&>( + a2)); + } + + template + INLINE ap_int_base operator&( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() & a2; + } + + template + INLINE ap_int_base operator|( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() | a2; + } + + template + INLINE ap_int_base operator^( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() ^ a2; + } + +#if 0 + template + INLINE ap_int_base slice() { + ap_int_base<_AP_WR, false> bv = get(); + return bv.slice(); + } +#endif + + INLINE ap_int_base<_AP_WR, false> get() const { + ap_int_base<_AP_WR, false> tmpVal(0); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W2, false> v2(mbv2); + ap_int_base<_AP_W1, false> v1(mbv1); + tmpVal.V = _AP_ROOT_op_set_range(tmpVal.V, 0, W_ref2 - 1, v2.V); + tmpVal.V = + _AP_ROOT_op_set_range(tmpVal.V, W_ref2, W_ref1 + W_ref2 - 1, v1.V); + return tmpVal; + } + + template + INLINE void set(const ap_int_base<_AP_W3, false>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> vval(val); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W1, false> tmpVal1; + tmpVal1.V = _AP_ROOT_op_get_range(vval.V, W_ref2, W_ref1 + W_ref2 - 1); + mbv1.set(tmpVal1); + ap_int_base<_AP_W2, false> tmpVal2; + tmpVal2.V = _AP_ROOT_op_get_range(vval.V, 0, W_ref2 - 1); + mbv2.set(tmpVal2); + } + + INLINE int length() const { return mbv1.length() + mbv2.length(); } +}; // struct ap_concat_ref + +/* Range (slice) reference. + ---------------------------------------------------------------- +*/ +template +struct ap_range_ref { + // struct ssdm_int or its sim model. + // TODO make it possible to reference to ap_fixed_base/ap_fixed/ap_ufixed + // and then we can retire af_range_ref. + typedef ap_int_base<_AP_W, _AP_S> ref_type; + ref_type& d_bv; + int l_index; + int h_index; + + public: + INLINE ap_range_ref(const ap_range_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + INLINE ap_range_ref(ref_type* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) {} + + INLINE ap_range_ref(const ref_type* bv, int h, int l) + : d_bv(*const_cast(bv)), l_index(l), h_index(h) {} + + INLINE operator ap_int_base<_AP_W, false>() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + INLINE operator ap_ulong() const { return to_uint64(); } + + /// @name assign operators + // @{ + + // FIXME disabled to work-around lagacy code assigning from sc_signal, + // which dependes on implicit type conversion. + // + // /// assign from hls supported C integral types. 
+ // template + // INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, + // ap_range_ref&>::type + // operator=(T val) { + // ap_int_base<_AP_W, false> tmp(val); + // d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + // return *this; + // } +#define ASSIGN_WITH_CTYPE(_Tp) \ + INLINE ap_range_ref& operator=(_Tp val) { \ + ap_int_base<_AP_W, false> tmp(val); \ + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); \ + return *this; \ + } + + ASSIGN_WITH_CTYPE(bool) + ASSIGN_WITH_CTYPE(char) + ASSIGN_WITH_CTYPE(signed char) + ASSIGN_WITH_CTYPE(unsigned char) + ASSIGN_WITH_CTYPE(short) + ASSIGN_WITH_CTYPE(unsigned short) + ASSIGN_WITH_CTYPE(int) + ASSIGN_WITH_CTYPE(unsigned int) + ASSIGN_WITH_CTYPE(long) + ASSIGN_WITH_CTYPE(unsigned long) + ASSIGN_WITH_CTYPE(ap_slong) + ASSIGN_WITH_CTYPE(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_WITH_CTYPE(half) +#endif + ASSIGN_WITH_CTYPE(float) + ASSIGN_WITH_CTYPE(double) + +#undef ASSIGN_WITH_CTYPE + + /// assign using string. XXX crucial for cosim. + INLINE ap_range_ref& operator=(const char* val) { + const ap_int_base<_AP_W, false> tmp(val); // XXX figure out radix + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + return *this; + } + + /// assign from ap_int_base. + template + INLINE ap_range_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + ap_int_base<_AP_W, false> tmp(val); + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + return *this; + } + + /// copy assign operator + // XXX Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. + INLINE ap_range_ref& operator=(const ap_range_ref& val) { + return operator=((const ap_int_base<_AP_W, false>)val); + } + + /// assign from range reference to ap_int_base. + template + INLINE ap_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + /// assign from bit reference to ap_int_base. + template + INLINE ap_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + /// assign from ap_fixed_base. + template + INLINE ap_range_ref& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + val) { + return operator=(val.to_ap_int_base()); + } + + /// assign from range reference to ap_fixed_base. + template + INLINE ap_range_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + /// assign from bit reference to ap_fixed_base. + template + INLINE ap_range_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + /// assign from compound reference. 
+ template + INLINE ap_range_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)(val)); + } + // @} + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W, ap_int_base<_AP_W, _AP_S> > + operator,(ap_int_base<_AP_W, _AP_S>& a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W, + ap_int_base<_AP_W, _AP_S> >(*this, a2); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref< + _AP_W, ap_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> a2) { + return ap_concat_ref< + _AP_W, ap_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) { + return ap_concat_ref< + _AP_W, ap_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + a2)); + } + + template + INLINE bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop == hop; + } + + template + INLINE bool operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator==(op2)); + } + + template + INLINE bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + 
ap_int_base<_AP_W2, false> hop(op2); + return lop < hop; + } + + template + INLINE bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop <= hop; + } + + template + INLINE bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<=(op2)); + } + + template + INLINE bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<(op2)); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator|=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V |= (op2.d_bv).V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator|=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V |= op2.V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator&=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V &= (op2.d_bv).V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator&=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V &= op2.V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator^=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V ^= (op2.d_bv).V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator^=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V ^= op2.V; + return *this; + }; + + INLINE ap_int_base<_AP_W, false> get() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + template + INLINE void set(const ap_int_base<_AP_W2, false>& val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + } + + INLINE int length() const { + return h_index >= l_index ? h_index - l_index + 1 : l_index - h_index + 1; + } + + INLINE int to_int() const { + return (int)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned to_uint() const { + return (unsigned)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE long to_long() const { + return (long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned long to_ulong() const { + return (unsigned long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_slong to_int64() const { + return (ap_slong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_ulong to_uint64() const { + return (ap_ulong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE bool and_reduce() const { + bool ret = true; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) { +#ifdef __SYNTHESIS__ +#pragma HLS unroll +#endif + ret &= _AP_ROOT_op_get_bit(d_bv.V, i); + } + return ret; + } + + INLINE bool or_reduce() const { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) { +#ifdef __SYNTHESIS__ +#pragma HLS unroll +#endif + ret |= _AP_ROOT_op_get_bit(d_bv.V, i); + } + return ret; + } + + INLINE bool xor_reduce() const { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? 
l_index : h_index; + for (unsigned i = low; i != high; ++i) { +#ifdef __SYNTHESIS__ +#pragma HLS unroll +#endif + ret ^= _AP_ROOT_op_get_bit(d_bv.V, i); + } + return ret; + } +#ifndef __SYNTHESIS__ + std::string to_string(signed char radix = 2) const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret.to_string(radix); + } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string(signed char radix = 2) const { + return 0; + } +#endif +}; // struct ap_range_ref + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<(std::ostream& os, + const ap_range_ref<_AP_W, _AP_S>& x) { + std::ios_base::fmtflags ff = std::cout.flags(); + if (ff & std::cout.hex) { + os << x.to_string(16); // don't print sign + } else if (ff & std::cout.oct) { + os << x.to_string(8); // don't print sign + } else { + os << x.to_string(10); + } + return os; +} +#endif // ifndef __SYNTHESIS__ + +#ifndef __SYNTHESIS__ +template +INLINE std::istream& operator>>(std::istream& in, + ap_range_ref<_AP_W, _AP_S>& op) { + std::string str; + in >> str; + op = ap_int_base<_AP_W, _AP_S>(str.c_str()); + return in; +} +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_AUTOCC + +/* Bit reference. + ---------------------------------------------------------------- +*/ +template +struct ap_bit_ref { + // struct ssdm_int or its sim model. + // TODO make it possible to reference to ap_fixed_base/ap_fixed/ap_ufixed + // and then we can retire af_bit_ref. + typedef ap_int_base<_AP_W, _AP_S> ref_type; + ref_type& d_bv; + int d_index; + + public: + // copy ctor + INLINE ap_bit_ref(const ap_bit_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) {} + + INLINE ap_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} + + INLINE ap_bit_ref(const ref_type* bv, int index = 0) + : d_bv(*const_cast(bv)), d_index(index) {} + + INLINE operator bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + INLINE bool to_bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + // assign op from hls supported C integral types. + // FIXME disabled to support sc_signal. + // NOTE this used to be unsigned long long. 
+ //template + //INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, + // ap_bit_ref&>::type + //operator=(T val) { + // d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); + // return *this; + //} +#define ASSIGN_WITH_CTYPE(_Tp) \ + INLINE ap_bit_ref& operator=(_Tp val) { \ + d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); \ + return *this; \ + } + + ASSIGN_WITH_CTYPE(bool) + ASSIGN_WITH_CTYPE(char) + ASSIGN_WITH_CTYPE(signed char) + ASSIGN_WITH_CTYPE(unsigned char) + ASSIGN_WITH_CTYPE(short) + ASSIGN_WITH_CTYPE(unsigned short) + ASSIGN_WITH_CTYPE(int) + ASSIGN_WITH_CTYPE(unsigned int) + ASSIGN_WITH_CTYPE(long) + ASSIGN_WITH_CTYPE(unsigned long) + ASSIGN_WITH_CTYPE(ap_slong) + ASSIGN_WITH_CTYPE(ap_ulong) + +#undef ASSIGN_WITH_CTYPE + +#define ASSIGN_WITH_CTYPE_FP(_Tp) \ + INLINE ap_bit_ref& operator=(_Tp val) { \ + bool tmp_val = val; \ + d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index,tmp_val); \ + return *this; \ + } + +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_WITH_CTYPE_FP(half) +#endif + ASSIGN_WITH_CTYPE_FP(float) + ASSIGN_WITH_CTYPE_FP(double) + +#undef ASSIGN_WITH_CTYPE_FP + + + template + INLINE ap_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(val.V != 0)); + } + + template + INLINE ap_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_int_base<_AP_W2, false>)val); + } + + // Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. + INLINE ap_bit_ref& operator=(const ap_bit_ref& val) { + return operator=((ap_ulong)(bool)val); + } + + template + INLINE ap_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(bool)val); + } + + template + INLINE ap_bit_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + template + INLINE ap_bit_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((ap_ulong)(bool)val); + } + + template + INLINE ap_bit_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)val); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, a2); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + ap_int_base<_AP_W2, _AP_S2> op(a2); + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + ap_int_base<_AP_W2, _AP_S2> op(a2); + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( + *this, 
const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( + const ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref< + 1, ap_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + 1, ap_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + a2)); + } + + template + INLINE bool operator==(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + return get() == op.get(); + } + + template + INLINE bool operator!=(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + return get() != op.get(); + } + + INLINE bool get() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + INLINE bool get() { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + template + INLINE void set(const ap_int_base<_AP_W3, false>& val) { + operator=(val); + } + + INLINE bool operator~() const { + bool bit = _AP_ROOT_op_get_bit(d_bv.V, d_index); + return bit ? false : true; + } + + INLINE int length() const { return 1; } + +#ifndef __SYNTHESIS__ + std::string to_string() const { return get() ? "1" : "0"; } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string() const { return 0; } +#endif +}; // struct ap_bit_ref + +/* ap_range_ref with int. + * ------------------------------------------------------------ + */ +// equality and relational operators. 
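Before the mixed-integer operators below, a minimal sketch of how the ap_range_ref and ap_bit_ref proxies above are reached from user code (an editorial illustration, not part of this header; it assumes only ap_int<>'s operator[] and operator()(hi, lo) accessors, which return ap_bit_ref and ap_range_ref respectively):

#include <ap_int.h>

int proxy_demo() {
    ap_uint<8> v = 0xA5;          // 1010'0101
    bool msb = v[7];              // ap_bit_ref converts to bool
    v[0] = 0;                     // write a single bit through the proxy
    v(3, 0) = v(7, 4);            // copy the high nibble into the low one
    return msb ? v.to_int() : 0;  // 0xAA
}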
+#define REF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(const ap_range_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return ap_int_base<_AP_W, false>(op) \ + REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_bit_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return bool(op) REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const ap_bit_ref<_AP_W, _AP_S>& op) { \ + return op2 REL_OP bool(op); \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, C_TYPE op2) { \ + return ap_int_base<_AP_W + _AP_W1, false>(op) \ + REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } + +// Make the line shorter than 5000 chars +#define REF_REL_WITH_INT_1(C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(>, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(<, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(>=, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(<=, C_TYPE, _AP_WI, _AP_SI) + +REF_REL_WITH_INT_1(bool, 1, false) +REF_REL_WITH_INT_1(char, 8, CHAR_IS_SIGNED) +REF_REL_WITH_INT_1(signed char, 8, true) +REF_REL_WITH_INT_1(unsigned char, 8, false) +REF_REL_WITH_INT_1(short, _AP_SIZE_short, true) +REF_REL_WITH_INT_1(unsigned short, _AP_SIZE_short, false) +REF_REL_WITH_INT_1(int, _AP_SIZE_int, true) +REF_REL_WITH_INT_1(unsigned int, _AP_SIZE_int, false) +REF_REL_WITH_INT_1(long, _AP_SIZE_long, true) +REF_REL_WITH_INT_1(unsigned long, _AP_SIZE_long, false) +REF_REL_WITH_INT_1(ap_slong, _AP_SIZE_ap_slong, true) +REF_REL_WITH_INT_1(ap_ulong, _AP_SIZE_ap_slong, false) + +// Make the line shorter than 5000 chars +#define REF_REL_WITH_INT_2(C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(==, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(!=, C_TYPE, _AP_WI, _AP_SI) + +REF_REL_WITH_INT_2(bool, 1, false) +REF_REL_WITH_INT_2(char, 8, CHAR_IS_SIGNED) +REF_REL_WITH_INT_2(signed char, 8, true) +REF_REL_WITH_INT_2(unsigned char, 8, false) +REF_REL_WITH_INT_2(short, _AP_SIZE_short, true) +REF_REL_WITH_INT_2(unsigned short, _AP_SIZE_short, false) +REF_REL_WITH_INT_2(int, _AP_SIZE_int, true) +REF_REL_WITH_INT_2(unsigned int, _AP_SIZE_int, false) +REF_REL_WITH_INT_2(long, _AP_SIZE_long, true) +REF_REL_WITH_INT_2(unsigned long, _AP_SIZE_long, false) +REF_REL_WITH_INT_2(ap_slong, _AP_SIZE_ap_slong, true) +REF_REL_WITH_INT_2(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef REF_REL_OP_WITH_INT +#undef REF_REL_WITH_INT_1 +#undef REF_REL_WITH_INT_2 + +#define REF_BIN_OP_WITH_INT(BIN_OP, RTYPE, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE typename ap_int_base<_AP_W, false>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_range_ref<_AP_W, _AP_S>& op, C_TYPE op2) { \ + return ap_int_base<_AP_W, false>(op) \ + BIN_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W2, _AP_S2>::template RType<_AP_W, \ + false>::RTYPE \ + operator BIN_OP(C_TYPE op2, const ap_range_ref<_AP_W, _AP_S>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(op2) \ + BIN_OP ap_int_base<_AP_W, false>(op); \ + } + +// arithmetic operators. 
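These relational forms, together with the arithmetic and bitwise forms that follow, give slices and bits the usual mixed-mode semantics with native integers. A small illustration (again assuming the ap_int<> accessors above; not code from this header):

#include <ap_int.h>

bool mixed_ops_demo() {
    ap_uint<12> w = 0xABC;
    bool t1 = (w(11, 8) == 0xA);       // ap_range_ref == int
    bool t2 = (w[0] != 1);             // ap_bit_ref != int (bit 0 of 0xC is 0)
    int  s  = (w(7, 4) + 1).to_int();  // ap_range_ref + int, 0xB + 1
    return t1 && t2 && (s == 0xC);     // all three hold
}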
+#define REF_BIN_OP_WITH_INT_ARITH(C_TYPE, _AP_W2, _AP_S2) \ + REF_BIN_OP_WITH_INT(+, plus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(-, minus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(*, mult, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(/, div, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(%, mod, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_BIN_OP_WITH_INT_ARITH(bool, 1, false) +REF_BIN_OP_WITH_INT_ARITH(char, 8, CHAR_IS_SIGNED) +REF_BIN_OP_WITH_INT_ARITH(signed char, 8, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned char, 8, false) +REF_BIN_OP_WITH_INT_ARITH(short, _AP_SIZE_short, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned short, _AP_SIZE_short, false) +REF_BIN_OP_WITH_INT_ARITH(int, _AP_SIZE_int, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned int, _AP_SIZE_int, false) +REF_BIN_OP_WITH_INT_ARITH(long, _AP_SIZE_long, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned long, _AP_SIZE_long, false) +REF_BIN_OP_WITH_INT_ARITH(ap_slong, _AP_SIZE_ap_slong, true) +REF_BIN_OP_WITH_INT_ARITH(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef REF_BIN_OP_WITH_INT_ARITH + +// bitwise and shift operators +#define REF_BIN_OP_WITH_INT_BITS(C_TYPE, _AP_W2, _AP_S2) \ + REF_BIN_OP_WITH_INT(&, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(|, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(^, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(>>, arg1, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(<<, arg1, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_BIN_OP_WITH_INT_BITS(bool, 1, false) +REF_BIN_OP_WITH_INT_BITS(char, 8, CHAR_IS_SIGNED) +REF_BIN_OP_WITH_INT_BITS(signed char, 8, true) +REF_BIN_OP_WITH_INT_BITS(unsigned char, 8, false) +REF_BIN_OP_WITH_INT_BITS(short, _AP_SIZE_short, true) +REF_BIN_OP_WITH_INT_BITS(unsigned short, _AP_SIZE_short, false) +REF_BIN_OP_WITH_INT_BITS(int, _AP_SIZE_int, true) +REF_BIN_OP_WITH_INT_BITS(unsigned int, _AP_SIZE_int, false) +REF_BIN_OP_WITH_INT_BITS(long, _AP_SIZE_long, true) +REF_BIN_OP_WITH_INT_BITS(unsigned long, _AP_SIZE_long, false) +REF_BIN_OP_WITH_INT_BITS(ap_slong, _AP_SIZE_ap_slong, true) +REF_BIN_OP_WITH_INT_BITS(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef REF_BIN_OP_WITH_INT_BITS + +/* ap_range_ref with ap_range_ref + * ------------------------------------------------------------ + */ +#define REF_BIN_OP(BIN_OP, RTYPE) \ + template \ + INLINE \ + typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::RTYPE \ + operator BIN_OP(const ap_range_ref<_AP_W, _AP_S>& lhs, \ + const ap_range_ref<_AP_W2, _AP_S2>& rhs) { \ + return (lhs.operator ap_int_base<_AP_W, false>())BIN_OP( \ + rhs.operator ap_int_base<_AP_W2, false>()); \ + } + +REF_BIN_OP(+, plus) +REF_BIN_OP(-, minus) +REF_BIN_OP(*, mult) +REF_BIN_OP(/, div) +REF_BIN_OP(%, mod) +REF_BIN_OP(&, logic) +REF_BIN_OP(|, logic) +REF_BIN_OP(^, logic) +REF_BIN_OP(>>, arg1) +REF_BIN_OP(<<, arg1) + +/* ap_concat_ref with ap_concat_ref. + * ------------------------------------------------------------ + */ + +//************************************************************************ +// Implement +// ap_int_base = ap_concat_ref OP ap_concat_ref +// for operators +, -, *, /, %, >>, <<, &, |, ^ +// Without these operators the operands are converted to int64 and +// larger results lose informations (higher order bits). 
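Concretely (an editorial illustration, not code from this header): with the concat-ref operators defined below, an expression such as

    ap_uint<40> a, b, c, d;
    ap_uint<80> s = (a, b) + (c, d);   // full 80-bit addition

is evaluated at the full 80-bit width of the two concatenations, whereas collapsing each (a, b) to a 64-bit intermediate first would silently drop the upper 16 bits.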
+// +// operand OP +// / | +// left-concat right-concat +// / | / | +// +// +// _AP_LW1, _AP_LT1 (width and type of left-concat's left side) +// _AP_LW2, _AP_LT2 (width and type of left-concat's right side) +// Similarly for RHS of operand OP: _AP_RW1, AP_RW2, _AP_RT1, _AP_RT2 +// +// In Verilog 2001 result of concatenation is always unsigned even +// when both sides are signed. +//************************************************************************ + +#undef SYN_CONCAT_REF_BIN_OP + +#define SYN_CONCAT_REF_BIN_OP(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< \ + _AP_RW1 + _AP_RW2, false>::RTYPE \ + operator BIN_OP( \ + const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, \ + const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { \ + return lhs.get() BIN_OP rhs.get(); \ + } + +SYN_CONCAT_REF_BIN_OP(+, plus) +SYN_CONCAT_REF_BIN_OP(-, minus) +SYN_CONCAT_REF_BIN_OP(*, mult) +SYN_CONCAT_REF_BIN_OP(/, div) +SYN_CONCAT_REF_BIN_OP(%, mod) +SYN_CONCAT_REF_BIN_OP(&, logic) +SYN_CONCAT_REF_BIN_OP(|, logic) +SYN_CONCAT_REF_BIN_OP(^, logic) +SYN_CONCAT_REF_BIN_OP(>>, arg1) +SYN_CONCAT_REF_BIN_OP(<<, arg1) + +#undef SYN_CONCAT_REF_BIN_OP + +#define CONCAT_OP_WITH_INT(C_TYPE, _AP_WI, _AP_SI) \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + const ap_int_base<_AP_W, _AP_S> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op2); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ + ret <<= _AP_WI; \ + if (_AP_SI) { \ + val <<= _AP_W; \ + val >>= _AP_W; \ + } \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + C_TYPE op1, const ap_int_base<_AP_W, _AP_S> &op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op1); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ + if (_AP_S) { \ + ret <<= _AP_WI; \ + ret >>= _AP_WI; \ + } \ + ret |= val << _AP_W; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + const ap_range_ref<_AP_W, _AP_S> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op2); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ + ret <<= _AP_WI; \ + if (_AP_SI) { \ + val <<= _AP_W; \ + val >>= _AP_W; \ + } \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + C_TYPE op1, const ap_range_ref<_AP_W, _AP_S> &op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op1); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ + int len = op2.length(); \ + val <<= len; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_WI + 1, false> operator,( \ + const ap_bit_ref<_AP_W, _AP_S> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + 1, false> val(op2); \ + val[_AP_WI] = op1; \ + return val; \ + } \ + template \ + INLINE ap_int_base<_AP_WI + 1, false> operator,( \ + C_TYPE op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { \ + ap_int_base<_AP_WI + 1, false> val(op1); \ + val <<= 1; \ + val[0] = op2; \ + return val; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_W2 + _AP_WI, false> operator,( \ + const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> val(op2); \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> ret(op1); \ + if (_AP_SI) { \ + val <<= _AP_W + _AP_W2; \ + val >>= _AP_W + _AP_W2; \ + } \ + ret <<= _AP_WI; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_W2 + _AP_WI, false> operator,( \ + C_TYPE op1, const 
ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> val(op1); \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> ret(op2); \ + int len = op2.length(); \ + val <<= len; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, \ + C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op2); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ + if (_AP_SI) { \ + val <<= _AP_W; \ + val >>= _AP_W; \ + } \ + ret <<= _AP_WI; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + C_TYPE op1, \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op1); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ + int len = op2.length(); \ + val <<= len; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<1 + _AP_WI, false> operator,( \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, \ + C_TYPE op2) { \ + ap_int_base<_AP_WI + 1, _AP_SI> val(op2); \ + val[_AP_WI] = op1; \ + return val; \ + } \ + template \ + INLINE ap_int_base<1 + _AP_WI, false> operator,( \ + C_TYPE op1, \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { \ + ap_int_base<_AP_WI + 1, _AP_SI> val(op1); \ + val <<= 1; \ + val[0] = op2; \ + return val; \ + } + +CONCAT_OP_WITH_INT(bool, 1, false) +CONCAT_OP_WITH_INT(char, 8, CHAR_IS_SIGNED) +CONCAT_OP_WITH_INT(signed char, 8, true) +CONCAT_OP_WITH_INT(unsigned char, 8, false) +CONCAT_OP_WITH_INT(short, _AP_SIZE_short, true) +CONCAT_OP_WITH_INT(unsigned short, _AP_SIZE_short, false) +CONCAT_OP_WITH_INT(int, _AP_SIZE_int, true) +CONCAT_OP_WITH_INT(unsigned int, _AP_SIZE_int, false) +CONCAT_OP_WITH_INT(long, _AP_SIZE_long, true) +CONCAT_OP_WITH_INT(unsigned long, _AP_SIZE_long, false) +CONCAT_OP_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +CONCAT_OP_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef CONCAT_OP_WITH_INT + +#define CONCAT_SHIFT_WITH_INT(C_TYPE, OP) \ + template \ + INLINE ap_uint<_AP_W + _AP_W1> operator OP( \ + const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, C_TYPE rhs) { \ + return ap_uint<_AP_W + _AP_W1>(lhs).get() OP int(rhs); \ + } + +// FIXME int(rhs) may loose precision. + +CONCAT_SHIFT_WITH_INT(int, <<) +CONCAT_SHIFT_WITH_INT(unsigned int, <<) +CONCAT_SHIFT_WITH_INT(long, <<) +CONCAT_SHIFT_WITH_INT(unsigned long, <<) +CONCAT_SHIFT_WITH_INT(ap_slong, <<) +CONCAT_SHIFT_WITH_INT(ap_ulong, <<) + +CONCAT_SHIFT_WITH_INT(int, >>) +CONCAT_SHIFT_WITH_INT(unsigned int, >>) +CONCAT_SHIFT_WITH_INT(long, >>) +CONCAT_SHIFT_WITH_INT(unsigned long, >>) +CONCAT_SHIFT_WITH_INT(ap_slong, >>) +CONCAT_SHIFT_WITH_INT(ap_ulong, >>) + +#endif // ifndef __cplusplus +#endif // ifndef __AP_INT_REF_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_int_special.h b/hls4ml/templates/vivado/ap_types/ap_int_special.h index 3afc6192ba..a80a851854 100644 --- a/hls4ml/templates/vivado/ap_types/ap_int_special.h +++ b/hls4ml/templates/vivado/ap_types/ap_int_special.h @@ -1,223 +1,223 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_INT_SPECIAL_H__ -#define __AP_INT_SPECIAL_H__ - -#ifndef __AP_INT_H__ -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." -#endif - -#ifndef __SYNTHESIS__ -#include -#include -#endif -// FIXME AP_AUTOCC cannot handle many standard headers, so declare instead of -// include. -// #include -namespace std { -template class complex; -} - -/* - TODO: Modernize the code using C++11/C++14 - 1. constexpr http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0415r0.html - 2. move constructor -*/ - -namespace std { -/* - Specialize std::complex to zero initialization ap_int. - - To reduce the area cost, ap_int is not zero initialized, just like basic - types float or double. However, libstdc++ provides specialization for float, - double and long double, initializing image part to 0 when not specified. - - This has become a difficulty in switching legacy code from these C types to - ap_int. To ease the tranform of legacy code, we have to implement - specialization of std::complex<> for our type. - - As ap_int is a template, it is impossible to specialize only the methods - that causes default initialization of value type in std::complex<>. An - explicit full specialization of the template class has to be done, covering - all the member functions and operators of std::complex<> as specified - in standard 26.2.4 and 26.2.5. -*/ -template -class complex > { - public: - typedef ap_int<_AP_W> _Tp; - typedef _Tp value_type; - - // 26.2.4/1 - // Constructor without argument - // Default initialize, so that in dataflow, the variable is only written once. - complex() : _M_real(_Tp()), _M_imag(_Tp()) {} - // Constructor with ap_int. - // Zero initialize image part when not specified, so that `C(1) == C(1,0)` - complex(const _Tp &__r, const _Tp &__i = _Tp(0)) - : _M_real(__r), _M_imag(__i) {} - - // Constructor with another complex number - template - complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {} - -#if __cplusplus >= 201103L - const _Tp& real() const { return _M_real; } - const _Tp& imag() const { return _M_imag; } -#else - _Tp& real() { return _M_real; } - const _Tp& real() const { return _M_real; } - _Tp& imag() { return _M_imag; } - const _Tp& imag() const { return _M_imag; } -#endif - - void real(_Tp __val) { _M_real = __val; } - - void imag(_Tp __val) { _M_imag = __val; } - - // Assign this complex number with ap_int. - // Zero initialize image poarrt, so that `C c; c = 1; c == C(1,0);` - complex<_Tp> &operator=(const _Tp __t) { - _M_real = __t; - _M_imag = _Tp(0); - return *this; - } - - // 26.2.5/1 - // Add ap_int to this complex number. - complex<_Tp> &operator+=(const _Tp &__t) { - _M_real += __t; - return *this; - } - - // 26.2.5/3 - // Subtract ap_int from this complex number. - complex<_Tp> &operator-=(const _Tp &__t) { - _M_real -= __t; - return *this; - } - - // 26.2.5/5 - // Multiply this complex number by ap_int. - complex<_Tp> &operator*=(const _Tp &__t) { - _M_real *= __t; - _M_imag *= __t; - return *this; - } - - // 26.2.5/7 - // Divide this complex number by ap_int. 
- complex<_Tp> &operator/=(const _Tp &__t) { - _M_real /= __t; - _M_imag /= __t; - return *this; - } - - // Assign complex number to this complex number. - template - complex<_Tp> &operator=(const complex<_Up> &__z) { - _M_real = __z.real(); - _M_imag = __z.imag(); - return *this; - } - - // 26.2.5/9 - // Add complex number to this. - template - complex<_Tp> &operator+=(const complex<_Up> &__z) { - _M_real += __z.real(); - _M_imag += __z.imag(); - return *this; - } - - // 26.2.5/11 - // Subtract complex number from this. - template - complex<_Tp> &operator-=(const complex<_Up> &__z) { - _M_real -= __z.real(); - _M_imag -= __z.imag(); - return *this; - } - - // 26.2.5/13 - // Multiply this by complex number. - template - complex<_Tp> &operator*=(const complex<_Up> &__z) { - const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag(); - _M_imag = _M_real * __z.imag() + _M_imag * __z.real(); - _M_real = __r; - return *this; - } - - // 26.2.5/15 - // Divide this by complex number. - template - complex<_Tp> &operator/=(const complex<_Up> &__z) { - complex<_Tp> cj (__z.real(), -__z.imag()); - complex<_Tp> a = (*this) * cj; - complex<_Tp> b = cj * __z; - _M_real = a.real() / b.real(); - _M_imag = a.imag() / b.real(); - return *this; - } - - private: - _Tp _M_real; - _Tp _M_imag; - -}; // class complex > - - -/* - Non-member operations - These operations are not required by standard in 26.2.6, but libstdc++ - defines them for - float, double or long double's specialization. -*/ -// Compare complex number with ap_int. -template -inline bool operator==(const complex > &__x, const ap_int<_AP_W> &__y) { - return __x.real() == __y && - __x.imag() == 0; -} - -// Compare ap_int with complex number. -template -inline bool operator==(const ap_int<_AP_W> &__x, const complex > &__y) { - return __x == __y.real() && - 0 == __y.imag(); -} - -// Compare complex number with ap_int. -template -inline bool operator!=(const complex > &__x, const ap_int<_AP_W> &__y) { - return __x.real() != __y || - __x.imag() != 0; -} - -// Compare ap_int with complex number. -template -inline bool operator!=(const ap_int<_AP_W> &__x, const complex > &__y) { - return __x != __y.real() || - 0 != __y.imag(); -} - -} // namespace std - -#endif // ifndef __AP_INT_SPECIAL_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_INT_SPECIAL_H__ +#define __AP_INT_SPECIAL_H__ + +#ifndef __AP_INT_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __SYNTHESIS__ +#include +#include +#endif +// FIXME AP_AUTOCC cannot handle many standard headers, so declare instead of +// include. +// #include +namespace std { +template class complex; +} + +/* + TODO: Modernize the code using C++11/C++14 + 1. constexpr http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0415r0.html + 2. move constructor +*/ + +namespace std { +/* + Specialize std::complex to zero initialization ap_int. 
+ + To reduce the area cost, ap_int is not zero initialized, just like basic + types float or double. However, libstdc++ provides specialization for float, + double and long double, initializing image part to 0 when not specified. + + This has become a difficulty in switching legacy code from these C types to + ap_int. To ease the tranform of legacy code, we have to implement + specialization of std::complex<> for our type. + + As ap_int is a template, it is impossible to specialize only the methods + that causes default initialization of value type in std::complex<>. An + explicit full specialization of the template class has to be done, covering + all the member functions and operators of std::complex<> as specified + in standard 26.2.4 and 26.2.5. +*/ +template +class complex > { + public: + typedef ap_int<_AP_W> _Tp; + typedef _Tp value_type; + + // 26.2.4/1 + // Constructor without argument + // Default initialize, so that in dataflow, the variable is only written once. + complex() : _M_real(_Tp()), _M_imag(_Tp()) {} + // Constructor with ap_int. + // Zero initialize image part when not specified, so that `C(1) == C(1,0)` + complex(const _Tp &__r, const _Tp &__i = _Tp(0)) + : _M_real(__r), _M_imag(__i) {} + + // Constructor with another complex number + template + complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {} + +#if __cplusplus >= 201103L + const _Tp& real() const { return _M_real; } + const _Tp& imag() const { return _M_imag; } +#else + _Tp& real() { return _M_real; } + const _Tp& real() const { return _M_real; } + _Tp& imag() { return _M_imag; } + const _Tp& imag() const { return _M_imag; } +#endif + + void real(_Tp __val) { _M_real = __val; } + + void imag(_Tp __val) { _M_imag = __val; } + + // Assign this complex number with ap_int. + // Zero initialize image poarrt, so that `C c; c = 1; c == C(1,0);` + complex<_Tp> &operator=(const _Tp __t) { + _M_real = __t; + _M_imag = _Tp(0); + return *this; + } + + // 26.2.5/1 + // Add ap_int to this complex number. + complex<_Tp> &operator+=(const _Tp &__t) { + _M_real += __t; + return *this; + } + + // 26.2.5/3 + // Subtract ap_int from this complex number. + complex<_Tp> &operator-=(const _Tp &__t) { + _M_real -= __t; + return *this; + } + + // 26.2.5/5 + // Multiply this complex number by ap_int. + complex<_Tp> &operator*=(const _Tp &__t) { + _M_real *= __t; + _M_imag *= __t; + return *this; + } + + // 26.2.5/7 + // Divide this complex number by ap_int. + complex<_Tp> &operator/=(const _Tp &__t) { + _M_real /= __t; + _M_imag /= __t; + return *this; + } + + // Assign complex number to this complex number. + template + complex<_Tp> &operator=(const complex<_Up> &__z) { + _M_real = __z.real(); + _M_imag = __z.imag(); + return *this; + } + + // 26.2.5/9 + // Add complex number to this. + template + complex<_Tp> &operator+=(const complex<_Up> &__z) { + _M_real += __z.real(); + _M_imag += __z.imag(); + return *this; + } + + // 26.2.5/11 + // Subtract complex number from this. + template + complex<_Tp> &operator-=(const complex<_Up> &__z) { + _M_real -= __z.real(); + _M_imag -= __z.imag(); + return *this; + } + + // 26.2.5/13 + // Multiply this by complex number. + template + complex<_Tp> &operator*=(const complex<_Up> &__z) { + const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag(); + _M_imag = _M_real * __z.imag() + _M_imag * __z.real(); + _M_real = __r; + return *this; + } + + // 26.2.5/15 + // Divide this by complex number. 
+ template + complex<_Tp> &operator/=(const complex<_Up> &__z) { + complex<_Tp> cj (__z.real(), -__z.imag()); + complex<_Tp> a = (*this) * cj; + complex<_Tp> b = cj * __z; + _M_real = a.real() / b.real(); + _M_imag = a.imag() / b.real(); + return *this; + } + + private: + _Tp _M_real; + _Tp _M_imag; + +}; // class complex > + + +/* + Non-member operations + These operations are not required by standard in 26.2.6, but libstdc++ + defines them for + float, double or long double's specialization. +*/ +// Compare complex number with ap_int. +template +inline bool operator==(const complex > &__x, const ap_int<_AP_W> &__y) { + return __x.real() == __y && + __x.imag() == 0; +} + +// Compare ap_int with complex number. +template +inline bool operator==(const ap_int<_AP_W> &__x, const complex > &__y) { + return __x == __y.real() && + 0 == __y.imag(); +} + +// Compare complex number with ap_int. +template +inline bool operator!=(const complex > &__x, const ap_int<_AP_W> &__y) { + return __x.real() != __y || + __x.imag() != 0; +} + +// Compare ap_int with complex number. +template +inline bool operator!=(const ap_int<_AP_W> &__x, const complex > &__y) { + return __x != __y.real() || + 0 != __y.imag(); +} + +} // namespace std + +#endif // ifndef __AP_INT_SPECIAL_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_shift_reg.h b/hls4ml/templates/vivado/ap_types/ap_shift_reg.h index 94dba51e46..1539ba5e61 100644 --- a/hls4ml/templates/vivado/ap_types/ap_shift_reg.h +++ b/hls4ml/templates/vivado/ap_types/ap_shift_reg.h @@ -1,138 +1,138 @@ -/* -#- (c) Copyright 2011-2019 Xilinx, Inc. All rights reserved. -#- -#- This file contains confidential and proprietary information -#- of Xilinx, Inc. and is protected under U.S. and -#- international copyright and other intellectual property -#- laws. -#- -#- DISCLAIMER -#- This disclaimer is not a license and does not grant any -#- rights to the materials distributed herewith. Except as -#- otherwise provided in a valid license issued to you by -#- Xilinx, and to the maximum extent permitted by applicable -#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND -#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES -#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING -#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- -#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and -#- (2) Xilinx shall not be liable (whether in contract or tort, -#- including negligence, or under any other theory of -#- liability) for any loss or damage of any kind or nature -#- related to, arising under or in connection with these -#- materials, including for any direct, or any indirect, -#- special, incidental, or consequential loss or damage -#- (including loss of data, profits, goodwill, or any type of -#- loss or damage suffered as a result of any action brought -#- by a third party) even if such damage or loss was -#- reasonably foreseeable or Xilinx had been advised of the -#- possibility of the same. -#- -#- CRITICAL APPLICATIONS -#- Xilinx products are not designed or intended to be fail- -#- safe, or for use in any application requiring fail-safe -#- performance, such as life-support or safety devices or -#- systems, Class III medical devices, nuclear facilities, -#- applications related to the deployment of airbags, or any -#- other applications that could lead to death, personal -#- injury, or severe property or environmental damage -#- (individually and collectively, "Critical -#- Applications"). 
Customer assumes the sole risk and -#- liability of any use of Xilinx products in Critical -#- Applications, subject only to applicable laws and -#- regulations governing limitations on product liability. -#- -#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS -#- PART OF THIS FILE AT ALL TIMES. -#- ************************************************************************ - - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef __SIM_AP_SHIFT_REG_H__ -#define __SIM_AP_SHIFT_REG_H__ - - -/* - * This file contains a C++ model of shift register. - * It defines C level simulation model. - */ -#ifndef __cplusplus -#error C++ is required to include this header file -#else - -#include - -////////////////////////////////////////////// -// C level simulation model for ap_shift_reg -////////////////////////////////////////////// -template -class ap_shift_reg -{ - public: - /// Constructors - ap_shift_reg() { } - ap_shift_reg(const char* name) { } - /// Destructor - virtual ~ap_shift_reg() { } - - private: - /// Make copy constructor and assignment operator private - ap_shift_reg(const ap_shift_reg< __SHIFT_T__, __SHIFT_DEPTH__ >& shreg) - { - for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i) - Array[i] = shreg.Array[i]; - } - - ap_shift_reg& operator = (const ap_shift_reg< __SHIFT_T__, - __SHIFT_DEPTH__ >& shreg) - { - for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i) - Array[i] = shreg.Array[i]; - return *this; - } - - public: - // Shift the queue, push to back and read from a given address. - __SHIFT_T__ shift(__SHIFT_T__ DataIn, - unsigned int Addr = __SHIFT_DEPTH__ - 1, bool Enable = true) - { - assert(Addr < __SHIFT_DEPTH__ && - "Out-of-bound shift is found in ap_shift_reg."); - __SHIFT_T__ ret = Array[Addr]; - if (Enable) { - for (unsigned int i = __SHIFT_DEPTH__ - 1; i > 0; --i) - Array[i] = Array[i-1]; - Array[0] = DataIn; - } - return ret; - } - - // Read from a given address. - __SHIFT_T__ read(unsigned int Addr = __SHIFT_DEPTH__ - 1) const - { - assert(Addr < __SHIFT_DEPTH__ && - "Out-of-bound read is found in ap_shift_reg."); - return Array[Addr]; - } - - protected: - __SHIFT_T__ Array[__SHIFT_DEPTH__]; -}; - -#endif //__cplusplus - -#endif //__SIM_AP_SHIFT_REG_H__ - - +/* +#- (c) Copyright 2011-2019 Xilinx, Inc. All rights reserved. +#- +#- This file contains confidential and proprietary information +#- of Xilinx, Inc. and is protected under U.S. and +#- international copyright and other intellectual property +#- laws. +#- +#- DISCLAIMER +#- This disclaimer is not a license and does not grant any +#- rights to the materials distributed herewith. 
Except as +#- otherwise provided in a valid license issued to you by +#- Xilinx, and to the maximum extent permitted by applicable +#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +#- (2) Xilinx shall not be liable (whether in contract or tort, +#- including negligence, or under any other theory of +#- liability) for any loss or damage of any kind or nature +#- related to, arising under or in connection with these +#- materials, including for any direct, or any indirect, +#- special, incidental, or consequential loss or damage +#- (including loss of data, profits, goodwill, or any type of +#- loss or damage suffered as a result of any action brought +#- by a third party) even if such damage or loss was +#- reasonably foreseeable or Xilinx had been advised of the +#- possibility of the same. +#- +#- CRITICAL APPLICATIONS +#- Xilinx products are not designed or intended to be fail- +#- safe, or for use in any application requiring fail-safe +#- performance, such as life-support or safety devices or +#- systems, Class III medical devices, nuclear facilities, +#- applications related to the deployment of airbags, or any +#- other applications that could lead to death, personal +#- injury, or severe property or environmental damage +#- (individually and collectively, "Critical +#- Applications"). Customer assumes the sole risk and +#- liability of any use of Xilinx products in Critical +#- Applications, subject only to applicable laws and +#- regulations governing limitations on product liability. +#- +#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +#- PART OF THIS FILE AT ALL TIMES. +#- ************************************************************************ + + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __SIM_AP_SHIFT_REG_H__ +#define __SIM_AP_SHIFT_REG_H__ + + +/* + * This file contains a C++ model of shift register. + * It defines C level simulation model. 
+ */ +#ifndef __cplusplus +#error C++ is required to include this header file +#else + +#include + +////////////////////////////////////////////// +// C level simulation model for ap_shift_reg +////////////////////////////////////////////// +template +class ap_shift_reg +{ + public: + /// Constructors + ap_shift_reg() { } + ap_shift_reg(const char* name) { } + /// Destructor + virtual ~ap_shift_reg() { } + + private: + /// Make copy constructor and assignment operator private + ap_shift_reg(const ap_shift_reg< __SHIFT_T__, __SHIFT_DEPTH__ >& shreg) + { + for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i) + Array[i] = shreg.Array[i]; + } + + ap_shift_reg& operator = (const ap_shift_reg< __SHIFT_T__, + __SHIFT_DEPTH__ >& shreg) + { + for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i) + Array[i] = shreg.Array[i]; + return *this; + } + + public: + // Shift the queue, push to back and read from a given address. + __SHIFT_T__ shift(__SHIFT_T__ DataIn, + unsigned int Addr = __SHIFT_DEPTH__ - 1, bool Enable = true) + { + assert(Addr < __SHIFT_DEPTH__ && + "Out-of-bound shift is found in ap_shift_reg."); + __SHIFT_T__ ret = Array[Addr]; + if (Enable) { + for (unsigned int i = __SHIFT_DEPTH__ - 1; i > 0; --i) + Array[i] = Array[i-1]; + Array[0] = DataIn; + } + return ret; + } + + // Read from a given address. + __SHIFT_T__ read(unsigned int Addr = __SHIFT_DEPTH__ - 1) const + { + assert(Addr < __SHIFT_DEPTH__ && + "Out-of-bound read is found in ap_shift_reg."); + return Array[Addr]; + } + + protected: + __SHIFT_T__ Array[__SHIFT_DEPTH__]; +}; + +#endif //__cplusplus + +#endif //__SIM_AP_SHIFT_REG_H__ + + diff --git a/hls4ml/templates/vivado/ap_types/etc/ap_private.h b/hls4ml/templates/vivado/ap_types/etc/ap_private.h index 0c29a0ac1a..7af898d2c5 100644 --- a/hls4ml/templates/vivado/ap_types/etc/ap_private.h +++ b/hls4ml/templates/vivado/ap_types/etc/ap_private.h @@ -1,7199 +1,7199 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_PRIVATE_H__ -#define __AP_PRIVATE_H__ - -// common macros and type declarations are now defined in ap_common.h, and -// ap_private becomes part of it. -#ifndef __AP_COMMON_H__ -#error "etc/ap_private.h cannot be included directly." -#endif - -// forward declarations -//template -//class ap_private; // moved to ap_common.h -template -struct _private_range_ref; -template -struct _private_bit_ref; - -// TODO clean up this part. 
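The ap_shift_reg simulation model reproduced above behaves as a plain delay line: shift() returns the entry at Addr as it was before the shift, and read() samples without modifying state. A short sketch of its use (not part of the header):

#include <ap_shift_reg.h>

int delay_line_demo() {
    ap_shift_reg<int, 4> sr;       // depth-4 delay line
    sr.shift(10);                  // contents become {10, ?, ?, ?}
    sr.shift(20);                  // contents become {20, 10, ?, ?}
    int newest = sr.read(0);       // 20: the most recent sample
    int tapped = sr.shift(30, 1);  // returns the old slot 1 (10), then shifts
    return newest + tapped;        // 30
}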
-#ifndef LLVM_SUPPORT_MATHEXTRAS_H -#define LLVM_SUPPORT_MATHEXTRAS_H - -#ifdef _MSC_VER -#if _MSC_VER <= 1500 -typedef __int8 int8_t; -typedef unsigned __int8 uint8_t; -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -#else -#include -#endif -#else -#include -#endif - -#ifndef INLINE -#define INLINE inline -// Enable to debug ap_int/ap_fixed -// #define INLINE __attribute__((weak)) -#endif - -// NOTE: The following support functions use the _32/_64 extensions instead of -// type overloading so that signed and unsigned integers can be used without -// ambiguity. -namespace AESL_std { -template -DataType INLINE min(DataType a, DataType b) { - return (a >= b) ? b : a; -} - -template -DataType INLINE max(DataType a, DataType b) { - return (a >= b) ? a : b; -} -} // namespace AESL_std - -// TODO clean up included headers. -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ap_private_ops { -/// Hi_32 - This function returns the high 32 bits of a 64 bit value. -static INLINE uint32_t Hi_32(uint64_t Value) { - return static_cast(Value >> 32); -} - -/// Lo_32 - This function returns the low 32 bits of a 64 bit value. -static INLINE uint32_t Lo_32(uint64_t Value) { - return static_cast(Value); -} - -template -INLINE bool isNegative(const ap_private<_AP_W, false>& a) { - return false; -} - -template -INLINE bool isNegative(const ap_private<_AP_W, true>& a) { - enum { - APINT_BITS_PER_WORD = 64, - _AP_N = (_AP_W + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD - }; - static const uint64_t sign_mask = 1ULL << ((_AP_W - 1) % APINT_BITS_PER_WORD); - return (sign_mask & a.get_pVal(_AP_N - 1)) != 0; -} - -/// CountLeadingZeros_32 - this function performs the platform optimal form of -/// counting the number of zeros from the most significant bit to the first one -/// bit. Ex. CountLeadingZeros_32(0x00F000FF) == 8. -/// Returns 32 if the word is zero. -static INLINE unsigned CountLeadingZeros_32(uint32_t Value) { - unsigned Count; // result -#if __GNUC__ >= 4 -// PowerPC is defined for __builtin_clz(0) -#if !defined(__ppc__) && !defined(__ppc64__) - if (Value == 0) return 32; -#endif - Count = __builtin_clz(Value); -#else - if (Value == 0) return 32; - Count = 0; - // bisecton method for count leading zeros - for (unsigned Shift = 32 >> 1; Shift; Shift >>= 1) { - uint32_t Tmp = (Value) >> (Shift); - if (Tmp) { - Value = Tmp; - } else { - Count |= Shift; - } - } -#endif - return Count; -} - -/// CountLeadingZeros_64 - This function performs the platform optimal form -/// of counting the number of zeros from the most significant bit to the first -/// one bit (64 bit edition.) -/// Returns 64 if the word is zero. 
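A worked trace of the bisection loop in CountLeadingZeros_32 above (an editorial illustration, not in the original source), for Value = 0x00F000FF with Count starting at 0:

    Shift=16: Value >> 16 = 0x00F0, nonzero -> Value = 0x00F0
    Shift= 8: Value >>  8 = 0x0000, zero    -> Count |= 8
    Shift= 4: Value >>  4 = 0x000F, nonzero -> Value = 0x000F
    Shift= 2: Value >>  2 = 0x0003, nonzero -> Value = 0x0003
    Shift= 1: Value >>  1 = 0x0001, nonzero -> Value = 0x0001

leaving Count = 8, in agreement with the documented CountLeadingZeros_32(0x00F000FF) == 8.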
-static INLINE unsigned CountLeadingZeros_64(uint64_t Value) { - unsigned Count; // result -#if __GNUC__ >= 4 -// PowerPC is defined for __builtin_clzll(0) -#if !defined(__ppc__) && !defined(__ppc64__) - if (!Value) return 64; -#endif - Count = __builtin_clzll(Value); -#else - if (sizeof(long) == sizeof(int64_t)) { - if (!Value) return 64; - Count = 0; - // bisecton method for count leading zeros - for (unsigned Shift = 64 >> 1; Shift; Shift >>= 1) { - uint64_t Tmp = (Value) >> (Shift); - if (Tmp) { - Value = Tmp; - } else { - Count |= Shift; - } - } - } else { - // get hi portion - uint32_t Hi = Hi_32(Value); - - // if some bits in hi portion - if (Hi) { - // leading zeros in hi portion plus all bits in lo portion - Count = CountLeadingZeros_32(Hi); - } else { - // get lo portion - uint32_t Lo = Lo_32(Value); - // same as 32 bit value - Count = CountLeadingZeros_32(Lo) + 32; - } - } -#endif - return Count; -} - -/// CountTrailingZeros_64 - This function performs the platform optimal form -/// of counting the number of zeros from the least significant bit to the first -/// one bit (64 bit edition.) -/// Returns 64 if the word is zero. -static INLINE unsigned CountTrailingZeros_64(uint64_t Value) { -#if __GNUC__ >= 4 - return (Value != 0) ? __builtin_ctzll(Value) : 64; -#else - static const unsigned Mod67Position[] = { - 64, 0, 1, 39, 2, 15, 40, 23, 3, 12, 16, 59, 41, 19, 24, 54, 4, - 64, 13, 10, 17, 62, 60, 28, 42, 30, 20, 51, 25, 44, 55, 47, 5, 32, - 65, 38, 14, 22, 11, 58, 18, 53, 63, 9, 61, 27, 29, 50, 43, 46, 31, - 37, 21, 57, 52, 8, 26, 49, 45, 36, 56, 7, 48, 35, 6, 34, 33, 0}; - return Mod67Position[(uint64_t)(-(int64_t)Value & (int64_t)Value) % 67]; -#endif -} - -/// CountPopulation_64 - this function counts the number of set bits in a value, -/// (64 bit edition.) 
-static INLINE unsigned CountPopulation_64(uint64_t Value) { -#if __GNUC__ >= 4 - return __builtin_popcountll(Value); -#else - uint64_t v = Value - (((Value) >> 1) & 0x5555555555555555ULL); - v = (v & 0x3333333333333333ULL) + (((v) >> 2) & 0x3333333333333333ULL); - v = (v + ((v) >> 4)) & 0x0F0F0F0F0F0F0F0FULL; - return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); -#endif -} - -static INLINE uint32_t countLeadingOnes_64(uint64_t __V, uint32_t skip) { - uint32_t Count = 0; - if (skip) (__V) <<= (skip); - while (__V && (__V & (1ULL << 63))) { - Count++; - (__V) <<= 1; - } - return Count; -} - -static INLINE std::string oct2Bin(char oct) { - switch (oct) { - case '\0': { - return ""; - } - case '.': { - return "."; - } - case '0': { - return "000"; - } - case '1': { - return "001"; - } - case '2': { - return "010"; - } - case '3': { - return "011"; - } - case '4': { - return "100"; - } - case '5': { - return "101"; - } - case '6': { - return "110"; - } - case '7': { - return "111"; - } - } - assert(0 && "Invalid character in digit string"); - return ""; -} - -static INLINE std::string hex2Bin(char hex) { - switch (hex) { - case '\0': { - return ""; - } - case '.': { - return "."; - } - case '0': { - return "0000"; - } - case '1': { - return "0001"; - } - case '2': { - return "0010"; - } - case '3': { - return "0011"; - } - case '4': { - return "0100"; - } - case '5': { - return "0101"; - } - case '6': { - return "0110"; - } - case '7': { - return "0111"; - } - case '8': { - return "1000"; - } - case '9': { - return "1001"; - } - case 'A': - case 'a': { - return "1010"; - } - case 'B': - case 'b': { - return "1011"; - } - case 'C': - case 'c': { - return "1100"; - } - case 'D': - case 'd': { - return "1101"; - } - case 'E': - case 'e': { - return "1110"; - } - case 'F': - case 'f': { - return "1111"; - } - } - assert(0 && "Invalid character in digit string"); - return ""; -} - -static INLINE uint32_t decode_digit(char cdigit, int radix) { - uint32_t digit = 0; - if (radix == 16) { -#define isxdigit(c) \ - (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') || \ - ((c) >= 'A' && (c) <= 'F')) -#define isdigit(c) ((c) >= '0' && (c) <= '9') - if (!isxdigit(cdigit)) assert(0 && "Invalid hex digit in string"); - if (isdigit(cdigit)) - digit = cdigit - '0'; - else if (cdigit >= 'a') - digit = cdigit - 'a' + 10; - else if (cdigit >= 'A') - digit = cdigit - 'A' + 10; - else - assert(0 && "huh? we shouldn't get here"); - } else if (isdigit(cdigit)) { - digit = cdigit - '0'; - } else { - assert(0 && "Invalid character in digit string"); - } -#undef isxdigit -#undef isdigit - return digit; -} - -// Determine the radix of "val". -static INLINE std::string parseString(const std::string& input, unsigned char& radix) { - size_t len = input.length(); - if (len == 0) { - if (radix == 0) radix = 10; - return input; - } - - size_t startPos = 0; - // Trim whitespace - while (input[startPos] == ' ' && startPos < len) startPos++; - while (input[len - 1] == ' ' && startPos < len) len--; - - std::string val = input.substr(startPos, len - startPos); - // std::cout << "val = " << val << "\n"; - len = val.length(); - startPos = 0; - - // If the length of the string is less than 2, then radix - // is decimal and there is no exponent. 
- if (len < 2) { - if (radix == 0) radix = 10; - return val; - } - - bool isNegative = false; - std::string ans; - - // First check to see if we start with a sign indicator - if (val[0] == '-') { - ans = "-"; - ++startPos; - isNegative = true; - } else if (val[0] == '+') - ++startPos; - - if (len - startPos < 2) { - if (radix == 0) radix = 10; - return val; - } - - if (val.substr(startPos, 2) == "0x" || val.substr(startPos, 2) == "0X") { - // If we start with "0x", then the radix is hex. - radix = 16; - startPos += 2; - } else if (val.substr(startPos, 2) == "0b" || - val.substr(startPos, 2) == "0B") { - // If we start with "0b", then the radix is binary. - radix = 2; - startPos += 2; - } else if (val.substr(startPos, 2) == "0o" || - val.substr(startPos, 2) == "0O") { - // If we start with "0o", then the radix is octal. - radix = 8; - startPos += 2; - } else if (radix == 0) { - radix = 10; - } - - int exp = 0; - if (radix == 10) { - // If radix is decimal, then see if there is an - // exponent indicator. - size_t expPos = val.find('e'); - bool has_exponent = true; - if (expPos == std::string::npos) expPos = val.find('E'); - if (expPos == std::string::npos) { - // No exponent indicator, so the mantissa goes to the end. - expPos = len; - has_exponent = false; - } - // std::cout << "startPos = " << startPos << " " << expPos << "\n"; - - ans += val.substr(startPos, expPos - startPos); - if (has_exponent) { - // Parse the exponent. - std::istringstream iss(val.substr(expPos + 1, len - expPos - 1)); - iss >> exp; - } - } else { - // Check for a binary exponent indicator. - size_t expPos = val.find('p'); - bool has_exponent = true; - if (expPos == std::string::npos) expPos = val.find('P'); - if (expPos == std::string::npos) { - // No exponent indicator, so the mantissa goes to the end. - expPos = len; - has_exponent = false; - } - - // std::cout << "startPos = " << startPos << " " << expPos << "\n"; - - assert(startPos <= expPos); - // Convert to binary as we go. - for (size_t i = startPos; i < expPos; ++i) { - if (radix == 16) { - ans += hex2Bin(val[i]); - } else if (radix == 8) { - ans += oct2Bin(val[i]); - } else { // radix == 2 - ans += val[i]; - } - } - // End in binary - radix = 2; - if (has_exponent) { - // Parse the exponent. - std::istringstream iss(val.substr(expPos + 1, len - expPos - 1)); - iss >> exp; - } - } - if (exp == 0) return ans; - - size_t decPos = ans.find('.'); - if (decPos == std::string::npos) decPos = ans.length(); - if ((int)decPos + exp >= (int)ans.length()) { - int i = decPos; - for (; i < (int)ans.length() - 1; ++i) ans[i] = ans[i + 1]; - for (; i < (int)ans.length(); ++i) ans[i] = '0'; - for (; i < (int)decPos + exp; ++i) ans += '0'; - return ans; - } else if ((int)decPos + exp < (int)isNegative) { - std::string dupAns = "0."; - if (ans[0] == '-') dupAns = "-0."; - for (int i = 0; i < isNegative - (int)decPos - exp; ++i) dupAns += '0'; - for (size_t i = isNegative; i < ans.length(); ++i) - if (ans[i] != '.') dupAns += ans[i]; - return dupAns; - } - - if (exp > 0) - for (size_t i = decPos; i < decPos + exp; ++i) ans[i] = ans[i + 1]; - else { - if (decPos == ans.length()) ans += ' '; - for (int i = decPos; i > (int)decPos + exp; --i) ans[i] = ans[i - 1]; - } - ans[decPos + exp] = '.'; - return ans; -} - -/// sub_1 - This function subtracts a single "digit" (64-bit word), y, from -/// the multi-digit integer array, x[], propagating the borrowed 1 value until -/// no further borrowing is neeeded or it runs out of "digits" in x. 
The result -/// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted. -/// In other words, if y > x then this function returns 1, otherwise 0. -/// @returns the borrow out of the subtraction -static INLINE bool sub_1(uint64_t x[], uint32_t len, uint64_t y) { - for (uint32_t i = 0; i < len; ++i) { - uint64_t __X = x[i]; - x[i] -= y; - if (y > __X) - y = 1; // We have to "borrow 1" from next "digit" - else { - y = 0; // No need to borrow - break; // Remaining digits are unchanged so exit early - } - } - return (y != 0); -} - -/// add_1 - This function adds a single "digit" integer, y, to the multiple -/// "digit" integer array, x[]. x[] is modified to reflect the addition and -/// 1 is returned if there is a carry out, otherwise 0 is returned. -/// @returns the carry of the addition. -static INLINE bool add_1(uint64_t dest[], uint64_t x[], uint32_t len, - uint64_t y) { - for (uint32_t i = 0; i < len; ++i) { - dest[i] = y + x[i]; - if (dest[i] < y) - y = 1; // Carry one to next digit. - else { - y = 0; // No need to carry so exit early - break; - } - } - return (y != 0); -} - -/// add - This function adds the integer array x to the integer array Y and -/// places the result in dest. -/// @returns the carry out from the addition -/// @brief General addition of 64-bit integer arrays -static INLINE bool add(uint64_t* dest, const uint64_t* x, const uint64_t* y, - uint32_t destlen, uint32_t xlen, uint32_t ylen, - bool xsigned, bool ysigned) { - bool carry = false; - uint32_t len = AESL_std::min(xlen, ylen); - uint32_t i; - for (i = 0; i < len && i < destlen; ++i) { - uint64_t limit = - AESL_std::min(x[i], y[i]); // must come first in case dest == x - dest[i] = x[i] + y[i] + carry; - carry = dest[i] < limit || (carry && dest[i] == limit); - } - if (xlen > ylen) { - const uint64_t yext = ysigned && int64_t(y[ylen - 1]) < 0 ? -1 : 0; - for (i = ylen; i < xlen && i < destlen; i++) { - uint64_t limit = AESL_std::min(x[i], yext); - dest[i] = x[i] + yext + carry; - carry = (dest[i] < limit) || (carry && dest[i] == limit); - } - } else if (ylen > xlen) { - const uint64_t xext = xsigned && int64_t(x[xlen - 1]) < 0 ? -1 : 0; - for (i = xlen; i < ylen && i < destlen; i++) { - uint64_t limit = AESL_std::min(xext, y[i]); - dest[i] = xext + y[i] + carry; - carry = (dest[i] < limit) || (carry && dest[i] == limit); - } - } - return carry; -} - -/// @returns returns the borrow out. -/// @brief Generalized subtraction of 64-bit integer arrays. -static INLINE bool sub(uint64_t* dest, const uint64_t* x, const uint64_t* y, - uint32_t destlen, uint32_t xlen, uint32_t ylen, - bool xsigned, bool ysigned) { - bool borrow = false; - uint32_t i; - uint32_t len = AESL_std::min(xlen, ylen); - for (i = 0; i < len && i < destlen; ++i) { - uint64_t x_tmp = borrow ? x[i] - 1 : x[i]; - borrow = y[i] > x_tmp || (borrow && x[i] == 0); - dest[i] = x_tmp - y[i]; - } - if (xlen > ylen) { - const uint64_t yext = ysigned && int64_t(y[ylen - 1]) < 0 ? -1 : 0; - for (i = ylen; i < xlen && i < destlen; i++) { - uint64_t x_tmp = borrow ? x[i] - 1 : x[i]; - borrow = yext > x_tmp || (borrow && x[i] == 0); - dest[i] = x_tmp - yext; - } - } else if (ylen > xlen) { - const uint64_t xext = xsigned && int64_t(x[xlen - 1]) < 0 ? -1 : 0; - for (i = xlen; i < ylen && i < destlen; i++) { - uint64_t x_tmp = borrow ? 
xext - 1 : xext; - borrow = y[i] > x_tmp || (borrow && xext == 0); - dest[i] = x_tmp - y[i]; - } - } - return borrow; -} - -/// Subtracts the RHS ap_private from this ap_private -/// @returns this, after subtraction -/// @brief Subtraction assignment operator. - -/// Multiplies an integer array, x by a a uint64_t integer and places the result -/// into dest. -/// @returns the carry out of the multiplication. -/// @brief Multiply a multi-digit ap_private by a single digit (64-bit) integer. -static INLINE uint64_t mul_1(uint64_t dest[], const uint64_t x[], uint32_t len, - uint64_t y) { - // Split y into high 32-bit part (hy) and low 32-bit part (ly) - uint64_t ly = y & 0xffffffffULL, hy = (y) >> 32; - uint64_t carry = 0; - static const uint64_t two_power_32 = 1ULL << 32; - // For each digit of x. - for (uint32_t i = 0; i < len; ++i) { - // Split x into high and low words - uint64_t lx = x[i] & 0xffffffffULL; - uint64_t hx = (x[i]) >> 32; - // hasCarry - A flag to indicate if there is a carry to the next digit. - // hasCarry == 0, no carry - // hasCarry == 1, has carry - // hasCarry == 2, no carry and the calculation result == 0. - uint8_t hasCarry = 0; - dest[i] = carry + lx * ly; - // Determine if the add above introduces carry. - hasCarry = (dest[i] < carry) ? 1 : 0; - carry = hx * ly + ((dest[i]) >> 32) + (hasCarry ? two_power_32 : 0); - // The upper limit of carry can be (2^32 - 1)(2^32 - 1) + - // (2^32 - 1) + 2^32 = 2^64. - hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0); - - carry += (lx * hy) & 0xffffffffULL; - dest[i] = ((carry) << 32) | (dest[i] & 0xffffffffULL); - carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? two_power_32 : 0) + - ((carry) >> 32) + ((lx * hy) >> 32) + hx * hy; - } - return carry; -} - -/// Multiplies integer array x by integer array y and stores the result into -/// the integer array dest. Note that dest's size must be >= xlen + ylen in -/// order to -/// do a full precision computation. If it is not, then only the low-order words -/// are returned. -/// @brief Generalized multiplicate of integer arrays. -static INLINE void mul(uint64_t dest[], const uint64_t x[], uint32_t xlen, - const uint64_t y[], uint32_t ylen, uint32_t destlen) { - assert(xlen > 0); - assert(ylen > 0); - assert(destlen >= xlen + ylen); - if (xlen < destlen) dest[xlen] = mul_1(dest, x, xlen, y[0]); - for (uint32_t i = 1; i < ylen; ++i) { - uint64_t ly = y[i] & 0xffffffffULL, hy = (y[i]) >> 32; - uint64_t carry = 0, lx = 0, hx = 0; - for (uint32_t j = 0; j < xlen; ++j) { - lx = x[j] & 0xffffffffULL; - hx = (x[j]) >> 32; - // hasCarry - A flag to indicate if has carry. - // hasCarry == 0, no carry - // hasCarry == 1, has carry - // hasCarry == 2, no carry and the calculation result == 0. - uint8_t hasCarry = 0; - uint64_t resul = carry + lx * ly; - hasCarry = (resul < carry) ? 1 : 0; - carry = (hasCarry ? (1ULL << 32) : 0) + hx * ly + ((resul) >> 32); - hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0); - carry += (lx * hy) & 0xffffffffULL; - resul = ((carry) << 32) | (resul & 0xffffffffULL); - if (i + j < destlen) dest[i + j] += resul; - carry = - (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0) + - ((carry) >> 32) + (dest[i + j] < resul ? 1 : 0) + ((lx * hy) >> 32) + - hx * hy; - } - if (i + xlen < destlen) dest[i + xlen] = carry; - } -} - -/// Implementation of Knuth's Algorithm D (Division of nonnegative integers) -/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The -/// variables here have the same names as in the algorithm. 
Comments explain -/// the algorithm and any deviation from it. -static INLINE void KnuthDiv(uint32_t* u, uint32_t* v, uint32_t* q, uint32_t* r, - uint32_t m, uint32_t n) { - assert(u && "Must provide dividend"); - assert(v && "Must provide divisor"); - assert(q && "Must provide quotient"); - assert(u != v && u != q && v != q && "Must us different memory"); - assert(n > 1 && "n must be > 1"); - - // Knuth uses the value b as the base of the number system. In our case b - // is 2^31 so we just set it to -1u. - uint64_t b = uint64_t(1) << 32; - - // DEBUG(cerr << "KnuthDiv: m=" << m << " n=" << n << '\n'); - // DEBUG(cerr << "KnuthDiv: original:"); - // DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << - // u[i]); - // DEBUG(cerr << " by"); - // DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << - // v[i-1]); - // DEBUG(cerr << '\n'); - // D1. [Normalize.] Set d = b / (v[n-1] + 1) and multiply all the digits of - // u and v by d. Note that we have taken Knuth's advice here to use a power - // of 2 value for d such that d * v[n-1] >= b/2 (b is the base). A power of - // 2 allows us to shift instead of multiply and it is easy to determine the - // shift amount from the leading zeros. We are basically normalizing the u - // and v so that its high bits are shifted to the top of v's range without - // overflow. Note that this can require an extra word in u so that u must - // be of length m+n+1. - uint32_t shift = CountLeadingZeros_32(v[n - 1]); - uint32_t v_carry = 0; - uint32_t u_carry = 0; - if (shift) { - for (uint32_t i = 0; i < m + n; ++i) { - uint32_t u_tmp = (u[i]) >> (32 - shift); - u[i] = ((u[i]) << (shift)) | u_carry; - u_carry = u_tmp; - } - for (uint32_t i = 0; i < n; ++i) { - uint32_t v_tmp = (v[i]) >> (32 - shift); - v[i] = ((v[i]) << (shift)) | v_carry; - v_carry = v_tmp; - } - } - u[m + n] = u_carry; - // DEBUG(cerr << "KnuthDiv: normal:"); - // DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << - // u[i]); - // DEBUG(cerr << " by"); - // DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << - // v[i-1]); - // DEBUG(cerr << '\n'); - - // D2. [Initialize j.] Set j to m. This is the loop counter over the places. - int j = m; - do { - // DEBUG(cerr << "KnuthDiv: quotient digit #" << j << '\n'); - // D3. [Calculate q'.]. - // Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q') - // Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r') - // Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease - // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test - // on v[n-2] determines at high speed most of the cases in which the trial - // value qp is one too large, and it eliminates all cases where qp is two - // too large. - uint64_t dividend = ((uint64_t(u[j + n]) << 32) + u[j + n - 1]); - // DEBUG(cerr << "KnuthDiv: dividend == " << dividend << '\n'); - uint64_t qp = dividend / v[n - 1]; - uint64_t rp = dividend % v[n - 1]; - if (qp == b || qp * v[n - 2] > b * rp + u[j + n - 2]) { - qp--; - rp += v[n - 1]; - if (rp < b && (qp == b || qp * v[n - 2] > b * rp + u[j + n - 2])) qp--; - } - // DEBUG(cerr << "KnuthDiv: qp == " << qp << ", rp == " << rp << '\n'); - - // D4. [Multiply and subtract.] Replace (u[j+n]u[j+n-1]...u[j]) with - // (u[j+n]u[j+n-1]..u[j]) - qp * (v[n-1]...v[1]v[0]). This computation - // consists of a simple multiplication by a one-place number, combined with - // a subtraction. 
- bool isNeg = false; - for (uint32_t i = 0; i < n; ++i) { - uint64_t u_tmp = uint64_t(u[j + i]) | ((uint64_t(u[j + i + 1])) << 32); - uint64_t subtrahend = uint64_t(qp) * uint64_t(v[i]); - bool borrow = subtrahend > u_tmp; - /*DEBUG(cerr << "KnuthDiv: u_tmp == " << u_tmp - << ", subtrahend == " << subtrahend - << ", borrow = " << borrow << '\n');*/ - - uint64_t result = u_tmp - subtrahend; - uint32_t k = j + i; - u[k++] = (uint32_t)(result & (b - 1)); // subtract low word - u[k++] = (uint32_t)((result) >> 32); // subtract high word - while (borrow && k <= m + n) { // deal with borrow to the left - borrow = u[k] == 0; - u[k]--; - k++; - } - isNeg |= borrow; - /*DEBUG(cerr << "KnuthDiv: u[j+i] == " << u[j+i] << ", u[j+i+1] == " << - u[j+i+1] << '\n');*/ - } - /*DEBUG(cerr << "KnuthDiv: after subtraction:"); - DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]); - DEBUG(cerr << '\n');*/ - // The digits (u[j+n]...u[j]) should be kept positive; if the result of - // this step is actually negative, (u[j+n]...u[j]) should be left as the - // true value plus b**(n+1), namely as the b's complement of - // the true value, and a "borrow" to the left should be remembered. - // - if (isNeg) { - bool carry = true; // true because b's complement is "complement + 1" - for (uint32_t i = 0; i <= m + n; ++i) { - u[i] = ~u[i] + carry; // b's complement - carry = carry && u[i] == 0; - } - } - /*DEBUG(cerr << "KnuthDiv: after complement:"); - DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]); - DEBUG(cerr << '\n');*/ - - // D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was - // negative, go to step D6; otherwise go on to step D7. - q[j] = (uint32_t)qp; - if (isNeg) { - // D6. [Add back]. The probability that this step is necessary is very - // small, on the order of only 2/b. Make sure that test data accounts for - // this possibility. Decrease q[j] by 1 - q[j]--; - // and add (0v[n-1]...v[1]v[0]) to (u[j+n]u[j+n-1]...u[j+1]u[j]). - // A carry will occur to the left of u[j+n], and it should be ignored - // since it cancels with the borrow that occurred in D4. - bool carry = false; - for (uint32_t i = 0; i < n; i++) { - uint32_t limit = AESL_std::min(u[j + i], v[i]); - u[j + i] += v[i] + carry; - carry = u[j + i] < limit || (carry && u[j + i] == limit); - } - u[j + n] += carry; - } - /*DEBUG(cerr << "KnuthDiv: after correction:"); - DEBUG(for (int i = m+n; i >=0; i--) cerr <<" " << u[i]); - DEBUG(cerr << "\nKnuthDiv: digit result = " << q[j] << '\n');*/ - - // D7. [Loop on j.] Decrease j by one. Now if j >= 0, go back to D3. - } while (--j >= 0); - - /*DEBUG(cerr << "KnuthDiv: quotient:"); - DEBUG(for (int i = m; i >=0; i--) cerr <<" " << q[i]); - DEBUG(cerr << '\n');*/ - - // D8. [Unnormalize]. Now q[...] is the desired quotient, and the desired - // remainder may be obtained by dividing u[...] by d. If r is non-null we - // compute the remainder (urem uses this). - if (r) { - // The value d is expressed by the "shift" value above since we avoided - // multiplication by d by using a shift left. So, all we have to do is - // shift right here. 
-    if (shift) {
-      uint32_t carry = 0;
-      // DEBUG(cerr << "KnuthDiv: remainder:");
-      for (int i = n - 1; i >= 0; i--) {
-        r[i] = ((u[i]) >> (shift)) | carry;
-        carry = (u[i]) << (32 - shift);
-        // DEBUG(cerr << " " << r[i]);
-      }
-    } else {
-      for (int i = n - 1; i >= 0; i--) {
-        r[i] = u[i];
-        // DEBUG(cerr << " " << r[i]);
-      }
-    }
-    // DEBUG(cerr << '\n');
-  }
-  // DEBUG(cerr << std::setbase(10) << '\n');
-}
-
-template <int _AP_W, bool _AP_S>
-void divide(const ap_private<_AP_W, _AP_S>& LHS, uint32_t lhsWords,
-            const ap_private<_AP_W, _AP_S>& RHS, uint32_t rhsWords,
-            ap_private<_AP_W, _AP_S>* Quotient,
-            ap_private<_AP_W, _AP_S>* Remainder) {
-  assert(lhsWords >= rhsWords && "Fractional result");
-  enum { APINT_BITS_PER_WORD = 64 };
-  // First, compose the values into an array of 32-bit words instead of
-  // 64-bit words. This is a necessity of both the "short division" algorithm
-  // and the Knuth "classical algorithm" which requires there to be native
-  // operations for +, -, and * on an m bit value with an m*2 bit result. We
-  // can't use 64-bit operands here because we don't have native results of
-  // 128-bits. Furthermore, casting the 64-bit values to 32-bit values won't
-  // work on big-endian machines.
-  uint64_t mask = ~0ull >> (sizeof(uint32_t) * 8);
-  uint32_t n = rhsWords * 2;
-  uint32_t m = (lhsWords * 2) - n;
-
-  // Allocate space for the temporary values we need either on the stack, if
-  // it will fit, or on the heap if it won't.
-  uint32_t SPACE[128];
-  uint32_t* __U = 0;
-  uint32_t* __V = 0;
-  uint32_t* __Q = 0;
-  uint32_t* __R = 0;
-  if ((Remainder ? 4 : 3) * n + 2 * m + 1 <= 128) {
-    __U = &SPACE[0];
-    __V = &SPACE[m + n + 1];
-    __Q = &SPACE[(m + n + 1) + n];
-    if (Remainder) __R = &SPACE[(m + n + 1) + n + (m + n)];
-  } else {
-    __U = new uint32_t[m + n + 1];
-    __V = new uint32_t[n];
-    __Q = new uint32_t[m + n];
-    if (Remainder) __R = new uint32_t[n];
-  }
-
-  // Initialize the dividend
-  memset(__U, 0, (m + n + 1) * sizeof(uint32_t));
-  for (unsigned i = 0; i < lhsWords; ++i) {
-    uint64_t tmp = LHS.get_pVal(i);
-    __U[i * 2] = (uint32_t)(tmp & mask);
-    __U[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8);
-  }
-  __U[m + n] = 0;  // this extra word is for "spill" in the Knuth algorithm.
-
-  // Initialize the divisor
-  memset(__V, 0, (n) * sizeof(uint32_t));
-  for (unsigned i = 0; i < rhsWords; ++i) {
-    uint64_t tmp = RHS.get_pVal(i);
-    __V[i * 2] = (uint32_t)(tmp & mask);
-    __V[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8);
-  }
-
-  // initialize the quotient and remainder
-  memset(__Q, 0, (m + n) * sizeof(uint32_t));
-  if (Remainder) memset(__R, 0, n * sizeof(uint32_t));
-
-  // Now, adjust m and n for the Knuth division. n is the number of words in
-  // the divisor. m is the number of words by which the dividend exceeds the
-  // divisor (i.e. m+n is the length of the dividend). These sizes must not
-  // contain any zero words or the Knuth algorithm fails.
-  for (unsigned i = n; i > 0 && __V[i - 1] == 0; i--) {
-    n--;
-    m++;
-  }
-  for (unsigned i = m + n; i > 0 && __U[i - 1] == 0; i--) m--;
-
-  // If we're left with only a single word for the divisor, Knuth doesn't work
-  // so we implement the short division algorithm here. This is much simpler
-  // and faster because we are certain that we can divide a 64-bit quantity
-  // by a 32-bit quantity at hardware speed and short division is simply a
-  // series of such operations. This is just like doing short division but we
-  // are using base 2^32 instead of base 10.
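  // [Editor's note] Hypothetical walk-through of the short-division path
  // below (operands are illustrative, base b = 2^32): take __U = {5, 1},
  // i.e. the value 1*b + 5 = 4294967301, and divisor = 3. The loop runs
  // i = 1 first: partial_dividend = 1 < 3, so __Q[1] = 0 and remainder = 1.
  // Then i = 0: partial_dividend = (1 << 32) | 5 = 4294967301, giving
  // __Q[0] = 1431655767 and remainder = 0 (check: 1431655767 * 3 =
  // 4294967301). Each step is one 64-by-32-bit hardware divide, which is
  // what makes this path cheap.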
- assert(n != 0 && "Divide by zero?"); - if (n == 1) { - uint32_t divisor = __V[0]; - uint32_t remainder = 0; - for (int i = m + n - 1; i >= 0; i--) { - uint64_t partial_dividend = (uint64_t(remainder)) << 32 | __U[i]; - if (partial_dividend == 0) { - __Q[i] = 0; - remainder = 0; - } else if (partial_dividend < divisor) { - __Q[i] = 0; - remainder = (uint32_t)partial_dividend; - } else if (partial_dividend == divisor) { - __Q[i] = 1; - remainder = 0; - } else { - __Q[i] = (uint32_t)(partial_dividend / divisor); - remainder = (uint32_t)(partial_dividend - (__Q[i] * divisor)); - } - } - if (__R) __R[0] = remainder; - } else { - // Now we're ready to invoke the Knuth classical divide algorithm. In this - // case n > 1. - KnuthDiv(__U, __V, __Q, __R, m, n); - } - - // If the caller wants the quotient - if (Quotient) { - // Set up the Quotient value's memory. - if (Quotient->BitWidth != LHS.BitWidth) { - if (Quotient->isSingleWord()) Quotient->set_VAL(0); - } else - Quotient->clear(); - - // The quotient is in Q. Reconstitute the quotient into Quotient's low - // order words. - if (lhsWords == 1) { - uint64_t tmp = - uint64_t(__Q[0]) | ((uint64_t(__Q[1])) << (APINT_BITS_PER_WORD / 2)); - Quotient->set_VAL(tmp); - } else { - assert(!Quotient->isSingleWord() && - "Quotient ap_private not large enough"); - for (unsigned i = 0; i < lhsWords; ++i) - Quotient->set_pVal( - i, uint64_t(__Q[i * 2]) | - ((uint64_t(__Q[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); - } - Quotient->clearUnusedBits(); - } - - // If the caller wants the remainder - if (Remainder) { - // Set up the Remainder value's memory. - if (Remainder->BitWidth != RHS.BitWidth) { - if (Remainder->isSingleWord()) Remainder->set_VAL(0); - } else - Remainder->clear(); - - // The remainder is in R. Reconstitute the remainder into Remainder's low - // order words. - if (rhsWords == 1) { - uint64_t tmp = - uint64_t(__R[0]) | ((uint64_t(__R[1])) << (APINT_BITS_PER_WORD / 2)); - Remainder->set_VAL(tmp); - } else { - assert(!Remainder->isSingleWord() && - "Remainder ap_private not large enough"); - for (unsigned i = 0; i < rhsWords; ++i) - Remainder->set_pVal( - i, uint64_t(__R[i * 2]) | - ((uint64_t(__R[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); - } - Remainder->clearUnusedBits(); - } - - // Clean up the memory we allocated. - if (__U != &SPACE[0]) { - delete[] __U; - delete[] __V; - delete[] __Q; - delete[] __R; - } -} - -template -void divide(const ap_private<_AP_W, _AP_S>& LHS, uint32_t lhsWords, - uint64_t RHS, ap_private<_AP_W, _AP_S>* Quotient, - ap_private<_AP_W, _AP_S>* Remainder) { - uint32_t rhsWords = 1; - assert(lhsWords >= rhsWords && "Fractional result"); - enum { APINT_BITS_PER_WORD = 64 }; - // First, compose the values into an array of 32-bit words instead of - // 64-bit words. This is a necessity of both the "short division" algorithm - // and the the Knuth "classical algorithm" which requires there to be native - // operations for +, -, and * on an m bit value with an m*2 bit result. We - // can't use 64-bit operands here because we don't have native results of - // 128-bits. Furthremore, casting the 64-bit values to 32-bit values won't - // work on large-endian machines. - uint64_t mask = ~0ull >> (sizeof(uint32_t) * 8); - uint32_t n = 2; - uint32_t m = (lhsWords * 2) - n; - - // Allocate space for the temporary values we need either on the stack, if - // it will fit, or on the heap if it won't. - uint32_t SPACE[128]; - uint32_t* __U = 0; - uint32_t* __V = 0; - uint32_t* __Q = 0; - uint32_t* __R = 0; - if ((Remainder ? 
4 : 3) * n + 2 * m + 1 <= 128) { - __U = &SPACE[0]; - __V = &SPACE[m + n + 1]; - __Q = &SPACE[(m + n + 1) + n]; - if (Remainder) __R = &SPACE[(m + n + 1) + n + (m + n)]; - } else { - __U = new uint32_t[m + n + 1]; - __V = new uint32_t[n]; - __Q = new uint32_t[m + n]; - if (Remainder) __R = new uint32_t[n]; - } - - // Initialize the dividend - memset(__U, 0, (m + n + 1) * sizeof(uint32_t)); - for (unsigned i = 0; i < lhsWords; ++i) { - uint64_t tmp = LHS.get_pVal(i); - __U[i * 2] = tmp & mask; - __U[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8); - } - __U[m + n] = 0; // this extra word is for "spill" in the Knuth algorithm. - - // Initialize the divisor - memset(__V, 0, (n) * sizeof(uint32_t)); - __V[0] = RHS & mask; - __V[1] = (RHS) >> (sizeof(uint32_t) * 8); - - // initialize the quotient and remainder - memset(__Q, 0, (m + n) * sizeof(uint32_t)); - if (Remainder) memset(__R, 0, n * sizeof(uint32_t)); - - // Now, adjust m and n for the Knuth division. n is the number of words in - // the divisor. m is the number of words by which the dividend exceeds the - // divisor (i.e. m+n is the length of the dividend). These sizes must not - // contain any zero words or the Knuth algorithm fails. - for (unsigned i = n; i > 0 && __V[i - 1] == 0; i--) { - n--; - m++; - } - for (unsigned i = m + n; i > 0 && __U[i - 1] == 0; i--) m--; - - // If we're left with only a single word for the divisor, Knuth doesn't work - // so we implement the short division algorithm here. This is much simpler - // and faster because we are certain that we can divide a 64-bit quantity - // by a 32-bit quantity at hardware speed and short division is simply a - // series of such operations. This is just like doing short division but we - // are using base 2^32 instead of base 10. - assert(n != 0 && "Divide by zero?"); - if (n == 1) { - uint32_t divisor = __V[0]; - uint32_t remainder = 0; - for (int i = m + n - 1; i >= 0; i--) { - uint64_t partial_dividend = (uint64_t(remainder)) << 32 | __U[i]; - if (partial_dividend == 0) { - __Q[i] = 0; - remainder = 0; - } else if (partial_dividend < divisor) { - __Q[i] = 0; - remainder = partial_dividend; - } else if (partial_dividend == divisor) { - __Q[i] = 1; - remainder = 0; - } else { - __Q[i] = partial_dividend / divisor; - remainder = partial_dividend - (__Q[i] * divisor); - } - } - if (__R) __R[0] = remainder; - } else { - // Now we're ready to invoke the Knuth classical divide algorithm. In this - // case n > 1. - KnuthDiv(__U, __V, __Q, __R, m, n); - } - - // If the caller wants the quotient - if (Quotient) { - // Set up the Quotient value's memory. - if (Quotient->BitWidth != LHS.BitWidth) { - if (Quotient->isSingleWord()) Quotient->set_VAL(0); - } else - Quotient->clear(); - - // The quotient is in Q. Reconstitute the quotient into Quotient's low - // order words. - if (lhsWords == 1) { - uint64_t tmp = - uint64_t(__Q[0]) | ((uint64_t(__Q[1])) << (APINT_BITS_PER_WORD / 2)); - Quotient->set_VAL(tmp); - } else { - assert(!Quotient->isSingleWord() && - "Quotient ap_private not large enough"); - for (unsigned i = 0; i < lhsWords; ++i) - Quotient->set_pVal( - i, uint64_t(__Q[i * 2]) | - ((uint64_t(__Q[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); - } - Quotient->clearUnusedBits(); - } - - // If the caller wants the remainder - if (Remainder) { - // Set up the Remainder value's memory. - if (Remainder->BitWidth != 64 /* RHS.BitWidth */) { - if (Remainder->isSingleWord()) Remainder->set_VAL(0); - } else - Remainder->clear(); - - // The remainder is in __R. 
Reconstitute the remainder into Remainder's low - // order words. - if (rhsWords == 1) { - uint64_t tmp = - uint64_t(__R[0]) | ((uint64_t(__R[1])) << (APINT_BITS_PER_WORD / 2)); - Remainder->set_VAL(tmp); - } else { - assert(!Remainder->isSingleWord() && - "Remainder ap_private not large enough"); - for (unsigned i = 0; i < rhsWords; ++i) - Remainder->set_pVal( - i, uint64_t(__R[i * 2]) | - ((uint64_t(__R[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); - } - Remainder->clearUnusedBits(); - } - - // Clean up the memory we allocated. - if (__U != &SPACE[0]) { - delete[] __U; - delete[] __V; - delete[] __Q; - delete[] __R; - } -} - -/// @brief Logical right-shift function. -template -INLINE ap_private<_AP_W, _AP_S, _AP_C> lshr( - const ap_private<_AP_W, _AP_S, _AP_C>& LHS, uint32_t shiftAmt) { - return LHS.lshr(shiftAmt); -} - -/// Left-shift the ap_private by shiftAmt. -/// @brief Left-shift function. -template -INLINE ap_private<_AP_W, _AP_S, _AP_C> shl( - const ap_private<_AP_W, _AP_S, _AP_C>& LHS, uint32_t shiftAmt) { - return LHS.shl(shiftAmt); -} - -} // namespace ap_private_ops - -#endif // LLVM_SUPPORT_MATHEXTRAS_H - -/// This enumeration just provides for internal constants used in this -/// translation unit. -enum { - MIN_INT_BITS = 1, ///< Minimum number of bits that can be specified - ///< Note that this must remain synchronized with IntegerType::MIN_INT_BITS - MAX_INT_BITS = (1 << 23) - 1 ///< Maximum number of bits that can be specified - ///< Note that this must remain synchronized with IntegerType::MAX_INT_BITS -}; - -//===----------------------------------------------------------------------===// -// ap_private Class -//===----------------------------------------------------------------------===// - -/// ap_private - This class represents arbitrary precision constant integral -/// values. -/// It is a functional replacement for common case unsigned integer type like -/// "unsigned", "unsigned long" or "uint64_t", but also allows non-byte-width -/// integer sizes and large integer value types such as 3-bits, 15-bits, or more -/// than 64-bits of precision. ap_private provides a variety of arithmetic -/// operators -/// and methods to manipulate integer values of any bit-width. It supports both -/// the typical integer arithmetic and comparison operations as well as bitwise -/// manipulation. -/// -/// The class has several invariants worth noting: -/// * All bit, byte, and word positions are zero-based. -/// * Once the bit width is set, it doesn't change except by the Truncate, -/// SignExtend, or ZeroExtend operations. -/// * All binary operators must be on ap_private instances of the same bit -/// width. -/// Attempting to use these operators on instances with different bit -/// widths will yield an assertion. -/// * The value is stored canonically as an unsigned value. For operations -/// where it makes a difference, there are both signed and unsigned variants -/// of the operation. For example, sdiv and udiv. However, because the bit -/// widths must be the same, operations such as Mul and Add produce the same -/// results regardless of whether the values are interpreted as signed or -/// not. -/// * In general, the class tries to follow the style of computation that LLVM -/// uses in its IR. This simplifies its use for LLVM. -/// -/// @brief Class for arbitrary precision integers. 
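The invariants documented above are easiest to see in use. The sketch below is editorial and hypothetical, not part of the patch: `check_invariants` is an illustrative name, the 7-bit width is arbitrary, and it assumes `ap_private` is reachable through the usual ap_int umbrella headers this file ships with.

    #include <cassert>

    // Illustrative only: exercises the invariants documented above.
    void check_invariants() {
      ap_private<7, true> s = -3;    // stored canonically as unsigned bits
      ap_private<7, false> u = 125;  // 125 == 0b1111101, the same 7-bit pattern
      assert(s.getBitWidth() == 7);  // width is fixed at construction
      // Signedness matters only where the class says it does (sdiv/udiv,
      // srem/urem); add/mul would give identical bit patterns either way.
      assert(s.srem(ap_private<7, true>(2)) == ap_private<7, true>(-1));
      assert(u.udiv(ap_private<7, false>(5)) == ap_private<7, false>(25));
    }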
- -#if defined(_MSC_VER) -#if _MSC_VER < 1400 && !defined(for) -#define for if (0); else for -#endif -typedef unsigned __int64 ap_ulong; -typedef signed __int64 ap_slong; -#else -typedef unsigned long long ap_ulong; -typedef signed long long ap_slong; -#endif -template -struct valtype; - -template -struct valtype<_AP_N8, false> { - typedef uint64_t Type; -}; - -template -struct valtype<_AP_N8, true> { - typedef int64_t Type; -}; - -template <> -struct valtype<1, false> { - typedef unsigned char Type; -}; -template <> -struct valtype<2, false> { - typedef unsigned short Type; -}; -template <> -struct valtype<3, false> { - typedef unsigned int Type; -}; -template <> -struct valtype<4, false> { - typedef unsigned int Type; -}; -template <> -struct valtype<1, true> { - typedef signed char Type; -}; -template <> -struct valtype<2, true> { - typedef short Type; -}; -template <> -struct valtype<3, true> { - typedef int Type; -}; -template <> -struct valtype<4, true> { - typedef int Type; -}; - -template -struct ap_private_enable_if {}; -template <> -struct ap_private_enable_if { - static const bool isValid = true; -}; - -// When bitwidth < 64 -template -class ap_private<_AP_W, _AP_S, true> { - // SFINAE pattern. Only consider this class when _AP_W <= 64 - const static bool valid = ap_private_enable_if<_AP_W <= 64>::isValid; - -#ifdef _MSC_VER -#pragma warning(disable : 4521 4522) -#endif - public: - typedef typename valtype<(_AP_W + 7) / 8, _AP_S>::Type ValType; - typedef ap_private<_AP_W, _AP_S> Type; - template - struct RType { - enum { - mult_w = _AP_W + _AP_W2, - mult_s = _AP_S || _AP_S2, - plus_w = - AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, - plus_s = _AP_S || _AP_S2, - minus_w = - AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, - minus_s = true, - div_w = _AP_W + _AP_S2, - div_s = _AP_S || _AP_S2, - mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)), - mod_s = _AP_S, - logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)), - logic_s = _AP_S || _AP_S2 - }; - typedef ap_private mult; - typedef ap_private plus; - typedef ap_private minus; - typedef ap_private logic; - typedef ap_private div; - typedef ap_private mod; - typedef ap_private<_AP_W, _AP_S> arg1; - typedef bool reduce; - }; - enum { APINT_BITS_PER_WORD = sizeof(uint64_t) * 8 }; - enum { - excess_bits = (_AP_W % APINT_BITS_PER_WORD) - ? APINT_BITS_PER_WORD - (_AP_W % APINT_BITS_PER_WORD) - : 0 - }; - static const uint64_t mask = ((uint64_t)~0ULL >> (excess_bits)); - static const uint64_t not_mask = ~mask; - static const uint64_t sign_bit_mask = 1ULL << (APINT_BITS_PER_WORD - 1); - template - struct sign_ext_mask { - static const uint64_t mask = ~0ULL << _AP_W1; - }; - static const int width = _AP_W; - - enum { - BitWidth = _AP_W, - _AP_N = 1, - }; - ValType VAL; ///< Used to store the <= 64 bits integer value. 
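  // [Editor's note] Worked values for the constants above at a sample width
  // (_AP_W = 15, purely illustrative): excess_bits = 64 - 15 = 49, so
  //   mask          = ~0ULL >> 49 = 0x7fff  (the 15 usable bits),
  //   not_mask      = ~0x7fffULL            (the 49 padding bits),
  //   sign_bit_mask = 1ULL << 63            (top bit of the 64-bit word).
  // clearUnusedBits() keeps VAL canonical under these masks: the padding is
  // zero-filled when _AP_S is false and sign-extended when it is true.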
-#ifdef AP_CANARY - ValType CANARY; - void check_canary() { assert(CANARY == (ValType)0xDEADBEEFDEADBEEF); } - void set_canary() { CANARY = (ValType)0xDEADBEEFDEADBEEF; } -#else - void check_canary() {} - void set_canary() {} -#endif - - INLINE ValType& get_VAL(void) { return VAL; } - INLINE ValType get_VAL(void) const { return VAL; } - INLINE ValType get_VAL(void) const volatile { return VAL; } - INLINE void set_VAL(uint64_t value) { VAL = (ValType)value; } - INLINE ValType& get_pVal(int i) { return VAL; } - INLINE ValType get_pVal(int i) const { return VAL; } - INLINE const uint64_t* get_pVal() const { - assert(0 && "invalid usage"); - return 0; - } - INLINE ValType get_pVal(int i) const volatile { return VAL; } - INLINE uint64_t* get_pVal() const volatile { - assert(0 && "invalid usage"); - return 0; - } - INLINE void set_pVal(int i, uint64_t value) { VAL = (ValType)value; } - - INLINE uint32_t getBitWidth() const { return BitWidth; } - - template - ap_private<_AP_W, _AP_S>& operator=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - ap_private<_AP_W, _AP_S>& operator=( - const volatile ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(RHS.get_VAL()); // TODO check here about ap_private - clearUnusedBits(); - return *this; - } - - void operator=(const ap_private& RHS) volatile { - // Don't do anything for X = X - VAL = RHS.get_VAL(); // No need to check because no harm done by copying. - clearUnusedBits(); - } - - ap_private& operator=(const ap_private& RHS) { - // Don't do anything for X = X - VAL = RHS.get_VAL(); // No need to check because no harm done by copying. - clearUnusedBits(); - return *this; - } - - void operator=(const volatile ap_private& RHS) volatile { - // Don't do anything for X = X - VAL = RHS.get_VAL(); // No need to check because no harm done by copying. - clearUnusedBits(); - } - - ap_private& operator=(const volatile ap_private& RHS) { - // Don't do anything for X = X - VAL = RHS.get_VAL(); // No need to check because no harm done by copying. - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - *this = ap_private<_AP_W2, false>(op2); - return *this; - } - -#define ASSIGN_OP_FROM_INT(C_TYPE) \ - INLINE ap_private& operator=(const C_TYPE v) { \ - set_canary(); \ - this->VAL = (ValType)v; \ - clearUnusedBits(); \ - check_canary(); \ - return *this; \ - } - -ASSIGN_OP_FROM_INT(bool) -ASSIGN_OP_FROM_INT(char) -ASSIGN_OP_FROM_INT(signed char) -ASSIGN_OP_FROM_INT(unsigned char) -ASSIGN_OP_FROM_INT(short) -ASSIGN_OP_FROM_INT(unsigned short) -ASSIGN_OP_FROM_INT(int) -ASSIGN_OP_FROM_INT(unsigned int) -ASSIGN_OP_FROM_INT(long) -ASSIGN_OP_FROM_INT(unsigned long) -ASSIGN_OP_FROM_INT(ap_slong) -ASSIGN_OP_FROM_INT(ap_ulong) -#if 0 -ASSIGN_OP_FROM_INT(half) -ASSIGN_OP_FROM_INT(float) -ASSIGN_OP_FROM_INT(double) -#endif -#undef ASSIGN_OP_FROM_INT - - // XXX This is a must to prevent pointer being converted to bool. - INLINE ap_private& operator=(const char* s) { - ap_private tmp(s); // XXX direct-initialization, as ctor is explicit. 
- operator=(tmp); - return *this; - } - - private: - explicit INLINE ap_private(uint64_t* val) : VAL(val[0]) { - set_canary(); - clearUnusedBits(); - check_canary(); - } - - INLINE bool isSingleWord() const { return true; } - - public: - INLINE void fromString(const char* strStart, uint32_t slen, uint8_t radix) { - bool isNeg = strStart[0] == '-'; - if (isNeg) { - strStart++; - slen--; - } - - if (strStart[0] == '0' && (strStart[1] == 'b' || strStart[1] == 'B')) { - //if(radix == 0) radix = 2; - _AP_WARNING(radix != 2, "%s seems to have base %d, but %d given.", strStart, 2, radix); - strStart += 2; - slen -=2; - } else if (strStart[0] == '0' && (strStart[1] == 'o' || strStart[1] == 'O')) { - //if (radix == 0) radix = 8; - _AP_WARNING(radix != 8, "%s seems to have base %d, but %d given.", strStart, 8, radix); - strStart += 2; - slen -=2; - } else if (strStart[0] == '0' && (strStart[1] == 'x' || strStart[1] == 'X')) { - //if (radix == 0) radix = 16; - _AP_WARNING(radix != 16, "%s seems to have base %d, but %d given.", strStart, 16, radix); - strStart += 2; - slen -=2; - } else if (strStart[0] == '0' && (strStart[1] == 'd' || strStart[1] == 'D')) { - //if (radix == 0) radix = 10; - _AP_WARNING(radix != 10, "%s seems to have base %d, but %d given.", strStart, 10, radix); - strStart += 2; - slen -=2; - } else if (radix == 0) { - //radix = 2; // XXX default value - } - - // Check our assumptions here - assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) && - "Radix should be 2, 8, 10, or 16!"); - assert(strStart && "String is null?"); - - // Clear bits. - uint64_t tmpVAL = VAL = 0; - - switch (radix) { - case 2: - // sscanf(strStart,"%b",&VAL); - // tmpVAL = *strStart =='1' ? ~0ULL : 0; - for (; *strStart; ++strStart) { - assert((*strStart == '0' || *strStart == '1') && - ("Wrong binary number")); - tmpVAL <<= 1; - tmpVAL |= (*strStart - '0'); - } - break; - case 8: -#ifdef _MSC_VER - sscanf_s(strStart, "%llo", &tmpVAL, slen + 1); -#else -#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__) - sscanf(strStart, "%lo", &tmpVAL); -#else - sscanf(strStart, "%llo", &tmpVAL); -#endif //__x86_64__ -#endif //_MSC_VER - break; - case 10: -#ifdef _MSC_VER - sscanf_s(strStart, "%llu", &tmpVAL, slen + 1); -#else -#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__) - sscanf(strStart, "%lu", &tmpVAL); -#else - sscanf(strStart, "%llu", &tmpVAL); -#endif //__x86_64__ -#endif //_MSC_VER - break; - case 16: -#ifdef _MSC_VER - sscanf_s(strStart, "%llx", &tmpVAL, slen + 1); -#else -#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__) - sscanf(strStart, "%lx", &tmpVAL); -#else - sscanf(strStart, "%llx", &tmpVAL); -#endif //__x86_64__ -#endif //_MSC_VER - break; - default: - assert(true && "Unknown radix"); - // error - } - VAL = isNeg ? 
(ValType)(-tmpVAL) : (ValType)(tmpVAL); - - clearUnusedBits(); - } - - private: - INLINE ap_private(const std::string& val, uint8_t radix = 2) : VAL(0) { - assert(!val.empty() && "String empty?"); - set_canary(); - fromString(val.c_str(), val.size(), radix); - check_canary(); - } - - INLINE ap_private(const char strStart[], uint32_t slen, uint8_t radix) - : VAL(0) { - set_canary(); - fromString(strStart, slen, radix); - check_canary(); - } - - INLINE ap_private(uint32_t numWords, const uint64_t bigVal[]) - : VAL(bigVal[0]) { - set_canary(); - clearUnusedBits(); - check_canary(); - } - - public: - INLINE ap_private() { - set_canary(); - clearUnusedBits(); - check_canary(); - } - -#define CTOR(TYPE) \ - INLINE ap_private(TYPE v) : VAL((ValType)v) { \ - set_canary(); \ - clearUnusedBits(); \ - check_canary(); \ - } - CTOR(bool) - CTOR(char) - CTOR(signed char) - CTOR(unsigned char) - CTOR(short) - CTOR(unsigned short) - CTOR(int) - CTOR(unsigned int) - CTOR(long) - CTOR(unsigned long) - CTOR(ap_slong) - CTOR(ap_ulong) -#if 0 - CTOR(half) - CTOR(float) - CTOR(double) -#endif -#undef CTOR - - template - INLINE ap_private(const ap_private<_AP_W1, _AP_S1, _AP_OPT>& that) - : VAL((ValType)that.get_VAL()) { - set_canary(); - clearUnusedBits(); - check_canary(); - } - - template - INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, _AP_OPT>& that) - : VAL((ValType)that.get_VAL()) { - set_canary(); - clearUnusedBits(); - check_canary(); - } - - explicit INLINE ap_private(const char* val) { - set_canary(); - unsigned char radix = 10; - std::string str = ap_private_ops::parseString(val, radix); // will set radix. - std::string::size_type pos = str.find('.'); - // trunc all fraction part - if (pos != std::string::npos) str = str.substr(pos); - - ap_private<_AP_W, _AP_S> ap_private_val(str, radix); - operator=(ap_private_val); - check_canary(); - } - - INLINE ap_private(const char* val, signed char rd) { - set_canary(); - unsigned char radix = rd; - std::string str = ap_private_ops::parseString(val, radix); // will set radix. - std::string::size_type pos = str.find('.'); - // trunc all fraction part - if (pos != std::string::npos) str = str.substr(pos); - - ap_private<_AP_W, _AP_S> ap_private_val(str, radix); - operator=(ap_private_val); - check_canary(); - } - - INLINE ~ap_private() { check_canary(); } - - INLINE bool isNegative() const { - static const uint64_t sign_mask = 1ULL << (_AP_W - 1); - return _AP_S && (sign_mask & VAL); - } - - INLINE bool isPositive() const { return !isNegative(); } - - INLINE bool isStrictlyPositive() const { return !isNegative() && VAL != 0; } - - INLINE bool isAllOnesValue() const { return (mask & VAL) == mask; } - - INLINE bool operator==(const ap_private<_AP_W, _AP_S>& RHS) const { - return VAL == RHS.get_VAL(); - } - INLINE bool operator==(const ap_private<_AP_W, !_AP_S>& RHS) const { - return (uint64_t)VAL == (uint64_t)RHS.get_VAL(); - } - - INLINE bool operator==(uint64_t Val) const { return ((uint64_t)VAL == Val); } - INLINE bool operator!=(uint64_t Val) const { return ((uint64_t)VAL != Val); } - INLINE bool operator!=(const ap_private<_AP_W, _AP_S>& RHS) const { - return VAL != RHS.get_VAL(); - } - INLINE bool operator!=(const ap_private<_AP_W, !_AP_S>& RHS) const { - return (uint64_t)VAL != (uint64_t)RHS.get_VAL(); - } - - /// postfix increment. - const ap_private operator++(int) { - ap_private orig(*this); - VAL++; - clearUnusedBits(); - return orig; - } - - /// prefix increment. 
- const ap_private operator++() { - ++VAL; - clearUnusedBits(); - return *this; - } - - /// postfix decrement. - const ap_private operator--(int) { - ap_private orig(*this); - --VAL; - clearUnusedBits(); - return orig; - } - - /// prefix decrement. - const ap_private operator--() { - --VAL; - clearUnusedBits(); - return *this; - } - - /// one's complement. - INLINE ap_private<_AP_W + !_AP_S, true> operator~() const { - ap_private<_AP_W + !_AP_S, true> Result(*this); - Result.flip(); - return Result; - } - - /// two's complement. - INLINE typename RType<1, false>::minus operator-() const { - return ap_private<1, false>(0) - (*this); - } - - /// logic negation. - INLINE bool operator!() const { return !VAL; } - - INLINE std::string toString(uint8_t radix, bool wantSigned) const; - INLINE std::string toStringUnsigned(uint8_t radix = 10) const { - return toString(radix, false); - } - INLINE std::string toStringSigned(uint8_t radix = 10) const { - return toString(radix, true); - } - INLINE void clear() { VAL = 0; } - INLINE ap_private& clear(uint32_t bitPosition) { - VAL &= ~(1ULL << (bitPosition)); - clearUnusedBits(); - return *this; - } - - INLINE ap_private ashr(uint32_t shiftAmt) const { - if (_AP_S) - return ap_private((shiftAmt == BitWidth) ? 0 - : ((int64_t)VAL) >> (shiftAmt)); - else - return ap_private((shiftAmt == BitWidth) ? 0 - : ((uint64_t)VAL) >> (shiftAmt)); - } - - INLINE ap_private lshr(uint32_t shiftAmt) const { - return ap_private((shiftAmt == BitWidth) - ? ap_private(0) - : ap_private((VAL & mask) >> (shiftAmt))); - } - - INLINE ap_private shl(uint32_t shiftAmt) const -// just for clang compiler -#if defined(__clang__) && !defined(__CLANG_3_1__) - __attribute__((no_sanitize("undefined"))) -#endif - { - if (shiftAmt > BitWidth) { - if (!isNegative()) - return ap_private(0); - else - return ap_private(-1); - } - if (shiftAmt == BitWidth) - return ap_private(0); - else - return ap_private((VAL) << (shiftAmt)); - // return ap_private((shiftAmt == BitWidth) ? 
ap_private(0ULL) : - // ap_private(VAL << shiftAmt)); - } - - INLINE int64_t getSExtValue() const { return VAL; } - - // XXX XXX this function is used in CBE - INLINE uint64_t getZExtValue() const { return VAL & mask; } - - template - INLINE ap_private(const _private_range_ref<_AP_W2, _AP_S2>& ref) { - set_canary(); - *this = ref.get(); - check_canary(); - } - - template - INLINE ap_private(const _private_bit_ref<_AP_W2, _AP_S2>& ref) { - set_canary(); - *this = ((uint64_t)(bool)ref); - check_canary(); - } - -// template -// INLINE ap_private(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { -// set_canary(); -// *this = ref.get(); -// check_canary(); -// } -// -// template -// INLINE ap_private( -// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { -// set_canary(); -// *this = ((val.operator ap_private<_AP_W2, false>())); -// check_canary(); -// } -// -// template -// INLINE ap_private( -// const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { -// set_canary(); -// *this = (uint64_t)(bool)val; -// check_canary(); -// } - - INLINE void write(const ap_private<_AP_W, _AP_S>& op2) volatile { - *this = (op2); - } - - // Explicit conversions to C interger types - //----------------------------------------------------------- - INLINE operator ValType() const { return get_VAL(); } - - INLINE int to_uchar() const { return (unsigned char)get_VAL(); } - - INLINE int to_char() const { return (signed char)get_VAL(); } - - INLINE int to_ushort() const { return (unsigned short)get_VAL(); } - - INLINE int to_short() const { return (short)get_VAL(); } - - INLINE int to_int() const { - // ap_private<64 /* _AP_W */, _AP_S> res(V); - return (int)get_VAL(); - } - - INLINE unsigned to_uint() const { return (unsigned)get_VAL(); } - - INLINE long to_long() const { return (long)get_VAL(); } - - INLINE unsigned long to_ulong() const { return (unsigned long)get_VAL(); } - - INLINE ap_slong to_int64() const { return (ap_slong)get_VAL(); } - - INLINE ap_ulong to_uint64() const { return (ap_ulong)get_VAL(); } - - INLINE double to_double() const { - if (isNegative()) - return roundToDouble(true); - else - return roundToDouble(false); - } - - INLINE unsigned length() const { return _AP_W; } - - INLINE bool isMinValue() const { return VAL == 0; } - template - INLINE ap_private& operator&=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(((uint64_t)VAL) & RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator|=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(((uint64_t)VAL) | RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator^=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(((uint64_t)VAL) ^ RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator*=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(((uint64_t)VAL) * RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator+=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(((uint64_t)VAL) + RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator-=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(((uint64_t)VAL) - RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - INLINE typename RType<_AP_W1, _AP_S1>::logic operator&( - const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { - 
typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) & - RHS.get_VAL()); - return Ret; - } else { - typename RType<_AP_W1, _AP_S1>::logic Ret = *this; - return Ret & RHS; - } - } - - template - INLINE typename RType<_AP_W1, _AP_S1>::logic operator^( - const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { - typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) ^ - RHS.get_VAL()); - return Ret; - } else { - typename RType<_AP_W1, _AP_S1>::logic Ret = *this; - return Ret ^ RHS; - } - } - - template - INLINE typename RType<_AP_W1, _AP_S1>::logic operator|( - const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { - typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) | - RHS.get_VAL()); - return Ret; - } else { - typename RType<_AP_W1, _AP_S1>::logic Ret = *this; - return Ret | RHS; - } - } - - INLINE ap_private And(const ap_private& RHS) const { - return ap_private(VAL & RHS.get_VAL()); - } - - INLINE ap_private Or(const ap_private& RHS) const { - return ap_private(VAL | RHS.get_VAL()); - } - - INLINE ap_private Xor(const ap_private& RHS) const { - return ap_private(VAL ^ RHS.get_VAL()); - } -#if 1 - template - INLINE typename RType<_AP_W1, _AP_S1>::mult operator*( - const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (RType<_AP_W1, _AP_S1>::mult_w <= 64) { - typename RType<_AP_W1, _AP_S1>::mult Result(((uint64_t)VAL) * - RHS.get_VAL()); - return Result; - } else { - typename RType<_AP_W1, _AP_S1>::mult Result(*this); - Result *= RHS; - return Result; - } - } -#endif - INLINE ap_private Mul(const ap_private& RHS) const { - return ap_private(VAL * RHS.get_VAL()); - } - - INLINE ap_private Add(const ap_private& RHS) const { - return ap_private(VAL + RHS.get_VAL()); - } - - INLINE ap_private Sub(const ap_private& RHS) const { - return ap_private(VAL - RHS.get_VAL()); - } - - INLINE ap_private& operator&=(uint64_t RHS) { - VAL &= (ValType)RHS; - clearUnusedBits(); - return *this; - } - INLINE ap_private& operator|=(uint64_t RHS) { - VAL |= (ValType)RHS; - clearUnusedBits(); - return *this; - } - INLINE ap_private& operator^=(uint64_t RHS) { - VAL ^= (ValType)RHS; - clearUnusedBits(); - return *this; - } - INLINE ap_private& operator*=(uint64_t RHS) { - VAL *= (ValType)RHS; - clearUnusedBits(); - return *this; - } - INLINE ap_private& operator+=(uint64_t RHS) { - VAL += (ValType)RHS; - clearUnusedBits(); - return *this; - } - INLINE ap_private& operator-=(uint64_t RHS) { - VAL -= (ValType)RHS; - clearUnusedBits(); - return *this; - } - - INLINE bool isMinSignedValue() const { - static const uint64_t min_mask = ~(~0ULL << (_AP_W - 1)); - return BitWidth == 1 ? VAL == 1 - : (ap_private_ops::isNegative<_AP_W>(*this) && - ((min_mask & VAL) == 0)); - } - - template - INLINE typename RType<_AP_W1, _AP_S1>::plus operator+( - const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (RType<_AP_W1, _AP_S1>::plus_w <= 64) - return typename RType<_AP_W1, _AP_S1>::plus( - RType<_AP_W1, _AP_S1>::plus_s - ? 
int64_t(((uint64_t)VAL) + RHS.get_VAL()) - : uint64_t(((uint64_t)VAL) + RHS.get_VAL())); - typename RType<_AP_W1, _AP_S1>::plus Result = RHS; - Result += VAL; - return Result; - } - - template - INLINE typename RType<_AP_W1, _AP_S1>::minus operator-( - const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (RType<_AP_W1, _AP_S1>::minus_w <= 64) - return typename RType<_AP_W1, _AP_S1>::minus( - int64_t(((uint64_t)VAL) - RHS.get_VAL())); - typename RType<_AP_W1, _AP_S1>::minus Result = *this; - Result -= RHS; - return Result; - } - - INLINE uint32_t countPopulation() const { - return ap_private_ops::CountPopulation_64(VAL); - } - INLINE uint32_t countLeadingZeros() const { - int remainder = BitWidth % 64; - int excessBits = (64 - remainder) % 64; - uint32_t Count = ap_private_ops::CountLeadingZeros_64(VAL); - if (Count) Count -= excessBits; - return AESL_std::min(Count, (uint32_t)_AP_W); - } - - /// HiBits - This function returns the high "numBits" bits of this ap_private. - INLINE ap_private<_AP_W, _AP_S> getHiBits(uint32_t numBits) const { - ap_private<_AP_W, _AP_S> ret(*this); - ret = (ret) >> (BitWidth - numBits); - return ret; - } - - /// LoBits - This function returns the low "numBits" bits of this ap_private. - INLINE ap_private<_AP_W, _AP_S> getLoBits(uint32_t numBits) const { - ap_private<_AP_W, _AP_S> ret(((uint64_t)VAL) << (BitWidth - numBits)); - ret = (ret) >> (BitWidth - numBits); - return ret; - // return ap_private(numBits, (VAL << (BitWidth - numBits))>> (BitWidth - - // numBits)); - } - - INLINE ap_private<_AP_W, _AP_S>& set(uint32_t bitPosition) { - VAL |= (1ULL << (bitPosition)); - clearUnusedBits(); - return *this; // clearUnusedBits(); - } - - INLINE void set() { - VAL = (ValType)~0ULL; - clearUnusedBits(); - } - - template - INLINE void set(const ap_private<_AP_W3, false>& val) { - operator=(ap_private<_AP_W3, _AP_S>(val)); - } - - INLINE void set(const ap_private& val) { operator=(val); } - - INLINE void clearUnusedBits(void) volatile -// just for clang compiler -#if defined(__clang__) && !defined(__CLANG_3_1__) - __attribute__((no_sanitize("undefined"))) -#endif - { - enum { excess_bits = (_AP_W % 64) ? 64 - _AP_W % 64 : 0 }; - VAL = (ValType)( - _AP_S - ? ((((int64_t)VAL) << (excess_bits)) >> (excess_bits)) - : (excess_bits ? (((uint64_t)VAL) << (excess_bits)) >> (excess_bits) - : (uint64_t)VAL)); - } - - INLINE void clearUnusedBitsToZero(void) { - enum { excess_bits = (_AP_W % 64) ? 64 - _AP_W % 64 : 0 }; - static uint64_t mask = ~0ULL >> (excess_bits); - VAL &= mask; - } - - INLINE ap_private udiv(const ap_private& RHS) const { - return ap_private((uint64_t)VAL / RHS.get_VAL()); - } - - /// Signed divide this ap_private by ap_private RHS. - /// @brief Signed division function for ap_private. - INLINE ap_private sdiv(const ap_private& RHS) const { - if (isNegative()) - if (RHS.isNegative()) - return ((uint64_t)(0 - (*this))) / (uint64_t)(0 - RHS); - else - return 0 - ((uint64_t)(0 - (*this)) / (uint64_t)(RHS)); - else if (RHS.isNegative()) - return 0 - (this->udiv((ap_private)(0 - RHS))); - return this->udiv(RHS); - } - - template - INLINE ap_private urem(const ap_private<_AP_W, _AP_S2>& RHS) const { - assert(RHS.get_VAL() != 0 && "Divide by 0"); - return ap_private(((uint64_t)VAL) % ((uint64_t)RHS.get_VAL())); - } - - /// Signed remainder operation on ap_private. - /// @brief Function for signed remainder operation. 
- template - INLINE ap_private srem(const ap_private<_AP_W, _AP_S2>& RHS) const { - if (isNegative()) { - ap_private lhs = 0 - (*this); - if (RHS.isNegative()) { - ap_private rhs = 0 - RHS; - return 0 - (lhs.urem(rhs)); - } else - return 0 - (lhs.urem(RHS)); - } else if (RHS.isNegative()) { - ap_private rhs = 0 - RHS; - return this->urem(rhs); - } - return this->urem(RHS); - } - - template - INLINE bool eq(const ap_private<_AP_W1, _AP_S1>& RHS) const { - return (*this) == RHS; - } - - template - INLINE bool ne(const ap_private<_AP_W1, _AP_S1>& RHS) const { - return !((*this) == RHS); - } - - /// Regards both *this and RHS as unsigned quantities and compares them for - /// the validity of the less-than relationship. - /// @returns true if *this < RHS when both are considered unsigned. - /// @brief Unsigned less than comparison - template - INLINE bool ult(const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (_AP_W1 <= 64) { - uint64_t lhsZext = ((uint64_t(VAL)) << (64 - _AP_W)) >> (64 - _AP_W); - uint64_t rhsZext = - ((uint64_t(RHS.get_VAL())) << (64 - _AP_W1)) >> (64 - _AP_W1); - return lhsZext < rhsZext; - } else - return RHS.uge(*this); - } - - /// Regards both *this and RHS as signed quantities and compares them for - /// validity of the less-than relationship. - /// @returns true if *this < RHS when both are considered signed. - /// @brief Signed less than comparison - template - INLINE bool slt(const ap_private<_AP_W1, _AP_S1>& RHS) const -// just for clang compiler -#if defined(__clang__) && !defined(__CLANG_3_1__) - __attribute__((no_sanitize("undefined"))) -#endif - { - if (_AP_W1 <= 64) { - int64_t lhsSext = ((int64_t(VAL)) << (64 - _AP_W)) >> (64 - _AP_W); - int64_t rhsSext = - ((int64_t(RHS.get_VAL())) << (64 - _AP_W1)) >> (64 - _AP_W1); - return lhsSext < rhsSext; - } else - return RHS.sge(*this); - } - - /// Regards both *this and RHS as unsigned quantities and compares them for - /// validity of the less-or-equal relationship. - /// @returns true if *this <= RHS when both are considered unsigned. - /// @brief Unsigned less or equal comparison - template - INLINE bool ule(const ap_private<_AP_W1, _AP_S1>& RHS) const { - return ult(RHS) || eq(RHS); - } - - /// Regards both *this and RHS as signed quantities and compares them for - /// validity of the less-or-equal relationship. - /// @returns true if *this <= RHS when both are considered signed. - /// @brief Signed less or equal comparison - template - INLINE bool sle(const ap_private<_AP_W1, _AP_S1>& RHS) const { - return slt(RHS) || eq(RHS); - } - - /// Regards both *this and RHS as unsigned quantities and compares them for - /// the validity of the greater-than relationship. - /// @returns true if *this > RHS when both are considered unsigned. - /// @brief Unsigned greather than comparison - template - INLINE bool ugt(const ap_private<_AP_W1, _AP_S1>& RHS) const { - return !ult(RHS) && !eq(RHS); - } - - /// Regards both *this and RHS as signed quantities and compares them for - /// the validity of the greater-than relationship. - /// @returns true if *this > RHS when both are considered signed. - /// @brief Signed greather than comparison - template - INLINE bool sgt(const ap_private<_AP_W1, _AP_S1>& RHS) const { - return !slt(RHS) && !eq(RHS); - } - - /// Regards both *this and RHS as unsigned quantities and compares them for - /// validity of the greater-or-equal relationship. - /// @returns true if *this >= RHS when both are considered unsigned. 
- /// @brief Unsigned greater or equal comparison - template - INLINE bool uge(const ap_private<_AP_W1, _AP_S1>& RHS) const { - return !ult(RHS); - } - - /// Regards both *this and RHS as signed quantities and compares them for - /// validity of the greater-or-equal relationship. - /// @returns true if *this >= RHS when both are considered signed. - /// @brief Signed greather or equal comparison - template - INLINE bool sge(const ap_private<_AP_W1, _AP_S1>& RHS) const { - return !slt(RHS); - } - - INLINE ap_private abs() const { - if (isNegative()) return -(*this); - return *this; - } - - INLINE ap_private<_AP_W, false> get() const { - ap_private<_AP_W, false> ret(*this); - return ret; - } - - INLINE static uint32_t getBitsNeeded(const char* str, uint32_t slen, - uint8_t radix) { - return _AP_W; - } - - INLINE uint32_t getActiveBits() const { - uint32_t bits = _AP_W - countLeadingZeros(); - return bits ? bits : 1; - } - - INLINE double roundToDouble(bool isSigned = false) const { - return isSigned ? double((int64_t)VAL) : double((uint64_t)VAL); - } - - /*Reverse the contents of ap_private instance. I.e. LSB becomes MSB and vise - * versa*/ - INLINE ap_private& reverse() { - for (int i = 0; i < _AP_W / 2; ++i) { - bool tmp = operator[](i); - if (operator[](_AP_W - 1 - i)) - set(i); - else - clear(i); - if (tmp) - set(_AP_W - 1 - i); - else - clear(_AP_W - 1 - i); - } - clearUnusedBits(); - return *this; - } - - /*Return true if the value of ap_private instance is zero*/ - INLINE bool iszero() const { return isMinValue(); } - - INLINE bool to_bool() const { return !iszero(); } - - /* x < 0 */ - INLINE bool sign() const { - if (isNegative()) return true; - return false; - } - - /* x[i] = !x[i] */ - INLINE void invert(int i) { - assert(i >= 0 && "Attempting to read bit with negative index"); - assert(i < _AP_W && "Attempting to read bit beyond MSB"); - flip(i); - } - - /* x[i] */ - INLINE bool test(int i) const { - assert(i >= 0 && "Attempting to read bit with negative index"); - assert(i < _AP_W && "Attempting to read bit beyond MSB"); - return operator[](i); - } - - // This is used for sc_lv and sc_bv, which is implemented by sc_uint - // Rotate an ap_private object n places to the left - INLINE void lrotate(int n) { - assert(n >= 0 && "Attempting to shift negative index"); - assert(n < _AP_W && "Shift value larger than bit width"); - operator=(shl(n) | lshr(_AP_W - n)); - } - - // This is used for sc_lv and sc_bv, which is implemented by sc_uint - // Rotate an ap_private object n places to the right - INLINE void rrotate(int n) { - assert(n >= 0 && "Attempting to shift negative index"); - assert(n < _AP_W && "Shift value larger than bit width"); - operator=(lshr(n) | shl(_AP_W - n)); - } - - // Set the ith bit into v - INLINE void set(int i, bool v) { - assert(i >= 0 && "Attempting to write bit with negative index"); - assert(i < _AP_W && "Attempting to write bit beyond MSB"); - v ? set(i) : clear(i); - } - - // Set the ith bit into v - INLINE void set_bit(int i, bool v) { - assert(i >= 0 && "Attempting to write bit with negative index"); - assert(i < _AP_W && "Attempting to write bit beyond MSB"); - v ? set(i) : clear(i); - } - - // Get the value of ith bit - INLINE bool get_bit(int i) const { - assert(i >= 0 && "Attempting to read bit with negative index"); - assert(i < _AP_W && "Attempting to read bit beyond MSB"); - return (((1ULL << i) & VAL) != 0); - } - - /// Toggle all bits. 
- INLINE ap_private& flip() { - VAL = (ValType)((~0ULL ^ VAL) & mask); - clearUnusedBits(); - return *this; - } - - /// Toggles a given bit to its opposite value. - INLINE ap_private& flip(uint32_t bitPosition) { - assert(bitPosition < BitWidth && "Out of the bit-width range!"); - set_bit(bitPosition, !get_bit(bitPosition)); - return *this; - } - - // complements every bit - INLINE void b_not() { flip(); } - -// Binary Arithmetic -//----------------------------------------------------------- -#define OP_BIN_AP(Sym, Rty, Fun) \ - template \ - INLINE typename RType<_AP_W2, _AP_S2>::Rty operator Sym( \ - const ap_private<_AP_W2, _AP_S2>& op) const { \ - typename RType<_AP_W2, _AP_S2>::Rty lhs(*this); \ - typename RType<_AP_W2, _AP_S2>::Rty rhs(op); \ - return lhs.Fun(rhs); \ - } - -/// Bitwise and, or, xor -// OP_BIN_AP(&,logic, And) -// OP_BIN_AP(|,logic, Or) -// OP_BIN_AP(^,logic, Xor) -#undef OP_BIN_AP - - template - INLINE typename RType<_AP_W2, _AP_S2>::div operator/( - const ap_private<_AP_W2, _AP_S2>& op) const { - ap_private _AP_W2 ? _AP_S - : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> - lhs = *this; - ap_private _AP_W2 ? _AP_S - : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> - rhs = op; - return typename RType<_AP_W2, _AP_S2>::div( - (_AP_S || _AP_S2) ? lhs.sdiv(rhs) : lhs.udiv(rhs)); - } - - template - INLINE typename RType<_AP_W2, _AP_S2>::mod operator%( - const ap_private<_AP_W2, _AP_S2>& op) const { - ap_private _AP_W2 ? _AP_S - : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> - lhs = *this; - ap_private _AP_W2 ? _AP_S - : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> - rhs = op; - typename RType<_AP_W2, _AP_S2>::mod res = - typename RType<_AP_W2, _AP_S2>::mod(_AP_S ? lhs.srem(rhs) - : lhs.urem(rhs)); - return res; - } - -#define OP_ASSIGN_AP_2(Sym) \ - template \ - INLINE ap_private<_AP_W, _AP_S>& operator Sym##=( \ - const ap_private<_AP_W2, _AP_S2>& op) { \ - *this = operator Sym(op); \ - return *this; \ - } - - OP_ASSIGN_AP_2(/) - OP_ASSIGN_AP_2(%) -#undef OP_ASSIGN_AP_2 - -/// Bitwise assign: and, or, xor -//------------------------------------------------------------- -// OP_ASSIGN_AP(&) -// OP_ASSIGN_AP(^) -// OP_ASSIGN_AP(|) - -#define OP_LEFT_SHIFT_CTYPE(TYPE, SIGNED) \ - INLINE ap_private operator<<(const TYPE op) const { \ - if (op >= _AP_W) return ap_private(0); \ - if (SIGNED && op < 0) return *this >> (0 - op); \ - return shl(op); \ - } - - // OP_LEFT_SHIFT_CTYPE(bool, false) - OP_LEFT_SHIFT_CTYPE(char, CHAR_IS_SIGNED) - OP_LEFT_SHIFT_CTYPE(signed char, true) - OP_LEFT_SHIFT_CTYPE(unsigned char, false) - OP_LEFT_SHIFT_CTYPE(short, true) - OP_LEFT_SHIFT_CTYPE(unsigned short, false) - OP_LEFT_SHIFT_CTYPE(int, true) - OP_LEFT_SHIFT_CTYPE(unsigned int, false) - OP_LEFT_SHIFT_CTYPE(long, true) - OP_LEFT_SHIFT_CTYPE(unsigned long, false) - OP_LEFT_SHIFT_CTYPE(long long, true) - OP_LEFT_SHIFT_CTYPE(unsigned long long, false) -#if 0 - OP_LEFT_SHIFT_CTYPE(half, false) - OP_LEFT_SHIFT_CTYPE(float, false) - OP_LEFT_SHIFT_CTYPE(double, false) -#endif - -#undef OP_LEFT_SHIFT_CTYPE - - template - INLINE ap_private operator<<(const ap_private<_AP_W2, _AP_S2>& op2) const { - if (_AP_S2 == false) { - uint32_t sh = op2.to_uint(); - return *this << sh; - } else { - int sh = op2.to_int(); - return *this << sh; - } - } - -#define OP_RIGHT_SHIFT_CTYPE(TYPE, SIGNED) \ - INLINE ap_private operator>>(const TYPE op) const { \ - if (op >= _AP_W) { \ - if (isNegative()) \ - return ap_private(-1); \ - else \ - return ap_private(0); \ - } \ - if ((SIGNED) && op < 0) return *this << (0 - op); 
\ - if (_AP_S) \ - return ashr(op); \ - else \ - return lshr(op); \ - } - - // OP_RIGHT_SHIFT_CTYPE(bool, false) - OP_RIGHT_SHIFT_CTYPE(char, CHAR_IS_SIGNED) - OP_RIGHT_SHIFT_CTYPE(signed char, true) - OP_RIGHT_SHIFT_CTYPE(unsigned char, false) - OP_RIGHT_SHIFT_CTYPE(short, true) - OP_RIGHT_SHIFT_CTYPE(unsigned short, false) - OP_RIGHT_SHIFT_CTYPE(int, true) - OP_RIGHT_SHIFT_CTYPE(unsigned int, false) - OP_RIGHT_SHIFT_CTYPE(long, true) - OP_RIGHT_SHIFT_CTYPE(unsigned long, false) - OP_RIGHT_SHIFT_CTYPE(unsigned long long, false) - OP_RIGHT_SHIFT_CTYPE(long long, true) -#if 0 - OP_RIGHT_SHIFT_CTYPE(half, false) - OP_RIGHT_SHIFT_CTYPE(float, false) - OP_RIGHT_SHIFT_CTYPE(double, false) -#endif - -#undef OP_RIGHT_SHIFT_CTYPE - - template - INLINE ap_private operator>>(const ap_private<_AP_W2, _AP_S2>& op2) const { - if (_AP_S2 == false) { - uint32_t sh = op2.to_uint(); - return *this >> sh; - } else { - int sh = op2.to_int(); - return *this >> sh; - } - } - - /// Shift assign - //----------------------------------------------------------------- - - //INLINE const ap_private& operator<<=(uint32_t shiftAmt) { - // VAL <<= shiftAmt; - // clearUnusedBits(); - // return *this; - //} - -#define OP_ASSIGN_AP(Sym) \ - template \ - INLINE ap_private& operator Sym##=(int op) { \ - *this = operator Sym(op); \ - clearUnusedBits(); \ - return *this; \ - } \ - INLINE ap_private& operator Sym##=(unsigned int op) { \ - *this = operator Sym(op); \ - clearUnusedBits(); \ - return *this; \ - } \ - template \ - INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \ - *this = operator Sym(op); \ - clearUnusedBits(); \ - return *this; \ - } - - OP_ASSIGN_AP(>>) - OP_ASSIGN_AP(<<) -#undef OP_ASSIGN_AP - - /// Comparisons - //----------------------------------------------------------------- - template - INLINE bool operator==(const ap_private<_AP_W1, _AP_S1>& op) const { - enum { _AP_MAX_W = AP_MAX(AP_MAX(_AP_W, _AP_W1), 32) }; - ap_private<_AP_MAX_W, false> lhs(*this); - ap_private<_AP_MAX_W, false> rhs(op); - if (_AP_MAX_W <= 64) { - return (uint64_t)lhs.get_VAL() == (uint64_t)rhs.get_VAL(); - } else - return lhs == rhs; - } - - template - INLINE bool operator!=(const ap_private<_AP_W2, _AP_S2>& op) const { - return !(*this == op); - } - - template - INLINE bool operator>(const ap_private<_AP_W2, _AP_S2>& op) const { - enum { - _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) - }; - ap_private<_AP_MAX_W, _AP_S> lhs(*this); - ap_private<_AP_MAX_W, _AP_S2> rhs(op); - // this will follow gcc rule for comparison - // between different bitwidth and signness - if (_AP_S == _AP_S2) - return _AP_S ? lhs.sgt(rhs) : lhs.ugt(rhs); - else if (_AP_W < 32 && _AP_W2 < 32) - // different signness but both bitwidth is less than 32 - return lhs.sgt(rhs); - else - // different signness but bigger bitwidth - // is greater or equal to 32 - if (_AP_S) - if (_AP_W2 >= _AP_W) - return lhs.ugt(rhs); - else - return lhs.sgt(rhs); - else if (_AP_W >= _AP_W2) - return lhs.ugt(rhs); - else - return lhs.sgt(rhs); - } - - template - INLINE bool operator<=(const ap_private<_AP_W2, _AP_S2>& op) const { - return !(*this > op); - } - - template - INLINE bool operator<(const ap_private<_AP_W2, _AP_S2>& op) const { - enum { - _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) - }; - ap_private<_AP_MAX_W, _AP_S> lhs(*this); - ap_private<_AP_MAX_W, _AP_S2> rhs(op); - if (_AP_S == _AP_S2) - return _AP_S ? 
lhs.slt(rhs) : lhs.ult(rhs); - else if (_AP_W < 32 && _AP_W2 < 32) - return lhs.slt(rhs); - else if (_AP_S) - if (_AP_W2 >= _AP_W) - return lhs.ult(rhs); - else - return lhs.slt(rhs); - else if (_AP_W >= _AP_W2) - return lhs.ult(rhs); - else - return lhs.slt(rhs); - } - - template - INLINE bool operator>=(const ap_private<_AP_W2, _AP_S2>& op) const { - return !(*this < op); - } - - /// Bit and Part Select - //-------------------------------------------------------------- - // FIXME now _private_range_ref refs to _AP_ROOT_TYPE(struct ssdm_int). - INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { - return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); - } - - INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { - return _private_range_ref<_AP_W, _AP_S>( - const_cast*>(this), Hi, Lo); - } - - INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { - return _private_range_ref<_AP_W, _AP_S>( - (const_cast*>(this)), Hi, Lo); - } - - INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { - return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); - } - - INLINE _private_bit_ref<_AP_W, _AP_S> operator[](int index) { - return _private_bit_ref<_AP_W, _AP_S>(*this, index); - } - - template - INLINE _private_bit_ref<_AP_W, _AP_S> operator[]( - const ap_private<_AP_W2, _AP_S2>& index) { - return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); - } - - INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](int index) const { - return _private_bit_ref<_AP_W, _AP_S>( - const_cast&>(*this), index); - } - - template - INLINE const _private_bit_ref<_AP_W, _AP_S> operator[]( - const ap_private<_AP_W2, _AP_S2>& index) const { - return _private_bit_ref<_AP_W, _AP_S>( - const_cast&>(*this), index.to_int()); - } - - INLINE _private_bit_ref<_AP_W, _AP_S> bit(int index) { - return _private_bit_ref<_AP_W, _AP_S>(*this, index); - } - - template - INLINE _private_bit_ref<_AP_W, _AP_S> bit(const ap_private<_AP_W2, _AP_S2>& index) { - return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); - } - - INLINE const _private_bit_ref<_AP_W, _AP_S> bit(int index) const { - return _private_bit_ref<_AP_W, _AP_S>( - const_cast&>(*this), index); - } - - template - INLINE const _private_bit_ref<_AP_W, _AP_S> bit( - const ap_private<_AP_W2, _AP_S2>& index) const { - return _private_bit_ref<_AP_W, _AP_S>( - const_cast&>(*this), index.to_int()); - } - -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// ap_private<_AP_W2, _AP_S2> > -// concat(const ap_private<_AP_W2, _AP_S2>& a2) const { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >( -// const_cast&>(*this), -// const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// ap_private<_AP_W2, _AP_S2> > -// concat(ap_private<_AP_W2, _AP_S2>& a2) { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > -// operator,(const ap_private<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<_AP_W, ap_private, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >( -// const_cast&>(*this), -// const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > -// operator,(const ap_private<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, ap_private, _AP_W2, -// ap_private<_AP_W2, _AP_S2> 
>( -// *this, const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > -// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<_AP_W, ap_private, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >( -// const_cast&>(*this), a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > -// operator,(ap_private<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, ap_private, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> > -// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> >( -// const_cast&>(*this), -// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> > -// operator,(_private_range_ref<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, -// _private_bit_ref<_AP_W2, _AP_S2> > -// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, -// _private_bit_ref<_AP_W2, _AP_S2> >( -// const_cast&>(*this), -// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, -// _private_bit_ref<_AP_W2, _AP_S2> > -// operator,(_private_bit_ref<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, -// _private_bit_ref<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > -// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( -// const_cast&>(*this), -// const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > -// operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, -// a2); -// } -// -// template -// INLINE ap_concat_ref< -// _AP_W, ap_private, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> -// &a2) const { -// return ap_concat_ref< -// _AP_W, ap_private, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( -// const_cast&>(*this), -// const_cast< -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref< -// _AP_W, ap_private, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { -// return ap_concat_ref< -// _AP_W, ap_private, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, -// a2); -// } -// -// template -// 
INLINE
-//     ap_concat_ref<_AP_W, ap_private, 1,
-//                   af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
-//     operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>
-//                   &a2) const {
-//    return ap_concat_ref<
-//        _AP_W, ap_private, 1,
-//        af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(
-//        const_cast<ap_private<_AP_W, _AP_S>&>(*this),
-//        const_cast<af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(
-//            a2));
-//  }
-//
-//  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
-//            ap_o_mode _AP_O2, int _AP_N2>
-//  INLINE
-//  ap_concat_ref<_AP_W, ap_private, 1,
-//                af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
-//  operator,(
-//      af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) {
-//    return ap_concat_ref<
-//        _AP_W, ap_private, 1,
-//        af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2);
-//  }
-//
-//  template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
-//  INLINE ap_private operator&(
-//      const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
-//    return *this & a2.get();
-//  }
-//
-//  template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
-//  INLINE ap_private operator|(
-//      const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
-//    return *this | a2.get();
-//  }
-//
-//  template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
-//  INLINE ap_private operator^(
-//      const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
-//    return *this ^ a2.get();
-//  }
-
-  // Reduce operation
-  //-----------------------------------------------------------
-  INLINE bool and_reduce() const { return (VAL & mask) == mask; }
-
-  INLINE bool nand_reduce() const { return (VAL & mask) != mask; }
-
-  INLINE bool or_reduce() const { return (bool)VAL; }
-
-  INLINE bool nor_reduce() const { return VAL == 0; }
-
-  INLINE bool xor_reduce() const {
-    unsigned int i = countPopulation();
-    return (i % 2) ? true : false;
-  }
-
-  INLINE bool xnor_reduce() const {
-    unsigned int i = countPopulation();
-    return (i % 2) ? false : true;
-  }
-
-  INLINE std::string to_string(uint8_t radix = 2, bool sign = false) const {
-    return toString(radix, radix == 10 ? _AP_S : sign);
-  }
-}; // End of class ap_private <_AP_W, _AP_S, true>
-
-template <int _AP_W, bool _AP_S>
-std::string ap_private<_AP_W, _AP_S, true>::toString(uint8_t radix,
-                                                     bool wantSigned) const {
-  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
-         "Radix should be 2, 8, 10, or 16!");
-  static const char* digits[] = {"0", "1", "2", "3", "4", "5", "6", "7",
-                                 "8", "9", "a", "b", "c", "d", "e", "f"};
-  std::string result;
-  if (radix != 10) {
-    // For the 2, 8 and 16 bit cases, we can just shift instead of divide
-    // because the number of bits per digit (1, 3 and 4 respectively) divides
-    // equally. We just shift until the value is zero.
-
-    // First, check for a zero value and just short circuit the logic below.
-    if (*this == (uint64_t)(0)) {
-      // Always generate a radix indicator because fixed-point
-      // formats require it.
-      switch (radix) {
-        case 2:
-          result = "0b0";
-          break;
-        case 8:
-          result = "0o0";
-          break;
-        case 16:
-          result = "0x0";
-          break;
-        default:
-          assert("invalid radix" && 0);
-      }
-    } else {
-      ap_private<_AP_W, false, true> tmp(*this);
-      size_t insert_at = 0;
-      bool leading_zero = true;
-      if (wantSigned && isNegative()) {
-        // They want to print the signed version and it is a negative value
-        // Flip the bits and add one to turn it into the equivalent positive
-        // value and put a '-' in the result.
-        tmp.flip();
-        tmp++;
-        result = "-";
-        insert_at = 1;
-        leading_zero = false;
-      }
-      switch (radix) {
-        case 2:
-          result += "0b";
-          break;
-        case 8:
-          result += "0o";
-          break;
-        case 16:
-          result += "0x";
-          break;
-        default:
-          assert("invalid radix" && 0);
-      }
-      insert_at += 2;
-
-      // Just shift tmp right for each digit width until it becomes zero
-      uint32_t shift = (radix == 16 ? 4 : (radix == 8 ? 3 : 1));
-      uint64_t mask = radix - 1;
-      ap_private<_AP_W, false, true> zero(0);
-      unsigned bits = 0;
-      bool msb = false;
-      while (tmp.ne(zero)) {
-        unsigned digit = (unsigned)(tmp.get_VAL() & mask);
-        result.insert(insert_at, digits[digit]);
-        tmp = tmp.lshr(shift);
-        bits++;
-        msb = (digit >> (shift - 1)) == 1;
-      }
-      bits *= shift;
-      if (bits < _AP_W && leading_zero && msb)
-        result.insert(insert_at, digits[0]);
-    }
-    return result;
-  }
-
-  ap_private<_AP_W, false, true> tmp(*this);
-  ap_private<6, false, true> divisor(radix);
-  ap_private<_AP_W, _AP_S, true> zero(0);
-  size_t insert_at = 0;
-  if (wantSigned && isNegative()) {
-    // They want to print the signed version and it is a negative value
-    // Flip the bits and add one to turn it into the equivalent positive
-    // value and put a '-' in the result.
-    tmp.flip();
-    tmp++;
-    result = "-";
-    insert_at = 1;
-  }
-  if (tmp == ap_private<_AP_W, false, true>(0ULL))
-    result = "0";
-  else
-    while (tmp.ne(zero)) {
-      ap_private<_AP_W, false, true> APdigit = tmp % divisor;
-      ap_private<_AP_W, false, true> tmp2 = tmp / divisor;
-      uint32_t digit = (uint32_t)(APdigit.getZExtValue());
-      assert(digit < radix && "divide failed");
-      result.insert(insert_at, digits[digit]);
-      tmp = tmp2;
-    }
-  return result;
-
-} // End of ap_private<_AP_W, _AP_S, true>::toString()
-
-// bitwidth > 64
-template <int _AP_W, bool _AP_S>
-class ap_private<_AP_W, _AP_S, false> {
-  // SFINAE pattern. Only consider this class when _AP_W > 64
-  const static bool valid = ap_private_enable_if<(_AP_W > 64)>::isValid;
-
-#ifdef _MSC_VER
-#pragma warning(disable : 4521 4522)
-#endif
- public:
-  enum { BitWidth = _AP_W, _AP_N = (_AP_W + 63) / 64 };
-  static const int width = _AP_W;
-
- private:
-  /// This constructor is used only internally for speed of construction of
-  /// temporaries. It is unsafe for general use so it is not public.
-
-  /* Constructors */
-  /// Note that numWords can be smaller or larger than the corresponding bit
-  /// width but any extraneous bits will be dropped.
-  /// @param numWords the number of words in bigVal
-  /// @param bigVal a sequence of words to form the initial value of the
-  /// ap_private
-  /// @brief Construct an ap_private, initialized as bigVal[].
-  INLINE ap_private(uint32_t numWords, const uint64_t bigVal[]) {
-    set_canary();
-    assert(bigVal && "Null pointer detected!");
-    {
-      // Get memory, cleared to 0
-      memset(pVal, 0, _AP_N * sizeof(uint64_t));
-
-      // Calculate the number of words to copy
-      uint32_t words = AESL_std::min(numWords, _AP_N);
-      // Copy the words from bigVal to pVal
-      memcpy(pVal, bigVal, words * APINT_WORD_SIZE);
-      if (words >= _AP_W) clearUnusedBits();
-      // Make sure unused high bits are cleared
-    }
-    check_canary();
-  }
-
-  /// This constructor interprets Val as a string in the given radix. The
-  /// interpretation stops when the first character that is not suitable for
-  /// the radix is encountered. Acceptable radix values are 2, 8, 10 and 16.
-  /// It is an error for the value implied by the string to require more bits
-  /// than numBits.
-  /// @param val the string to be interpreted
-  /// @param radix the radix of Val to use for the interpretation
-  /// @brief Construct an ap_private from a string representation.
-  INLINE ap_private(const std::string& val, uint8_t radix = 2) {
-    set_canary();
-    assert(!val.empty() && "The input string is empty.");
-    const char* c_str = val.c_str();
-    fromString(c_str, val.size(), radix);
-    check_canary();
-  }
-
-  /// This constructor interprets the slen characters starting at StrStart as
-  /// a string in the given radix. The interpretation stops when the first
-  /// character that is not suitable for the radix is encountered. Acceptable
-  /// radix values are 2, 8, 10 and 16. It is an error for the value implied by
-  /// the string to require more bits than numBits.
-  /// @param strStart the start of the string to be interpreted
-  /// @param slen the maximum number of characters to interpret
-  /// @param radix the radix to use for the conversion
-  /// @brief Construct an ap_private from a string representation.
-  /// This method does not consider whether it is negative or not.
-  INLINE ap_private(const char strStart[], uint32_t slen, uint8_t radix) {
-    set_canary();
-    fromString(strStart, slen, radix);
-    check_canary();
-  }
-
-  INLINE void report() {
-    _AP_ERROR(_AP_W > MAX_MODE(AP_INT_MAX_W) * 1024,
-              "ap_%sint<%d>: Bitwidth exceeds the "
-              "default max value %d. Please use macro "
-              "AP_INT_MAX_W to set a larger max value.",
-              _AP_S ? "" : "u", _AP_W, MAX_MODE(AP_INT_MAX_W) * 1024);
-  }
-  /// This union is used to store the integer value. When the
-  /// integer bit-width <= 64, it uses VAL, otherwise it uses pVal.
-
-  /// This enum is used to hold the constants we needed for ap_private.
-  // uint64_t VAL; ///< Used to store the <= 64 bits integer value.
-  uint64_t pVal[_AP_N]; ///< Used to store the >64 bits integer value.
-#ifdef AP_CANARY
-  uint64_t CANARY;
-  INLINE void check_canary() { assert(CANARY == (uint64_t)0xDEADBEEFDEADBEEF); }
-  INLINE void set_canary() { CANARY = (uint64_t)0xDEADBEEFDEADBEEF; }
-#else
-  INLINE void check_canary() {}
-  INLINE void set_canary() {}
-#endif
-
- public:
-  typedef typename valtype<8, _AP_S>::Type ValType;
-  typedef ap_private<_AP_W, _AP_S> Type;
-  // FIXME remove friend type?
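The radix-2/8/16 branch of toString() above works because each digit is exactly the low lg(radix) bits of the value, so repeated masking and shifting recovers the digits without any division. A minimal standalone sketch of that digit-extraction loop for a plain 64-bit word (illustrative only; the names are not part of the header):

    #include <cstdint>
    #include <string>

    // Shift-based digit extraction, as in toString() for radix 2, 8 and 16.
    std::string to_radix_string(uint64_t v, unsigned radix /* 2, 8 or 16 */) {
        static const char* digits = "0123456789abcdef";
        const char* prefix = (radix == 2) ? "0b" : (radix == 8) ? "0o" : "0x";
        if (v == 0) return std::string(prefix) + "0";  // always emit an indicator
        unsigned shift = (radix == 16) ? 4 : (radix == 8) ? 3 : 1;
        uint64_t mask = radix - 1;
        std::string result;
        while (v) {
            result.insert(0, 1, digits[v & mask]);  // least-significant digit first
            v >>= shift;
        }
        return prefix + result;
    }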
- template - friend struct ap_fixed_base; - /// return type of variety of operations - //---------------------------------------------------------- - template - struct RType { - enum { - mult_w = _AP_W + _AP_W2, - mult_s = _AP_S || _AP_S2, - plus_w = - AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, - plus_s = _AP_S || _AP_S2, - minus_w = - AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, - minus_s = true, - div_w = _AP_W + _AP_S2, - div_s = _AP_S || _AP_S2, - mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)), - mod_s = _AP_S, - logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)), - logic_s = _AP_S || _AP_S2 - }; - typedef ap_private mult; - typedef ap_private plus; - typedef ap_private minus; - typedef ap_private logic; - typedef ap_private div; - typedef ap_private mod; - typedef ap_private<_AP_W, _AP_S> arg1; - typedef bool reduce; - }; - - INLINE uint64_t& get_VAL(void) { return pVal[0]; } - INLINE uint64_t get_VAL(void) const { return pVal[0]; } - INLINE uint64_t get_VAL(void) const volatile { return pVal[0]; } - INLINE void set_VAL(uint64_t value) { pVal[0] = value; } - INLINE uint64_t& get_pVal(int index) { return pVal[index]; } - INLINE uint64_t* get_pVal() { return pVal; } - INLINE const uint64_t* get_pVal() const { return pVal; } - INLINE uint64_t get_pVal(int index) const { return pVal[index]; } - INLINE uint64_t* get_pVal() const volatile { return pVal; } - INLINE uint64_t get_pVal(int index) const volatile { return pVal[index]; } - INLINE void set_pVal(int i, uint64_t value) { pVal[i] = value; } - - /// This enum is used to hold the constants we needed for ap_private. - enum { - APINT_BITS_PER_WORD = sizeof(uint64_t) * 8, ///< Bits in a word - APINT_WORD_SIZE = sizeof(uint64_t) ///< Byte size of a word - }; - - enum { - excess_bits = (_AP_W % APINT_BITS_PER_WORD) - ? APINT_BITS_PER_WORD - (_AP_W % APINT_BITS_PER_WORD) - : 0 - }; - static const uint64_t mask = ((uint64_t)~0ULL >> (excess_bits)); - - public: - // NOTE changed to explicit to be consistent with ap_private - explicit INLINE ap_private(const char* val) { - set_canary(); - unsigned char radix = 10; - std::string str = ap_private_ops::parseString(val, radix); // determine radix. - std::string::size_type pos = str.find('.'); - if (pos != std::string::npos) str = str.substr(pos); - ap_private ap_private_val(str, radix); - operator=(ap_private_val); - report(); - check_canary(); - } - - INLINE ap_private(const char* val, unsigned char rd) { - set_canary(); - unsigned char radix = rd; - std::string str = ap_private_ops::parseString(val, radix); // determine radix. 
- std::string::size_type pos = str.find('.'); - if (pos != std::string::npos) str = str.substr(pos); - ap_private ap_private_val(str, radix); - operator=(ap_private_val); - report(); - - report(); - check_canary(); - } - - template - INLINE ap_private(const _private_range_ref<_AP_W2, _AP_S2>& ref) { - set_canary(); - *this = ref.get(); - report(); - check_canary(); - } - - template - INLINE ap_private(const _private_bit_ref<_AP_W2, _AP_S2>& ref) { - set_canary(); - *this = ((uint64_t)(bool)ref); - report(); - check_canary(); - } - -// template -// INLINE ap_private(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { -// set_canary(); -// *this = ref.get(); -// report(); -// check_canary(); -// } -// -// template -// INLINE ap_private( -// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { -// set_canary(); -// *this = ((val.operator ap_private<_AP_W2, false>())); -// report(); -// check_canary(); -// } -// -// template -// INLINE ap_private( -// const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { -// set_canary(); -// *this = (uint64_t)(bool)val; -// report(); -// check_canary(); -// } - - /// Simply makes *this a copy of that. - /// @brief Copy Constructor. - INLINE ap_private(const ap_private& that) { - set_canary(); - memcpy(pVal, that.get_pVal(), _AP_N * APINT_WORD_SIZE); - clearUnusedBits(); - check_canary(); - } - - template - INLINE ap_private(const ap_private<_AP_W1, _AP_S1, false>& that) { - set_canary(); - operator=(that); - check_canary(); - } - - template - INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, false>& that) { - set_canary(); - operator=(const_cast&>(that)); - check_canary(); - } - - template - INLINE ap_private(const ap_private<_AP_W1, _AP_S1, true>& that) { - set_canary(); - static const uint64_t that_sign_ext_mask = - (_AP_W1 == APINT_BITS_PER_WORD) - ? 0 - : ~0ULL >> (_AP_W1 % APINT_BITS_PER_WORD) - << (_AP_W1 % APINT_BITS_PER_WORD); - if (that.isNegative()) { - pVal[0] = that.get_VAL() | that_sign_ext_mask; - memset(pVal + 1, ~0, sizeof(uint64_t) * (_AP_N - 1)); - } else { - pVal[0] = that.get_VAL(); - memset(pVal + 1, 0, sizeof(uint64_t) * (_AP_N - 1)); - } - clearUnusedBits(); - check_canary(); - } - - template - INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, true>& that) { - set_canary(); - operator=(const_cast&>(that)); - check_canary(); - } - - /// @brief Destructor. - // virtual ~ap_private() {} - INLINE ~ap_private() { check_canary(); } - - /// @name Constructors - /// @{ - - /// Default constructor that creates an uninitialized ap_private. This is - /// useful - /// for object deserialization (pair this with the static method Read). - INLINE ap_private() { - set_canary(); - clearUnusedBits(); - check_canary(); - } - - INLINE ap_private(uint64_t* val, uint32_t bits = _AP_W) { assert(0); } - INLINE ap_private(const uint64_t* const val, uint32_t bits) { assert(0); } - -/// If isSigned is true then val is treated as if it were a signed value -/// (i.e. as an int64_t) and the appropriate sign extension to the bit width -/// will be done. Otherwise, no sign extension occurs (high order bits beyond -/// the range of val are zero filled). -/// @param numBits the bit width of the constructed ap_private -/// @param val the initial value of the ap_private -/// @param isSigned how to treat signedness of val -/// @brief Create a new ap_private of numBits width, initialized as val. 
-#define CTOR(TYPE, SIGNED) \ - INLINE ap_private(TYPE val, bool isSigned = SIGNED) { \ - set_canary(); \ - pVal[0] = (ValType)val; \ - if (isSigned && int64_t(pVal[0]) < 0) { \ - memset(pVal + 1, ~0, sizeof(uint64_t) * (_AP_N - 1)); \ - } else { \ - memset(pVal + 1, 0, sizeof(uint64_t) * (_AP_N - 1)); \ - } \ - clearUnusedBits(); \ - check_canary(); \ - } - - CTOR(bool, false) - CTOR(char, CHAR_IS_SIGNED) - CTOR(signed char, true) - CTOR(unsigned char, false) - CTOR(short, true) - CTOR(unsigned short, false) - CTOR(int, true) - CTOR(unsigned int, false) - CTOR(long, true) - CTOR(unsigned long, false) - CTOR(ap_slong, true) - CTOR(ap_ulong, false) -#if 0 - CTOR(half, false) - CTOR(float, false) - CTOR(double, false) -#endif -#undef CTOR - - /// @returns true if the number of bits <= 64, false otherwise. - /// @brief Determine if this ap_private just has one word to store value. - INLINE bool isSingleWord() const { return false; } - - /// @returns the word position for the specified bit position. - /// @brief Determine which word a bit is in. - static INLINE uint32_t whichWord(uint32_t bitPosition) { - // return bitPosition / APINT_BITS_PER_WORD; - return (bitPosition) >> 6; - } - - /// @returns the bit position in a word for the specified bit position - /// in the ap_private. - /// @brief Determine which bit in a word a bit is in. - static INLINE uint32_t whichBit(uint32_t bitPosition) { - // return bitPosition % APINT_BITS_PER_WORD; - return bitPosition & 0x3f; - } - - /// bit at a specific bit position. This is used to mask the bit in the - /// corresponding word. - /// @returns a uint64_t with only bit at "whichBit(bitPosition)" set - /// @brief Get a single bit mask. - static INLINE uint64_t maskBit(uint32_t bitPosition) { - return 1ULL << (whichBit(bitPosition)); - } - - /// @returns the corresponding word for the specified bit position. - /// @brief Get the word corresponding to a bit position - INLINE uint64_t getWord(uint32_t bitPosition) const { - return pVal[whichWord(bitPosition)]; - } - - /// This method is used internally to clear the to "N" bits in the high order - /// word that are not used by the ap_private. This is needed after the most - /// significant word is assigned a value to ensure that those bits are - /// zero'd out. - /// @brief Clear unused high order bits - INLINE void clearUnusedBits(void) volatile -// just for clang compiler -#if defined(__clang__) && !defined(__CLANG_3_1__) - __attribute__((no_sanitize("undefined"))) -#endif - { - pVal[_AP_N - 1] = - _AP_S ? ((((int64_t)pVal[_AP_N - 1]) << (excess_bits)) >> excess_bits) - : (excess_bits - ? ((pVal[_AP_N - 1]) << (excess_bits)) >> (excess_bits) - : pVal[_AP_N - 1]); - } - - INLINE void clearUnusedBitsToZero(void) { pVal[_AP_N - 1] &= mask; } - - INLINE void clearUnusedBitsToOne(void) { pVal[_AP_N - 1] |= mask; } - - /// This is used by the constructors that take string arguments. 
- /// @brief Convert a char array into an ap_private - INLINE void fromString(const char* str, uint32_t slen, uint8_t radix) { - enum { numbits = _AP_W }; - bool isNeg = str[0] == '-'; - if (isNeg) { - str++; - slen--; - } - - if (str[0] == '0' && (str[1] == 'b' || str[1] == 'B')) { - //if(radix == 0) radix = 2; - _AP_WARNING(radix != 2, "%s seems to have base %d, but %d given.", str, 2, radix); - str += 2; - slen -=2; - } else if (str[0] == '0' && (str[1] == 'o' || str[1] == 'O')) { - //if (radix == 0) radix = 8; - _AP_WARNING(radix != 8, "%s seems to have base %d, but %d given.", str, 8, radix); - str += 2; - slen -=2; - } else if (str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) { - //if (radix == 0) radix = 16; - _AP_WARNING(radix != 16, "%s seems to have base %d, but %d given.", str, 16, radix); - str += 2; - slen -=2; - } else if (str[0] == '0' && (str[1] == 'd' || str[1] == 'D')) { - //if (radix == 0) radix = 10; - _AP_WARNING(radix != 10, "%s seems to have base %d, but %d given.", str, 10, radix); - str += 2; - slen -=2; - } else if (radix == 0) { - //radix = 2; // XXX default value - } - - // Check our assumptions here - assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) && - "Radix should be 2, 8, 10, or 16!"); - assert(str && "String is null?"); - - // skip any leading zero - while (*str == '0' && *(str + 1) != '\0') { - str++; - slen--; - } - assert((slen <= numbits || radix != 2) && "Insufficient bit width"); - assert(((slen - 1) * 3 <= numbits || radix != 8) && - "Insufficient bit width"); - assert(((slen - 1) * 4 <= numbits || radix != 16) && - "Insufficient bit width"); - assert((((slen - 1) * 64) / 22 <= numbits || radix != 10) && - "Insufficient bit width"); - - // clear bits - memset(pVal, 0, _AP_N * sizeof(uint64_t)); - - // Figure out if we can shift instead of multiply - uint32_t shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0); - - // Set up an ap_private for the digit to add outside the loop so we don't - // constantly construct/destruct it. - uint64_t bigVal[_AP_N]; - memset(bigVal, 0, _AP_N * sizeof(uint64_t)); - ap_private<_AP_W, _AP_S> apdigit(getBitWidth(), bigVal); - ap_private<_AP_W, _AP_S> apradix(radix); - - // Enter digit traversal loop - for (unsigned i = 0; i < slen; i++) { - // Get a digit - uint32_t digit = 0; - char cdigit = str[i]; - if (radix == 16) { -#define isxdigit(c) \ - (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') || \ - ((c) >= 'A' && (c) <= 'F')) -#define isdigit(c) ((c) >= '0' && (c) <= '9') - if (!isxdigit(cdigit)) assert(0 && "Invalid hex digit in string"); - if (isdigit(cdigit)) - digit = cdigit - '0'; - else if (cdigit >= 'a') - digit = cdigit - 'a' + 10; - else if (cdigit >= 'A') - digit = cdigit - 'A' + 10; - else - assert(0 && "huh? 
we shouldn't get here"); - } else if (isdigit(cdigit)) { - digit = cdigit - '0'; - } else if (cdigit != '\0') { - assert(0 && "Invalid character in digit string"); - } -#undef isxdigit -#undef isdigit - // Shift or multiply the value by the radix - if (shift) - *this <<= shift; - else - *this *= apradix; - - // Add in the digit we just interpreted - apdigit.set_VAL(digit); - *this += apdigit; - } - // If its negative, put it in two's complement form - if (isNeg) { - (*this)--; - this->flip(); - } - clearUnusedBits(); - } - - INLINE ap_private read() volatile { return *this; } - - INLINE void write(const ap_private& op2) volatile { *this = (op2); } - - INLINE operator ValType() const { return get_VAL(); } - - INLINE int to_uchar() const { return (unsigned char)get_VAL(); } - - INLINE int to_char() const { return (signed char)get_VAL(); } - - INLINE int to_ushort() const { return (unsigned short)get_VAL(); } - - INLINE int to_short() const { return (short)get_VAL(); } - - INLINE int to_int() const { return (int)get_VAL(); } - - INLINE unsigned to_uint() const { return (unsigned)get_VAL(); } - - INLINE long to_long() const { return (long)get_VAL(); } - - INLINE unsigned long to_ulong() const { return (unsigned long)get_VAL(); } - - INLINE ap_slong to_int64() const { return (ap_slong)get_VAL(); } - - INLINE ap_ulong to_uint64() const { return (ap_ulong)get_VAL(); } - - INLINE double to_double() const { - if (isNegative()) - return roundToDouble(true); - else - return roundToDouble(false); - } - - INLINE unsigned length() const { return _AP_W; } - - /*Reverse the contents of ap_private instance. I.e. LSB becomes MSB and vise - * versa*/ - INLINE ap_private& reverse() { - for (int i = 0; i < _AP_W / 2; ++i) { - bool tmp = operator[](i); - if (operator[](_AP_W - 1 - i)) - set(i); - else - clear(i); - if (tmp) - set(_AP_W - 1 - i); - else - clear(_AP_W - 1 - i); - } - clearUnusedBits(); - return *this; - } - - /*Return true if the value of ap_private instance is zero*/ - INLINE bool iszero() const { return isMinValue(); } - - INLINE bool to_bool() const { return !iszero(); } - - /* x < 0 */ - INLINE bool sign() const { - if (isNegative()) return true; - return false; - } - - /* x[i] = !x[i] */ - INLINE void invert(int i) { - assert(i >= 0 && "Attempting to read bit with negative index"); - assert(i < _AP_W && "Attempting to read bit beyond MSB"); - flip(i); - } - - /* x[i] */ - INLINE bool test(int i) const { - assert(i >= 0 && "Attempting to read bit with negative index"); - assert(i < _AP_W && "Attempting to read bit beyond MSB"); - return operator[](i); - } - - // Set the ith bit into v - INLINE void set(int i, bool v) { - assert(i >= 0 && "Attempting to write bit with negative index"); - assert(i < _AP_W && "Attempting to write bit beyond MSB"); - v ? set(i) : clear(i); - } - - // Set the ith bit into v - INLINE void set_bit(int i, bool v) { - assert(i >= 0 && "Attempting to write bit with negative index"); - assert(i < _AP_W && "Attempting to write bit beyond MSB"); - v ? set(i) : clear(i); - } - - // FIXME different argument for different action? 
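All of the per-bit accessors above reduce to the same addressing scheme encoded by whichWord(), whichBit() and maskBit(): bit i lives in 64-bit word i >> 6, at position i & 0x3f within that word. A standalone sketch of set/clear/test on a raw word array (illustrative only, not part of the header):

    #include <cstdint>

    static inline uint32_t whichWord(uint32_t i) { return i >> 6; }       // word index
    static inline uint64_t maskBit(uint32_t i)  { return 1ULL << (i & 0x3f); }

    void set_bit(uint64_t* p, uint32_t i, bool v) {
        if (v) p[whichWord(i)] |=  maskBit(i);   // set bit i
        else   p[whichWord(i)] &= ~maskBit(i);   // clear bit i
    }
    bool test_bit(const uint64_t* p, uint32_t i) {
        return (p[whichWord(i)] & maskBit(i)) != 0;
    }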
- INLINE ap_private& set(uint32_t bitPosition) { - pVal[whichWord(bitPosition)] |= maskBit(bitPosition); - clearUnusedBits(); - return *this; - } - - INLINE void set() { - for (int i = 0; i < _AP_N; ++i) pVal[i] = ~0ULL; - clearUnusedBits(); - } - - // Get the value of ith bit - INLINE bool get(int i) const { - assert(i >= 0 && "Attempting to read bit with negative index"); - assert(i < _AP_W && "Attempting to read bit beyond MSB"); - return ((maskBit(i) & (pVal[whichWord(i)])) != 0); - } - - // Get the value of ith bit - INLINE bool get_bit(int i) const { - assert(i >= 0 && "Attempting to read bit with negative index"); - assert(i < _AP_W && "Attempting to read bit beyond MSB"); - return ((maskBit(i) & (pVal[whichWord(i)])) != 0); - } - - // This is used for sc_lv and sc_bv, which is implemented by sc_uint - // Rotate an ap_private object n places to the left - INLINE void lrotate(int n) { - assert(n >= 0 && "Attempting to shift negative index"); - assert(n < _AP_W && "Shift value larger than bit width"); - operator=(shl(n) | lshr(_AP_W - n)); - } - - // This is used for sc_lv and sc_bv, which is implemented by sc_uint - // Rotate an ap_private object n places to the right - INLINE void rrotate(int n) { - assert(n >= 0 && "Attempting to shift negative index"); - assert(n < _AP_W && "Shift value larger than bit width"); - operator=(lshr(n) | shl(_AP_W - n)); - } - - /// Set the given bit to 0 whose position is given as "bitPosition". - /// @brief Set a given bit to 0. - INLINE ap_private& clear(uint32_t bitPosition) { - pVal[whichWord(bitPosition)] &= ~maskBit(bitPosition); - clearUnusedBits(); - return *this; - } - - /// @brief Set every bit to 0. - INLINE void clear() { memset(pVal, 0, _AP_N * APINT_WORD_SIZE); } - - /// @brief Toggle every bit to its opposite value. - ap_private& flip() { - for (int i = 0; i < _AP_N; ++i) pVal[i] ^= ~0ULL; - clearUnusedBits(); - return *this; - } - - /// @brief Toggles a given bit to its opposite value. - INLINE ap_private& flip(uint32_t bitPosition) { - assert(bitPosition < BitWidth && "Out of the bit-width range!"); - set_bit(bitPosition, !get_bit(bitPosition)); - return *this; - } - - // complements every bit - INLINE void b_not() { flip(); } - - INLINE ap_private getLoBits(uint32_t numBits) const { - return ap_private_ops::lshr(ap_private_ops::shl(*this, _AP_W - numBits), - _AP_W - numBits); - } - - INLINE ap_private getHiBits(uint32_t numBits) const { - return ap_private_ops::lshr(*this, _AP_W - numBits); - } - - // Binary Arithmetic - //----------------------------------------------------------- - -// template -// INLINE ap_private operator&( -// const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { -// return *this & a2.get(); -// } -// -// template -// INLINE ap_private operator|( -// const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { -// return *this | a2.get(); -// } -// -// template -// INLINE ap_private operator^( -// const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { -// return *this ^ a2.get(); -// } - -/// Arithmetic assign -//------------------------------------------------------------- - -#define OP_BIN_LOGIC_ASSIGN_AP(Sym) \ - template \ - INLINE ap_private& operator Sym(const ap_private<_AP_W1, _AP_S1>& RHS) { \ - const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; \ - uint32_t numWords = AESL_std::min((int)_AP_N, _AP_N1); \ - uint32_t i; \ - if (_AP_W != _AP_W1) \ - fprintf(stderr, \ - "Warning! 
Bitsize mismach for ap_[u]int " #Sym " ap_[u]int.\n"); \ - for (i = 0; i < numWords; ++i) pVal[i] Sym RHS.get_pVal(i); \ - if (_AP_N1 < _AP_N) { \ - uint64_t ext = RHS.isNegative() ? ~0ULL : 0; \ - for (; i < _AP_N; i++) pVal[i] Sym ext; \ - } \ - clearUnusedBits(); \ - return *this; \ - } - - OP_BIN_LOGIC_ASSIGN_AP(&=); - OP_BIN_LOGIC_ASSIGN_AP(|=); - OP_BIN_LOGIC_ASSIGN_AP(^=); -#undef OP_BIN_LOGIC_ASSIGN_AP - - /// Adds the RHS APint to this ap_private. - /// @returns this, after addition of RHS. - /// @brief Addition assignment operator. - template - INLINE ap_private& operator+=(const ap_private<_AP_W1, _AP_S1>& RHS) { - const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; - uint64_t RHSpVal[_AP_N1]; - for (int i = 0; i < _AP_N1; ++i) RHSpVal[i] = RHS.get_pVal(i); - ap_private_ops::add(pVal, pVal, RHSpVal, _AP_N, _AP_N, _AP_N1, _AP_S, - _AP_S1); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator-=(const ap_private<_AP_W1, _AP_S1>& RHS) { - const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; - uint64_t RHSpVal[_AP_N1]; - for (int i = 0; i < _AP_N1; ++i) RHSpVal[i] = RHS.get_pVal(i); - ap_private_ops::sub(pVal, pVal, RHSpVal, _AP_N, _AP_N, _AP_N1, _AP_S, - _AP_S1); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator*=(const ap_private<_AP_W1, _AP_S1>& RHS) { - // Get some bit facts about LHS and check for zero - uint32_t lhsBits = getActiveBits(); - uint32_t lhsWords = !lhsBits ? 0 : whichWord(lhsBits - 1) + 1; - if (!lhsWords) { - // 0 * X ===> 0 - return *this; - } - - ap_private dupRHS = RHS; - // Get some bit facts about RHS and check for zero - uint32_t rhsBits = dupRHS.getActiveBits(); - uint32_t rhsWords = !rhsBits ? 0 : whichWord(rhsBits - 1) + 1; - if (!rhsWords) { - // X * 0 ===> 0 - clear(); - return *this; - } - - // Allocate space for the result - uint32_t destWords = rhsWords + lhsWords; - uint64_t* dest = (uint64_t*)malloc(destWords * sizeof(uint64_t)); - - // Perform the long multiply - ap_private_ops::mul(dest, pVal, lhsWords, dupRHS.get_pVal(), rhsWords, - destWords); - - // Copy result back into *this - clear(); - uint32_t wordsToCopy = destWords >= _AP_N ? _AP_N : destWords; - - memcpy(pVal, dest, wordsToCopy * APINT_WORD_SIZE); - - uint64_t ext = (isNegative() ^ RHS.isNegative()) ? ~0ULL : 0ULL; - for (int i = wordsToCopy; i < _AP_N; i++) pVal[i] = ext; - clearUnusedBits(); - // delete dest array and return - free(dest); - return *this; - } - -#define OP_ASSIGN_AP(Sym) \ - template \ - INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \ - *this = operator Sym(op); \ - return *this; \ - } - - OP_ASSIGN_AP(/) - OP_ASSIGN_AP(%) -#undef OP_ASSIGN_AP - -#define OP_BIN_LOGIC_AP(Sym) \ - template \ - INLINE typename RType<_AP_W1, _AP_S1>::logic operator Sym( \ - const ap_private<_AP_W1, _AP_S1>& RHS) const { \ - enum { \ - numWords = (RType<_AP_W1, _AP_S1>::logic_w + APINT_BITS_PER_WORD - 1) / \ - APINT_BITS_PER_WORD \ - }; \ - typename RType<_AP_W1, _AP_S1>::logic Result; \ - uint32_t i; \ - const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; \ - uint32_t min_N = std::min((int)_AP_N, _AP_N1); \ - uint32_t max_N = std::max((int)_AP_N, _AP_N1); \ - for (i = 0; i < min_N; ++i) \ - Result.set_pVal(i, pVal[i] Sym RHS.get_pVal(i)); \ - if (numWords > i) { \ - uint64_t ext = ((_AP_N < _AP_N1 && isNegative()) || \ - (_AP_N1 < _AP_N && RHS.isNegative())) \ - ? 
~0ULL                                           \
-                        : 0;                                              \
-      if (_AP_N > _AP_N1)                                                 \
-        for (; i < max_N; i++) Result.set_pVal(i, pVal[i] Sym ext);       \
-      else                                                                \
-        for (; i < max_N; i++) Result.set_pVal(i, RHS.get_pVal(i) Sym ext); \
-      if (numWords > i) {                                                 \
-        uint64_t ext2 = ((_AP_N > _AP_N1 && isNegative()) ||              \
-                         (_AP_N1 > _AP_N && RHS.isNegative()))            \
-                            ? ~0ULL                                       \
-                            : 0;                                          \
-        Result.set_pVal(i, ext Sym ext2);                                 \
-      }                                                                   \
-    }                                                                     \
-    Result.clearUnusedBits();                                             \
-    return Result;                                                        \
-  }
-
-  OP_BIN_LOGIC_AP(|);
-  OP_BIN_LOGIC_AP(&);
-  OP_BIN_LOGIC_AP(^);
-
-#undef OP_BIN_LOGIC_AP
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE typename RType<_AP_W1, _AP_S1>::plus operator+(
-      const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    typename RType<_AP_W1, _AP_S1>::plus Result, lhs(*this), rhs(RHS);
-    const int Result_AP_N = (RType<_AP_W1, _AP_S1>::plus_w + 63) / 64;
-    ap_private_ops::add(Result.get_pVal(), lhs.get_pVal(), rhs.get_pVal(),
-                        Result_AP_N, Result_AP_N, Result_AP_N, _AP_S, _AP_S1);
-    Result.clearUnusedBits();
-    return Result;
-  }
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE typename RType<_AP_W1, _AP_S1>::minus operator-(
-      const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    typename RType<_AP_W1, _AP_S1>::minus Result, lhs(*this), rhs(RHS);
-    const int Result_AP_N = (RType<_AP_W1, _AP_S1>::minus_w + 63) / 64;
-    ap_private_ops::sub(Result.get_pVal(), lhs.get_pVal(), rhs.get_pVal(),
-                        Result_AP_N, Result_AP_N, Result_AP_N, _AP_S, _AP_S1);
-    Result.clearUnusedBits();
-    return Result;
-  }
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE typename RType<_AP_W1, _AP_S1>::mult operator*(
-      const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    typename RType<_AP_W1, _AP_S1>::mult temp = *this;
-    temp *= RHS;
-    return temp;
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE typename RType<_AP_W2, _AP_S2>::div operator/(
-      const ap_private<_AP_W2, _AP_S2>& op) const {
-    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
-               (_AP_W > _AP_W2 ? _AP_S
-                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
-        lhs = *this;
-    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
-               (_AP_W > _AP_W2 ? _AP_S
-                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
-        rhs = op;
-    return typename RType<_AP_W2, _AP_S2>::div(
-        (_AP_S || _AP_S2) ? lhs.sdiv(rhs) : lhs.udiv(rhs));
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE typename RType<_AP_W2, _AP_S2>::mod operator%(
-      const ap_private<_AP_W2, _AP_S2>& op) const {
-    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
-               (_AP_W > _AP_W2 ? _AP_S
-                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
-        lhs = *this;
-    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
-               (_AP_W > _AP_W2 ? _AP_S
-                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
-        rhs = op;
-    typename RType<_AP_W2, _AP_S2>::mod res =
-        typename RType<_AP_W2, _AP_S2>::mod(_AP_S ?
lhs.srem(rhs) - : lhs.urem(rhs)); - return res; - } - -#define OP_LEFT_SHIFT_CTYPE(TYPE, SIGNED) \ - INLINE ap_private operator<<(const TYPE op) const { \ - if (op >= _AP_W) return ap_private(0); \ - if (SIGNED && op < 0) return *this >> (0 - op); \ - return shl(op); \ - } - - OP_LEFT_SHIFT_CTYPE(int, true) - // OP_LEFT_SHIFT_CTYPE(bool, false) - OP_LEFT_SHIFT_CTYPE(signed char, true) - OP_LEFT_SHIFT_CTYPE(unsigned char, false) - OP_LEFT_SHIFT_CTYPE(short, true) - OP_LEFT_SHIFT_CTYPE(unsigned short, false) - OP_LEFT_SHIFT_CTYPE(unsigned int, false) - OP_LEFT_SHIFT_CTYPE(long, true) - OP_LEFT_SHIFT_CTYPE(unsigned long, false) - OP_LEFT_SHIFT_CTYPE(unsigned long long, false) - OP_LEFT_SHIFT_CTYPE(long long, true) -#if 0 - OP_LEFT_SHIFT_CTYPE(half, false) - OP_LEFT_SHIFT_CTYPE(float, false) - OP_LEFT_SHIFT_CTYPE(double, false) -#endif -#undef OP_LEFT_SHIFT_CTYPE - - template - INLINE ap_private operator<<(const ap_private<_AP_W2, _AP_S2>& op2) const { - if (_AP_S2 == false) { - uint32_t sh = op2.to_uint(); - return *this << sh; - } else { - int sh = op2.to_int(); - return *this << sh; - } - } - -#define OP_RIGHT_SHIFT_CTYPE(TYPE, SIGNED) \ - INLINE ap_private operator>>(const TYPE op) const { \ - if (op >= _AP_W) { \ - if (isNegative()) \ - return ap_private(-1); \ - else \ - return ap_private(0); \ - } \ - if ((SIGNED) && op < 0) return *this << (0 - op); \ - if (_AP_S) \ - return ashr(op); \ - else \ - return lshr(op); \ - } - - // OP_RIGHT_SHIFT_CTYPE(bool, false) - OP_RIGHT_SHIFT_CTYPE(char, CHAR_IS_SIGNED) - OP_RIGHT_SHIFT_CTYPE(signed char, true) - OP_RIGHT_SHIFT_CTYPE(unsigned char, false) - OP_RIGHT_SHIFT_CTYPE(short, true) - OP_RIGHT_SHIFT_CTYPE(unsigned short, false) - OP_RIGHT_SHIFT_CTYPE(int, true) - OP_RIGHT_SHIFT_CTYPE(unsigned int, false) - OP_RIGHT_SHIFT_CTYPE(long, true) - OP_RIGHT_SHIFT_CTYPE(unsigned long, false) - OP_RIGHT_SHIFT_CTYPE(unsigned long long, false) - OP_RIGHT_SHIFT_CTYPE(long long, true) -#if 0 - OP_RIGHT_SHIFT_CTYPE(half, false) - OP_RIGHT_SHIFT_CTYPE(float, false) - OP_RIGHT_SHIFT_CTYPE(double, false) -#endif -#undef OP_RIGHT_SHIFT_CTYPE - - template - INLINE ap_private operator>>(const ap_private<_AP_W2, _AP_S2>& op2) const { - if (_AP_S2 == false) { - uint32_t sh = op2.to_uint(); - return *this >> sh; - } else { - int sh = op2.to_int(); - return *this >> sh; - } - } - - /// Shift assign - //------------------------------------------------------------------ - // TODO call clearUnusedBits ? -#define OP_ASSIGN_AP(Sym) \ - template \ - INLINE ap_private& operator Sym##=(int op) { \ - *this = operator Sym(op); \ - return *this; \ - } \ - INLINE ap_private& operator Sym##=(unsigned int op) { \ - *this = operator Sym(op); \ - return *this; \ - } \ - template \ - INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \ - *this = operator Sym(op); \ - return *this; \ - } - OP_ASSIGN_AP(>>) - OP_ASSIGN_AP(<<) -#undef OP_ASSIGN_AP - - /// Comparisons - //----------------------------------------------------------------- - INLINE bool operator==(const ap_private& RHS) const { - // Get some facts about the number of bits used in the two operands. - uint32_t n1 = getActiveBits(); - uint32_t n2 = RHS.getActiveBits(); - - // If the number of bits isn't the same, they aren't equal - if (n1 != n2) return false; - - // If the number of bits fits in a word, we only need to compare the low - // word. 
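The equality test above avoids touching every word when it can: it first compares how many significant bits each operand actually uses, and only walks the words from the top when those counts agree. A standalone sketch of the same early-out strategy for unsigned multiword values (illustrative only):

    #include <cstdint>

    // Multiword equality with the early-out used by operator== above:
    // differing magnitudes cannot be equal, so find the highest non-zero
    // word first and only then compare word by word.
    bool multiword_eq(const uint64_t* a, const uint64_t* b, int nwords) {
        int hiA = nwords - 1, hiB = nwords - 1;
        while (hiA > 0 && a[hiA] == 0) --hiA;
        while (hiB > 0 && b[hiB] == 0) --hiB;
        if (hiA != hiB) return false;          // different active widths
        for (int i = hiA; i >= 0; --i)
            if (a[i] != b[i]) return false;
        return true;
    }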
- if (n1 <= APINT_BITS_PER_WORD) return pVal[0] == RHS.get_pVal(0); - - // Otherwise, compare everything - for (int i = whichWord(n1 - 1); i >= 0; --i) - if (pVal[i] != RHS.get_pVal(i)) return false; - return true; - } - - template - INLINE bool operator==(const ap_private<_AP_W2, _AP_S2>& op) const { - enum { - _AP_MAX_W = AP_MAX(_AP_W, _AP_W2), - }; - ap_private<_AP_MAX_W, false> lhs(*this); - ap_private<_AP_MAX_W, false> rhs(op); - return lhs == rhs; - } - - INLINE bool operator==(uint64_t Val) const { - uint32_t n = getActiveBits(); - if (n <= APINT_BITS_PER_WORD) - return pVal[0] == Val; - else - return false; - } - - template - INLINE bool operator!=(const ap_private<_AP_W2, _AP_S2>& op) const { - return !(*this == op); - } - - template - INLINE bool operator!=(const ap_private<_AP_W, _AP_S1>& RHS) const { - return !((*this) == RHS); - } - - INLINE bool operator!=(uint64_t Val) const { return !((*this) == Val); } - - template - INLINE bool operator<=(const ap_private<_AP_W2, _AP_S2>& op) const { - return !(*this > op); - } - - INLINE bool operator<(const ap_private& op) const { - return _AP_S ? slt(op) : ult(op); - } - - template - INLINE bool operator<(const ap_private<_AP_W2, _AP_S2>& op) const { - enum { - _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) - }; - ap_private<_AP_MAX_W, _AP_S> lhs(*this); - ap_private<_AP_MAX_W, _AP_S2> rhs(op); - if (_AP_S == _AP_S2) - return _AP_S ? lhs.slt(rhs) : lhs.ult(rhs); - else if (_AP_S) - if (_AP_W2 >= _AP_W) - return lhs.ult(rhs); - else - return lhs.slt(rhs); - else if (_AP_W >= _AP_W2) - return lhs.ult(rhs); - else - return lhs.slt(rhs); - } - - template - INLINE bool operator>=(const ap_private<_AP_W2, _AP_S2>& op) const { - return !(*this < op); - } - - INLINE bool operator>(const ap_private& op) const { - return _AP_S ? sgt(op) : ugt(op); - } - - template - INLINE bool operator>(const ap_private<_AP_W2, _AP_S2>& op) const { - enum { - _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) - }; - ap_private<_AP_MAX_W, _AP_S> lhs(*this); - ap_private<_AP_MAX_W, _AP_S2> rhs(op); - if (_AP_S == _AP_S2) - return _AP_S ? 
lhs.sgt(rhs) : lhs.ugt(rhs); - else if (_AP_S) - if (_AP_W2 >= _AP_W) - return lhs.ugt(rhs); - else - return lhs.sgt(rhs); - else if (_AP_W >= _AP_W2) - return lhs.ugt(rhs); - else - return lhs.sgt(rhs); - } - - /// Bit and Part Select - //-------------------------------------------------------------- - INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { - return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); - } - - INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { - return _private_range_ref<_AP_W, _AP_S>( - const_cast*>(this), Hi, Lo); - } - - INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { - return _private_range_ref<_AP_W, _AP_S>( - (const_cast*>(this)), Hi, Lo); - } - - INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { - return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); - } - - template - INLINE _private_range_ref<_AP_W, _AP_S> range( - const ap_private<_AP_W2, _AP_S2>& HiIdx, - const ap_private<_AP_W3, _AP_S3>& LoIdx) { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); - } - - template - INLINE _private_range_ref<_AP_W, _AP_S> operator()( - const ap_private<_AP_W2, _AP_S2>& HiIdx, - const ap_private<_AP_W3, _AP_S3>& LoIdx) { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); - } - - template - INLINE _private_range_ref<_AP_W, _AP_S> range( - const ap_private<_AP_W2, _AP_S2>& HiIdx, - const ap_private<_AP_W3, _AP_S3>& LoIdx) const { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return _private_range_ref<_AP_W, _AP_S>(const_cast(this), Hi, Lo); - } - - template - INLINE _private_range_ref<_AP_W, _AP_S> operator()( - const ap_private<_AP_W2, _AP_S2>& HiIdx, - const ap_private<_AP_W3, _AP_S3>& LoIdx) const { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - INLINE _private_bit_ref<_AP_W, _AP_S> operator[](int index) { - return _private_bit_ref<_AP_W, _AP_S>(*this, index); - } - - template - INLINE _private_bit_ref<_AP_W, _AP_S> operator[]( - const ap_private<_AP_W2, _AP_S2>& index) { - return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); - } - - template - INLINE const _private_bit_ref<_AP_W, _AP_S> operator[]( - const ap_private<_AP_W2, _AP_S2>& index) const { - return _private_bit_ref<_AP_W, _AP_S>( - const_cast&>(*this), index.to_int()); - } - - INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](int index) const { - return _private_bit_ref<_AP_W, _AP_S>( - const_cast&>(*this), index); - } - - INLINE _private_bit_ref<_AP_W, _AP_S> bit(int index) { - return _private_bit_ref<_AP_W, _AP_S>(*this, index); - } - - template - INLINE _private_bit_ref<_AP_W, _AP_S> bit(const ap_private<_AP_W2, _AP_S2>& index) { - return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); - } - - INLINE const _private_bit_ref<_AP_W, _AP_S> bit(int index) const { - return _private_bit_ref<_AP_W, _AP_S>( - const_cast&>(*this), index); - } - - template - INLINE const _private_bit_ref<_AP_W, _AP_S> bit( - const ap_private<_AP_W2, _AP_S2>& index) const { - return _private_bit_ref<_AP_W, _AP_S>( - const_cast&>(*this), index.to_int()); - } - -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// ap_private<_AP_W2, _AP_S2> > -// concat(ap_private<_AP_W2, _AP_S2>& a2) { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// template -// INLINE 
ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// ap_private<_AP_W2, _AP_S2> > -// concat(const ap_private<_AP_W2, _AP_S2>& a2) const { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >( -// const_cast&>(*this), -// const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > -// operator,(ap_private<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, ap_private, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > -// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<_AP_W, ap_private, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >( -// const_cast&>(*this), a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > -// operator,(const ap_private<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, ap_private, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >( -// *this, const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > -// operator,(const ap_private<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<_AP_W, ap_private, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >( -// const_cast&>(*this), -// const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> > -// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> >( -// const_cast&>(*this), -// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> > -// operator,(_private_range_ref<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, -// _private_bit_ref<_AP_W2, _AP_S2> > -// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, -// _private_bit_ref<_AP_W2, _AP_S2> >( -// const_cast&>(*this), -// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, -// _private_bit_ref<_AP_W2, _AP_S2> > -// operator,(_private_bit_ref<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, -// _private_bit_ref<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > -// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( -// const_cast&>(*this), -// const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > -// operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, -// a2); -// } -// -// template -// INLINE 
ap_concat_ref< -// _AP_W, ap_private, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> -// &a2) const { -// return ap_concat_ref< -// _AP_W, ap_private, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( -// const_cast&>(*this), -// const_cast< -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref< -// _AP_W, ap_private, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { -// return ap_concat_ref< -// _AP_W, ap_private, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, -// a2); -// } -// -// template -// INLINE -// ap_concat_ref<_AP_W, ap_private, 1, -// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> -// &a2) const { -// return ap_concat_ref< -// _AP_W, ap_private, 1, -// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( -// const_cast&>(*this), -// const_cast&>( -// a2)); -// } -// -// template -// INLINE -// ap_concat_ref<_AP_W, ap_private, 1, -// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,( -// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { -// return ap_concat_ref< -// _AP_W, ap_private, 1, -// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); -// } - - INLINE ap_private<_AP_W, false> get() const { - ap_private<_AP_W, false> ret(*this); - return ret; - } - - template - INLINE void set(const ap_private<_AP_W3, false>& val) { - operator=(ap_private<_AP_W3, _AP_S>(val)); - } - - /// - /// @name Value Tests - /// - /// This tests the high bit of this ap_private to determine if it is set. - /// @returns true if this ap_private is negative, false otherwise - /// @brief Determine sign of this ap_private. - INLINE bool isNegative() const { - // just for get rid of warnings - enum { shift = (_AP_W - APINT_BITS_PER_WORD * (_AP_N - 1) - 1) }; - static const uint64_t mask = 1ULL << (shift); - return _AP_S && (pVal[_AP_N - 1] & mask); - } - - /// This tests the high bit of the ap_private to determine if it is unset. - /// @brief Determine if this ap_private Value is positive (not negative). - INLINE bool isPositive() const { return !isNegative(); } - - /// This tests if the value of this ap_private is strictly positive (> 0). - /// @returns true if this ap_private is Positive and not zero. - /// @brief Determine if this ap_private Value is strictly positive. - INLINE bool isStrictlyPositive() const { - return isPositive() && (*this) != 0; - } - - /// This checks to see if the value has all bits of the ap_private are set or - /// not. - /// @brief Determine if all bits are set - INLINE bool isAllOnesValue() const { return countPopulation() == _AP_W; } - - /// This checks to see if the value of this ap_private is the maximum unsigned - /// value for the ap_private's bit width. - /// @brief Determine if this is the largest unsigned value. - INLINE bool isMaxValue() const { return countPopulation() == _AP_W; } - - /// This checks to see if the value of this ap_private is the maximum signed - /// value for the ap_private's bit width. - /// @brief Determine if this is the largest signed value. 
-  INLINE bool isMaxSignedValue() const {
-    return !isNegative() && countPopulation() == _AP_W - 1;
-  }
-
-  /// This checks to see if the value of this ap_private is the minimum unsigned
-  /// value for the ap_private's bit width.
-  /// @brief Determine if this is the smallest unsigned value.
-  INLINE bool isMinValue() const { return countPopulation() == 0; }
-
-  /// This checks to see if the value of this ap_private is the minimum signed
-  /// value for the ap_private's bit width.
-  /// @brief Determine if this is the smallest signed value.
-  INLINE bool isMinSignedValue() const {
-    return isNegative() && countPopulation() == 1;
-  }
-
-  /// This function returns a pointer to the internal storage of the ap_private.
-  /// This is useful for writing out the ap_private in binary form without any
-  /// conversions.
-  INLINE const uint64_t* getRawData() const { return &pVal[0]; }
-
-  // Square Root - this method computes and returns the square root of "this".
-  // Three mechanisms are used for computation. For small values (<= 5 bits),
-  // a table lookup is done. This gets some performance for common cases. For
-  // values using less than 52 bits, the value is converted to double and then
-  // the libc sqrt function is called. The result is rounded and then converted
-  // back to a uint64_t which is then used to construct the result. Finally,
-  // the Babylonian method for computing square roots is used.
-  INLINE ap_private sqrt() const {
-    // Determine the magnitude of the value.
-    uint32_t magnitude = getActiveBits();
-
-    // Use a fast table for some small values. This also gets rid of some
-    // rounding errors in libc sqrt for small values.
-    if (magnitude <= 5) {
-      static const uint8_t results[32] = {
-          /* 0 */ 0,
-          /* 1- 2 */ 1, 1,
-          /* 3- 6 */ 2, 2, 2, 2,
-          /* 7-12 */ 3, 3, 3, 3, 3, 3,
-          /* 13-20 */ 4, 4, 4, 4, 4, 4, 4, 4,
-          /* 21-30 */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-          /* 31 */ 6};
-      return ap_private<_AP_W, _AP_S>(/*BitWidth,*/ results[get_VAL()]);
-    }
-
-    // If the magnitude of the value fits in less than 52 bits (the precision of
-    // an IEEE double precision floating point value), then we can use the
-    // libc sqrt function which will probably use a hardware sqrt computation.
-    // This should be faster than the algorithm below.
-    if (magnitude < 52) {
-#ifdef _MSC_VER
-      // Amazingly, VC++ doesn't have round().
-      return ap_private<_AP_W, _AP_S>(/*BitWidth,*/
-                                      uint64_t(::sqrt(double(get_VAL()))) +
-                                          0.5);
-#else
-      return ap_private<_AP_W, _AP_S>(/*BitWidth,*/
-                                      uint64_t(
-                                          ::round(::sqrt(double(get_VAL())))));
-#endif
-    }
-
-    // Okay, all the short cuts are exhausted. We must compute it. The following
-    // is a classical Babylonian method for computing the square root. This code
-    // was adapted to APInt from a wikipedia article on such computations.
-    // See http://www.wikipedia.org/ and go to the page named
-    // Calculate_an_integer_square_root.
-    uint32_t nbits = BitWidth, i = 4;
-    ap_private<_AP_W, _AP_S> testy(16);
-    ap_private<_AP_W, _AP_S> x_old(/*BitWidth,*/ 1);
-    ap_private<_AP_W, _AP_S> x_new(0);
-    ap_private<_AP_W, _AP_S> two(/*BitWidth,*/ 2);
-
-    // Select a good starting value using binary logarithms.
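The loop that follows is the classical Babylonian (Newton) iteration x -> (n/x + x)/2, seeded with a power of two at least as large as the root. A compact standalone version for a single 64-bit word (illustrative only; the header's version resolves exact midpoints slightly differently, as its own comment about pari/gp notes):

    #include <bit>
    #include <cstdint>

    // Babylonian integer square root: seed with 2^ceil(bits/2) >= sqrt(n),
    // iterate until the sequence stops decreasing, then round to nearest.
    uint64_t isqrt64(uint64_t n) {
        if (n < 2) return n;
        unsigned bits = std::bit_width(n);
        uint64_t x = 1ULL << ((bits + 1) / 2);
        for (;;) {
            uint64_t next = (n / x + x) / 2;
            if (next >= x) break;   // converged: x == floor(sqrt(n))
            x = next;
        }
        // The midpoint between x^2 and (x+1)^2 lies at x^2 + x.
        return (n - x * x > x) ? x + 1 : x;
    }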
- for (;; i += 2, testy = testy.shl(2)) - if (i >= nbits || this->ule(testy)) { - x_old = x_old.shl(i / 2); - break; - } - - // Use the Babylonian method to arrive at the integer square root: - for (;;) { - x_new = (this->udiv(x_old) + x_old).udiv(two); - if (x_old.ule(x_new)) break; - x_old = x_new; - } - - // Make sure we return the closest approximation - // NOTE: The rounding calculation below is correct. It will produce an - // off-by-one discrepancy with results from pari/gp. That discrepancy has - // been - // determined to be a rounding issue with pari/gp as it begins to use a - // floating point representation after 192 bits. There are no discrepancies - // between this algorithm and pari/gp for bit widths < 192 bits. - ap_private<_AP_W, _AP_S> square(x_old * x_old); - ap_private<_AP_W, _AP_S> nextSquare((x_old + 1) * (x_old + 1)); - if (this->ult(square)) - return x_old; - else if (this->ule(nextSquare)) { - ap_private<_AP_W, _AP_S> midpoint((nextSquare - square).udiv(two)); - ap_private<_AP_W, _AP_S> offset(*this - square); - if (offset.ult(midpoint)) - return x_old; - else - return x_old + 1; - } else - assert(0 && "Error in ap_private<_AP_W, _AP_S>::sqrt computation"); - return x_old + 1; - } - - /// - /// @Assignment Operators - /// - /// @returns *this after assignment of RHS. - /// @brief Copy assignment operator. - INLINE ap_private& operator=(const ap_private& RHS) { - if (this != &RHS) memcpy(pVal, RHS.get_pVal(), _AP_N * APINT_WORD_SIZE); - clearUnusedBits(); - return *this; - } - INLINE ap_private& operator=(const volatile ap_private& RHS) { - if (this != &RHS) - for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); - clearUnusedBits(); - return *this; - } - INLINE void operator=(const ap_private& RHS) volatile { - if (this != &RHS) - for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); - clearUnusedBits(); - } - INLINE void operator=(const volatile ap_private& RHS) volatile { - if (this != &RHS) - for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); - clearUnusedBits(); - } - - template - INLINE ap_private& operator=(const ap_private<_AP_W1, _AP_S1>& RHS) { - if (_AP_S1) - cpSextOrTrunc(RHS); - else - cpZextOrTrunc(RHS); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator=(const volatile ap_private<_AP_W1, _AP_S1>& RHS) { - if (_AP_S1) - cpSextOrTrunc(RHS); - else - cpZextOrTrunc(RHS); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - *this = ap_private<_AP_W2, false>(op2); - return *this; - } - -#if 0 - template - INLINE ap_private& operator=(const ap_private<_AP_W1, _AP_S1, true>& RHS) { - static const uint64_t that_sign_ext_mask = (_AP_W1==APINT_BITS_PER_WORD)?0:~0ULL>>(_AP_W1%APINT_BITS_PER_WORD)<<(_AP_W1%APINT_BITS_PER_WORD); - if (RHS.isNegative()) { - pVal[0] = RHS.get_VAL() | that_sign_ext_mask; - memset(pVal+1,~0, APINT_WORD_SIZE*(_AP_N-1)); - } else { - pVal[0] = RHS.get_VAL(); - memset(pVal+1, 0, APINT_WORD_SIZE*(_AP_N-1)); - } - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator=(const volatile ap_private<_AP_W1, _AP_S1, true>& RHS) { - static const uint64_t that_sign_ext_mask = (_AP_W1==APINT_BITS_PER_WORD)?0:~0ULL>>(_AP_W1%APINT_BITS_PER_WORD)<<(_AP_W1%APINT_BITS_PER_WORD); - if (RHS.isNegative()) { - pVal[0] = RHS.get_VAL() | that_sign_ext_mask; - memset(pVal+1,~0, APINT_WORD_SIZE*(_AP_N-1)); - } else { - pVal[0] = RHS.get_VAL(); - memset(pVal+1, 0, APINT_WORD_SIZE*(_AP_N-1)); - } - 
clearUnusedBits(); - return *this; - } -#endif - -/// from all c types. -#define ASSIGN_OP_FROM_INT(C_TYPE, _AP_W2, _AP_S2) \ - INLINE ap_private& operator=(const C_TYPE rhs) { \ - ap_private<(_AP_W2), (_AP_S2)> tmp = rhs; \ - operator=(tmp); \ - return *this; \ - } - - ASSIGN_OP_FROM_INT(bool, 1, false) - ASSIGN_OP_FROM_INT(char, 8, CHAR_IS_SIGNED) - ASSIGN_OP_FROM_INT(signed char, 8, true) - ASSIGN_OP_FROM_INT(unsigned char, 8, false) - ASSIGN_OP_FROM_INT(short, sizeof(short) * 8, true) - ASSIGN_OP_FROM_INT(unsigned short, sizeof(unsigned short) * 8, false) - ASSIGN_OP_FROM_INT(int, sizeof(int) * 8, true) - ASSIGN_OP_FROM_INT(unsigned int, sizeof(unsigned int) * 8, false) - ASSIGN_OP_FROM_INT(long, sizeof(long) * 8, true) - ASSIGN_OP_FROM_INT(unsigned long, sizeof(unsigned long) * 8, false) - ASSIGN_OP_FROM_INT(ap_slong, sizeof(ap_slong) * 8, true) - ASSIGN_OP_FROM_INT(ap_ulong, sizeof(ap_ulong) * 8, false) -#undef ASSIGN_OP_FROM_INT - - /// from c string. - // XXX this is a must, to prevent pointer being converted to bool. - INLINE ap_private& operator=(const char* s) { - ap_private tmp(s); // XXX direct initialization, as ctor is explicit. - operator=(tmp); - return *this; - } - - /// - /// @name Unary Operators - /// - /// @returns a new ap_private value representing *this incremented by one - /// @brief Postfix increment operator. - INLINE const ap_private operator++(int) { - ap_private API(*this); - ++(*this); - return API; - } - - /// @returns *this incremented by one - /// @brief Prefix increment operator. - INLINE ap_private& operator++() { - ap_private_ops::add_1(pVal, pVal, _AP_N, 1); - clearUnusedBits(); - return *this; - } - - /// @returns a new ap_private representing *this decremented by one. - /// @brief Postfix decrement operator. - INLINE const ap_private operator--(int) { - ap_private API(*this); - --(*this); - return API; - } - - /// @returns *this decremented by one. - /// @brief Prefix decrement operator. - INLINE ap_private& operator--() { - ap_private_ops::sub_1(pVal, _AP_N, 1); - clearUnusedBits(); - return *this; - } - - /// Performs a bitwise complement operation on this ap_private. - /// @returns an ap_private that is the bitwise complement of *this - /// @brief Unary bitwise complement operator. - INLINE ap_private<_AP_W + !_AP_S, true> operator~() const { - ap_private<_AP_W + !_AP_S, true> Result(*this); - Result.flip(); - return Result; - } - - /// Negates *this using two's complement logic. - /// @returns An ap_private value representing the negation of *this. - /// @brief Unary negation operator - INLINE typename RType<1, false>::minus operator-() const { - return ap_private<1, false>(0) - (*this); - } - - /// Performs logical negation operation on this ap_private. - /// @returns true if *this is zero, false otherwise. - /// @brief Logical negation operator. 
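The ASSIGN_OP_FROM_INT block above is the usual "stamp out one operator= per native C type" macro, each overload routing through a temporary so that truncation and extension happen in exactly one place, while the separate const char* overload keeps string literals from decaying to bool. A toy version of the same pattern; Nibble and NIBBLE_ASSIGN_FROM_INT are names invented for this sketch:

#include <cassert>
#include <cstdint>

// Tiny 4-bit stand-in for ap_private: the macro generates one member
// operator= per native type, all converging on the same constructor.
struct Nibble {
    unsigned v : 4;
    Nibble() : v(0) {}
    explicit Nibble(uint64_t raw) : v(raw & 0xF) {}

#define NIBBLE_ASSIGN_FROM_INT(C_TYPE)          \
    Nibble& operator=(C_TYPE rhs) {             \
        Nibble tmp(static_cast<uint64_t>(rhs)); \
        v = tmp.v;                              \
        return *this;                           \
    }
    NIBBLE_ASSIGN_FROM_INT(bool)
    NIBBLE_ASSIGN_FROM_INT(signed char)
    NIBBLE_ASSIGN_FROM_INT(unsigned char)
    NIBBLE_ASSIGN_FROM_INT(int)
    NIBBLE_ASSIGN_FROM_INT(unsigned int)
    NIBBLE_ASSIGN_FROM_INT(long long)
    NIBBLE_ASSIGN_FROM_INT(unsigned long long)
#undef NIBBLE_ASSIGN_FROM_INT
};

int main() {
    Nibble n;
    n = 0x1F;                  // int overload; truncates to 0xF
    assert(n.v == 0xF);
    n = true;                  // bool overload
    assert(n.v == 1);
}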
- INLINE bool operator!() const { - for (int i = 0; i < _AP_N; ++i) - if (pVal[i]) return false; - return true; - } - - template - INLINE ap_private<_AP_W, _AP_S || _AP_S1> And( - const ap_private<_AP_W, _AP_S1>& RHS) const { - return this->operator&(RHS); - } - template - INLINE ap_private Or(const ap_private<_AP_W, _AP_S1>& RHS) const { - return this->operator|(RHS); - } - template - INLINE ap_private Xor(const ap_private<_AP_W, _AP_S1>& RHS) const { - return this->operator^(RHS); - } - - INLINE ap_private Mul(const ap_private& RHS) const { - ap_private Result(*this); - Result *= RHS; - return Result; - } - - INLINE ap_private Add(const ap_private& RHS) const { - ap_private Result(0); - ap_private_ops::add(Result.get_pVal(), pVal, RHS.get_pVal(), _AP_N, _AP_N, - _AP_N, _AP_S, _AP_S); - Result.clearUnusedBits(); - return Result; - } - - INLINE ap_private Sub(const ap_private& RHS) const { - ap_private Result(0); - ap_private_ops::sub(Result.get_pVal(), pVal, RHS.get_pVal(), _AP_N, _AP_N, - _AP_N, _AP_S, _AP_S); - Result.clearUnusedBits(); - return Result; - } - - /// Arithmetic right-shift this ap_private by shiftAmt. - /// @brief Arithmetic right-shift function. - INLINE ap_private ashr(uint32_t shiftAmt) const { - assert(shiftAmt <= BitWidth && "Invalid shift amount, too big"); - // Handle a degenerate case - if (shiftAmt == 0) return ap_private(*this); - - // If all the bits were shifted out, the result is, technically, undefined. - // We return -1 if it was negative, 0 otherwise. We check this early to - // avoid - // issues in the algorithm below. - if (shiftAmt == BitWidth) { - if (isNegative()) - return ap_private(-1); - else - return ap_private(0); - } - - // Create some space for the result. - ap_private Retval(0); - uint64_t* val = Retval.get_pVal(); - - // Compute some values needed by the following shift algorithms - uint32_t wordShift = - shiftAmt % APINT_BITS_PER_WORD; // bits to shift per word - uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; // word offset for shift - uint32_t breakWord = _AP_N - 1 - offset; // last word affected - uint32_t bitsInWord = whichBit(BitWidth); // how many bits in last word? - if (bitsInWord == 0) bitsInWord = APINT_BITS_PER_WORD; - - // If we are shifting whole words, just move whole words - if (wordShift == 0) { - // Move the words containing significant bits - for (uint32_t i = 0; i <= breakWord; ++i) - val[i] = pVal[i + offset]; // move whole word - - // Adjust the top significant word for sign bit fill, if negative - if (isNegative()) - if (bitsInWord < APINT_BITS_PER_WORD) - val[breakWord] |= ~0ULL << (bitsInWord); // set high bits - } else { - // Shift the low order words - for (uint32_t i = 0; i < breakWord; ++i) { - // This combines the shifted corresponding word with the low bits from - // the next word (shifted into this word's high bits). - val[i] = ((pVal[i + offset]) >> (wordShift)); - val[i] |= ((pVal[i + offset + 1]) << (APINT_BITS_PER_WORD - wordShift)); - } - - // Shift the break word. In this case there are no bits from the next word - // to include in this word. - val[breakWord] = (pVal[breakWord + offset]) >> (wordShift); - - // Deal with sign extenstion in the break word, and possibly the word - // before - // it. 
- if (isNegative()) { - if (wordShift > bitsInWord) { - if (breakWord > 0) - val[breakWord - 1] |= - ~0ULL << (APINT_BITS_PER_WORD - (wordShift - bitsInWord)); - val[breakWord] |= ~0ULL; - } else - val[breakWord] |= (~0ULL << (bitsInWord - wordShift)); - } - } - - // Remaining words are 0 or -1, just assign them. - uint64_t fillValue = (isNegative() ? ~0ULL : 0); - for (int i = breakWord + 1; i < _AP_N; ++i) val[i] = fillValue; - Retval.clearUnusedBits(); - return Retval; - } - - /// Logical right-shift this ap_private by shiftAmt. - /// @brief Logical right-shift function. - INLINE ap_private lshr(uint32_t shiftAmt) const { - // If all the bits were shifted out, the result is 0. This avoids issues - // with shifting by the size of the integer type, which produces undefined - // results. We define these "undefined results" to always be 0. - if (shiftAmt == BitWidth) return ap_private(0); - - // If none of the bits are shifted out, the result is *this. This avoids - // issues with shifting byt he size of the integer type, which produces - // undefined results in the code below. This is also an optimization. - if (shiftAmt == 0) return ap_private(*this); - - // Create some space for the result. - ap_private Retval(0); - uint64_t* val = Retval.get_pVal(); - - // If we are shifting less than a word, compute the shift with a simple - // carry - if (shiftAmt < APINT_BITS_PER_WORD) { - uint64_t carry = 0; - for (int i = _AP_N - 1; i >= 0; --i) { - val[i] = ((pVal[i]) >> (shiftAmt)) | carry; - carry = (pVal[i]) << (APINT_BITS_PER_WORD - shiftAmt); - } - Retval.clearUnusedBits(); - return Retval; - } - - // Compute some values needed by the remaining shift algorithms - uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD; - uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; - - // If we are shifting whole words, just move whole words - if (wordShift == 0) { - for (uint32_t i = 0; i < _AP_N - offset; ++i) val[i] = pVal[i + offset]; - for (uint32_t i = _AP_N - offset; i < _AP_N; i++) val[i] = 0; - Retval.clearUnusedBits(); - return Retval; - } - - // Shift the low order words - uint32_t breakWord = _AP_N - offset - 1; - for (uint32_t i = 0; i < breakWord; ++i) - val[i] = ((pVal[i + offset]) >> (wordShift)) | - ((pVal[i + offset + 1]) << (APINT_BITS_PER_WORD - wordShift)); - // Shift the break word. - val[breakWord] = (pVal[breakWord + offset]) >> (wordShift); - - // Remaining words are 0 - for (int i = breakWord + 1; i < _AP_N; ++i) val[i] = 0; - Retval.clearUnusedBits(); - return Retval; - } - - /// Left-shift this ap_private by shiftAmt. - /// @brief Left-shift function. - INLINE ap_private shl(uint32_t shiftAmt) const { - assert(shiftAmt <= BitWidth && "Invalid shift amount, too big"); - // If all the bits were shifted out, the result is 0. This avoids issues - // with shifting by the size of the integer type, which produces undefined - // results. We define these "undefined results" to always be 0. - if (shiftAmt == BitWidth) return ap_private(0); - - // If none of the bits are shifted out, the result is *this. This avoids a - // lshr by the words size in the loop below which can produce incorrect - // results. It also avoids the expensive computation below for a common - // case. - if (shiftAmt == 0) return ap_private(*this); - - // Create some space for the result. 
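Both ashr() above and the lshr()/shl() pair around this point decompose a shift into a whole-word offset plus a sub-word wordShift, stitching each result word together from two neighbours. The stitching for a fixed two-word value, as a sketch (U128 and lshr2 are invented names; amt is assumed in [0, 128)):

#include <cassert>
#include <cstdint>

// Logical right shift of a 128-bit value stored as two 64-bit words
// (lo = word 0, hi = word 1), mirroring the offset/wordShift split above.
struct U128 { uint64_t lo, hi; };

static U128 lshr2(U128 x, uint32_t amt) {
    if (amt == 0) return x;                      // avoid the UB of >> 64
    if (amt >= 64) {                             // whole-word move only
        return U128{ x.hi >> (amt - 64), 0 };
    }
    // Sub-word shift: the low word takes carry bits from the high word.
    return U128{ (x.lo >> amt) | (x.hi << (64 - amt)), x.hi >> amt };
}

int main() {
    U128 x{ 2, 1 };                              // value 2^64 + 2
    U128 a = lshr2(x, 1);                        // -> 2^63 + 1
    assert(a.hi == 0 && a.lo == 0x8000000000000001ULL);
    U128 b = lshr2(x, 64);                       // whole-word move -> 1
    assert(b.hi == 0 && b.lo == 1);
}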
- ap_private Retval(0); - uint64_t* val = Retval.get_pVal(); - // If we are shifting less than a word, do it the easy way - if (shiftAmt < APINT_BITS_PER_WORD) { - uint64_t carry = 0; - for (int i = 0; i < _AP_N; i++) { - val[i] = ((pVal[i]) << (shiftAmt)) | carry; - carry = (pVal[i]) >> (APINT_BITS_PER_WORD - shiftAmt); - } - Retval.clearUnusedBits(); - return Retval; - } - - // Compute some values needed by the remaining shift algorithms - uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD; - uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; - - // If we are shifting whole words, just move whole words - if (wordShift == 0) { - for (uint32_t i = 0; i < offset; i++) val[i] = 0; - for (int i = offset; i < _AP_N; i++) val[i] = pVal[i - offset]; - Retval.clearUnusedBits(); - return Retval; - } - - // Copy whole words from this to Result. - uint32_t i = _AP_N - 1; - for (; i > offset; --i) - val[i] = (pVal[i - offset]) << (wordShift) | - (pVal[i - offset - 1]) >> (APINT_BITS_PER_WORD - wordShift); - val[offset] = (pVal[0]) << (wordShift); - for (i = 0; i < offset; ++i) val[i] = 0; - Retval.clearUnusedBits(); - return Retval; - } - - INLINE ap_private rotl(uint32_t rotateAmt) const { - if (rotateAmt == 0) return ap_private(*this); - // Don't get too fancy, just use existing shift/or facilities - ap_private hi(*this); - ap_private lo(*this); - hi.shl(rotateAmt); - lo.lshr(BitWidth - rotateAmt); - return hi | lo; - } - - INLINE ap_private rotr(uint32_t rotateAmt) const { - if (rotateAmt == 0) return ap_private(*this); - // Don't get too fancy, just use existing shift/or facilities - ap_private hi(*this); - ap_private lo(*this); - lo.lshr(rotateAmt); - hi.shl(BitWidth - rotateAmt); - return hi | lo; - } - - /// Perform an unsigned divide operation on this ap_private by RHS. Both this - /// and - /// RHS are treated as unsigned quantities for purposes of this division. - /// @returns a new ap_private value containing the division result - /// @brief Unsigned division operation. - INLINE ap_private udiv(const ap_private& RHS) const { - // Get some facts about the LHS and RHS number of bits and words - uint32_t rhsBits = RHS.getActiveBits(); - uint32_t rhsWords = !rhsBits ? 0 : (whichWord(rhsBits - 1) + 1); - assert(rhsWords && "Divided by zero???"); - uint32_t lhsBits = this->getActiveBits(); - uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); - - // Deal with some degenerate cases - if (!lhsWords) - // 0 / X ===> 0 - return ap_private(0); - else if (lhsWords < rhsWords || this->ult(RHS)) { - // X / Y ===> 0, iff X < Y - return ap_private(0); - } else if (*this == RHS) { - // X / X ===> 1 - return ap_private(1); - } else if (lhsWords == 1 && rhsWords == 1) { - // All high words are zero, just use native divide - return ap_private(this->pVal[0] / RHS.get_pVal(0)); - } - - // We have to compute it the hard way. Invoke the Knuth divide algorithm. - ap_private Quotient(0); // to hold result. - ap_private_ops::divide(*this, lhsWords, RHS, rhsWords, &Quotient, - (ap_private*)0); - return Quotient; - } - - /// Signed divide this ap_private by ap_private RHS. - /// @brief Signed division function for ap_private. - INLINE ap_private sdiv(const ap_private& RHS) const { - if (isNegative()) - if (RHS.isNegative()) - return (-(*this)).udiv(-RHS); - else - return -((-(*this)).udiv(RHS)); - else if (RHS.isNegative()) - return -(this->udiv((ap_private)(-RHS))); - return this->udiv(RHS); - } - - /// Perform an unsigned remainder operation on this ap_private with RHS being - /// the - /// divisor. 
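udiv()/sdiv() above reduce signed division to unsigned division on magnitudes, restoring the sign afterwards, which gives C-style truncation toward zero. The same reduction spelled out on 64-bit integers (sdiv_via_udiv is an invented name; the INT64_MIN / -1 overflow corner is ignored):

#include <cassert>
#include <cstdint>

// Signed division via an unsigned divider, as sdiv() does above:
// divide magnitudes, negate the quotient when the signs differ.
static int64_t sdiv_via_udiv(int64_t a, int64_t b) {
    uint64_t ua = a < 0 ? 0 - static_cast<uint64_t>(a) : static_cast<uint64_t>(a);
    uint64_t ub = b < 0 ? 0 - static_cast<uint64_t>(b) : static_cast<uint64_t>(b);
    uint64_t q = ua / ub;                        // the unsigned core (udiv)
    return ((a < 0) != (b < 0)) ? -static_cast<int64_t>(q)
                                : static_cast<int64_t>(q);
}

int main() {
    assert(sdiv_via_udiv(7, 2) == 3);
    assert(sdiv_via_udiv(-7, 2) == -3);   // truncation toward zero, not floor
    assert(sdiv_via_udiv(7, -2) == -3);
    assert(sdiv_via_udiv(-7, -2) == 3);
}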
Both this and RHS are treated as unsigned quantities for purposes - /// of this operation. Note that this is a true remainder operation and not - /// a modulo operation because the sign follows the sign of the dividend - /// which is *this. - /// @returns a new ap_private value containing the remainder result - /// @brief Unsigned remainder operation. - INLINE ap_private urem(const ap_private& RHS) const { - // Get some facts about the LHS - uint32_t lhsBits = getActiveBits(); - uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); - - // Get some facts about the RHS - uint32_t rhsBits = RHS.getActiveBits(); - uint32_t rhsWords = !rhsBits ? 0 : (whichWord(rhsBits - 1) + 1); - assert(rhsWords && "Performing remainder operation by zero ???"); - - // Check the degenerate cases - if (lhsWords == 0) { - // 0 % Y ===> 0 - return ap_private(0); - } else if (lhsWords < rhsWords || this->ult(RHS)) { - // X % Y ===> X, iff X < Y - return *this; - } else if (*this == RHS) { - // X % X == 0; - return ap_private(0); - } else if (lhsWords == 1) { - // All high words are zero, just use native remainder - return ap_private(pVal[0] % RHS.get_pVal(0)); - } - - // We have to compute it the hard way. Invoke the Knuth divide algorithm. - ap_private Remainder(0); - ap_private_ops::divide(*this, lhsWords, RHS, rhsWords, (ap_private*)(0), - &Remainder); - return Remainder; - } - - INLINE ap_private urem(uint64_t RHS) const { - // Get some facts about the LHS - uint32_t lhsBits = getActiveBits(); - uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); - // Get some facts about the RHS - uint32_t rhsWords = 1; //! rhsBits ? 0 : (ap_private<_AP_W, - //! _AP_S>::whichWord(rhsBits - 1) + 1); - assert(rhsWords && "Performing remainder operation by zero ???"); - // Check the degenerate cases - if (lhsWords == 0) { - // 0 % Y ===> 0 - return ap_private(0); - } else if (lhsWords < rhsWords || this->ult(RHS)) { - // X % Y ===> X, iff X < Y - return *this; - } else if (*this == RHS) { - // X % X == 0; - return ap_private(0); - } else if (lhsWords == 1) { - // All high words are zero, just use native remainder - return ap_private(pVal[0] % RHS); - } - - // We have to compute it the hard way. Invoke the Knuth divide algorithm. - ap_private Remainder(0); - divide(*this, lhsWords, RHS, (ap_private*)(0), &Remainder); - return Remainder; - } - - /// Signed remainder operation on ap_private. - /// @brief Function for signed remainder operation. - INLINE ap_private srem(const ap_private& RHS) const { - if (isNegative()) { - ap_private lhs = -(*this); - if (RHS.isNegative()) { - ap_private rhs = -RHS; - return -(lhs.urem(rhs)); - } else - return -(lhs.urem(RHS)); - } else if (RHS.isNegative()) { - ap_private rhs = -RHS; - return this->urem(rhs); - } - return this->urem(RHS); - } - - /// Signed remainder operation on ap_private. - /// @brief Function for signed remainder operation. - INLINE ap_private srem(int64_t RHS) const { - if (isNegative()) - if (RHS < 0) - return -((-(*this)).urem(-RHS)); - else - return -((-(*this)).urem(RHS)); - else if (RHS < 0) - return this->urem(-RHS); - return this->urem(RHS); - } - - /// Compares this ap_private with RHS for the validity of the equality - /// relationship. - /// @returns true if *this == Val - /// @brief Equality comparison. - template - INLINE bool eq(const ap_private<_AP_W, _AP_S1>& RHS) const { - return (*this) == RHS; - } - - /// Compares this ap_private with RHS for the validity of the inequality - /// relationship. 
- /// @returns true if *this != Val - /// @brief Inequality comparison - template - INLINE bool ne(const ap_private<_AP_W, _AP_S1>& RHS) const { - return !((*this) == RHS); - } - - /// Regards both *this and RHS as unsigned quantities and compares them for - /// the validity of the less-than relationship. - /// @returns true if *this < RHS when both are considered unsigned. - /// @brief Unsigned less than comparison - template - INLINE bool ult(const ap_private<_AP_W, _AP_S1>& RHS) const { - // Get active bit length of both operands - uint32_t n1 = getActiveBits(); - uint32_t n2 = RHS.getActiveBits(); - - // If magnitude of LHS is less than RHS, return true. - if (n1 < n2) return true; - - // If magnitude of RHS is greather than LHS, return false. - if (n2 < n1) return false; - - // If they bot fit in a word, just compare the low order word - if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD) - return pVal[0] < RHS.get_pVal(0); - - // Otherwise, compare all words - uint32_t topWord = whichWord(AESL_std::max(n1, n2) - 1); - for (int i = topWord; i >= 0; --i) { - if (pVal[i] > RHS.get_pVal(i)) return false; - if (pVal[i] < RHS.get_pVal(i)) return true; - } - return false; - } - - INLINE bool ult(uint64_t RHS) const { - // Get active bit length of both operands - uint32_t n1 = getActiveBits(); - uint32_t n2 = - 64 - ap_private_ops::CountLeadingZeros_64(RHS); // RHS.getActiveBits(); - - // If magnitude of LHS is less than RHS, return true. - if (n1 < n2) return true; - - // If magnitude of RHS is greather than LHS, return false. - if (n2 < n1) return false; - - // If they bot fit in a word, just compare the low order word - if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD) - return pVal[0] < RHS; - assert(0); - } - - template - INLINE bool slt(const ap_private<_AP_W, _AP_S1>& RHS) const { - ap_private lhs(*this); - ap_private<_AP_W, _AP_S1> rhs(RHS); - bool lhsNeg = isNegative(); - bool rhsNeg = rhs.isNegative(); - if (lhsNeg) { - // Sign bit is set so perform two's complement to make it positive - lhs.flip(); - lhs++; - } - if (rhsNeg) { - // Sign bit is set so perform two's complement to make it positive - rhs.flip(); - rhs++; - } - - // Now we have unsigned values to compare so do the comparison if necessary - // based on the negativeness of the values. - if (lhsNeg) - if (rhsNeg) - return lhs.ugt(rhs); - else - return true; - else if (rhsNeg) - return false; - else - return lhs.ult(rhs); - } - - /// Regards both *this and RHS as unsigned quantities and compares them for - /// validity of the less-or-equal relationship. - /// @returns true if *this <= RHS when both are considered unsigned. - /// @brief Unsigned less or equal comparison - template - INLINE bool ule(const ap_private<_AP_W, _AP_S1>& RHS) const { - return ult(RHS) || eq(RHS); - } - - /// Regards both *this and RHS as signed quantities and compares them for - /// validity of the less-or-equal relationship. - /// @returns true if *this <= RHS when both are considered signed. - /// @brief Signed less or equal comparison - template - INLINE bool sle(const ap_private<_AP_W, _AP_S1>& RHS) const { - return slt(RHS) || eq(RHS); - } - - /// Regards both *this and RHS as unsigned quantities and compares them for - /// the validity of the greater-than relationship. - /// @returns true if *this > RHS when both are considered unsigned. 
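slt() above handles the four sign combinations explicitly, flipping and incrementing negative operands into magnitudes so the unsigned comparators can be reused. For a W-bit value kept in the low bits of a uint64_t, the equivalent sign-split shortcut looks like this (a sketch with invented names, assuming operands are already masked to W bits; it orders identically to the flip-and-increment approach above):

#include <cassert>
#include <cstdint>

template <unsigned W>
static bool slt_w(uint64_t a, uint64_t b) {
    static_assert(W >= 1 && W <= 64, "single-word widths only");
    bool an = (a >> (W - 1)) & 1;             // sign bits
    bool bn = (b >> (W - 1)) & 1;
    if (an != bn) return an;                  // negative < non-negative
    return a < b;   // same sign: two's-complement order == unsigned order
}

int main() {
    // 4-bit examples: 0b1000 = -8, 0b1111 = -1, 0b0111 = +7.
    assert(slt_w<4>(0b1000, 0b1111));         // -8 < -1
    assert(slt_w<4>(0b1111, 0b0000));         // -1 <  0
    assert(!slt_w<4>(0b0111, 0b0000));        //  7 <  0 is false
}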
- /// @brief Unsigned greather than comparison - template - INLINE bool ugt(const ap_private<_AP_W, _AP_S1>& RHS) const { - return !ult(RHS) && !eq(RHS); - } - - /// Regards both *this and RHS as signed quantities and compares them for - /// the validity of the greater-than relationship. - /// @returns true if *this > RHS when both are considered signed. - /// @brief Signed greather than comparison - template - INLINE bool sgt(const ap_private<_AP_W, _AP_S1>& RHS) const { - return !slt(RHS) && !eq(RHS); - } - - /// Regards both *this and RHS as unsigned quantities and compares them for - /// validity of the greater-or-equal relationship. - /// @returns true if *this >= RHS when both are considered unsigned. - /// @brief Unsigned greater or equal comparison - template - INLINE bool uge(const ap_private<_AP_W, _AP_S>& RHS) const { - return !ult(RHS); - } - - /// Regards both *this and RHS as signed quantities and compares them for - /// validity of the greater-or-equal relationship. - /// @returns true if *this >= RHS when both are considered signed. - /// @brief Signed greather or equal comparison - template - INLINE bool sge(const ap_private<_AP_W, _AP_S1>& RHS) const { - return !slt(RHS); - } - - // Sign extend to a new width. - template - INLINE void cpSext(const ap_private<_AP_W1, _AP_S1>& that) { - assert(_AP_W1 < BitWidth && "Invalid ap_private SignExtend request"); - assert(_AP_W1 <= MAX_INT_BITS && "Too many bits"); - // If the sign bit isn't set, this is the same as zext. - if (!that.isNegative()) { - cpZext(that); - return; - } - - // The sign bit is set. First, get some facts - enum { wordBits = _AP_W1 % APINT_BITS_PER_WORD }; - const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; - // Mask the high order word appropriately - if (_AP_N1 == _AP_N) { - enum { newWordBits = _AP_W % APINT_BITS_PER_WORD }; - // The extension is contained to the wordsBefore-1th word. - static const uint64_t mask = wordBits ? (~0ULL << (wordBits)) : 0ULL; - for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i); - pVal[_AP_N - 1] |= mask; - return; - } - - enum { newWordBits = _AP_W % APINT_BITS_PER_WORD }; - // The extension is contained to the wordsBefore-1th word. - static const uint64_t mask = wordBits ? (~0ULL << (wordBits)) : 0ULL; - int i; - for (i = 0; i < _AP_N1; ++i) pVal[i] = that.get_pVal(i); - pVal[i - 1] |= mask; - for (; i < _AP_N - 1; i++) pVal[i] = ~0ULL; - pVal[i] = ~0ULL; - clearUnusedBits(); - return; - } - - // Zero extend to a new width. - template - INLINE void cpZext(const ap_private<_AP_W1, _AP_S1>& that) { - assert(_AP_W1 < BitWidth && "Invalid ap_private ZeroExtend request"); - assert(_AP_W1 <= MAX_INT_BITS && "Too many bits"); - const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; - int i = 0; - for (; i < _AP_N1; ++i) pVal[i] = that.get_pVal(i); - for (; i < _AP_N; ++i) pVal[i] = 0; - clearUnusedBits(); - } - - template - INLINE void cpZextOrTrunc(const ap_private<_AP_W1, _AP_S1>& that) { - if (BitWidth > _AP_W1) - cpZext(that); - else { - for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i); - clearUnusedBits(); - } - } - - template - INLINE void cpSextOrTrunc(const ap_private<_AP_W1, _AP_S1>& that) { - if (BitWidth > _AP_W1) - cpSext(that); - else { - for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i); - clearUnusedBits(); - } - } - - /// @} - /// @name Value Characterization Functions - /// @{ - - /// @returns the total number of bits. 
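cpSext() above ORs a precomputed mask into the word that held the old sign bit and fills every higher word with ~0ULL (cpZext just zero-fills). The same fill on a raw little-endian word array, as a sketch with the invented name sext_words:

#include <cassert>
#include <cstdint>

// Sign-extend a from_bits-wide value, stored little-endian in 64-bit
// words, in place across n words. Assumes 0 < from_bits <= 64 * n.
static void sext_words(uint64_t* w, unsigned n, unsigned from_bits) {
    unsigned top_word = (from_bits - 1) / 64;
    unsigned used = from_bits - 64 * top_word;     // bits used in top word
    bool neg = (w[top_word] >> (used - 1)) & 1;    // old sign bit
    if (!neg) {
        for (unsigned i = top_word + 1; i < n; ++i) w[i] = 0;  // zext case
        return;
    }
    if (used < 64) w[top_word] |= ~0ULL << used;   // mask the boundary word
    for (unsigned i = top_word + 1; i < n; ++i) w[i] = ~0ULL;  // fill with -1
}

int main() {
    uint64_t w[2] = { 0x80, 0 };     // 8-bit value 0x80 == -128
    sext_words(w, 2, 8);
    assert(w[0] == 0xFFFFFFFFFFFFFF80ULL && w[1] == ~0ULL);
}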
- INLINE uint32_t getBitWidth() const { return BitWidth; } - - /// Here one word's bitwidth equals to that of uint64_t. - /// @returns the number of words to hold the integer value of this ap_private. - /// @brief Get the number of words. - INLINE uint32_t getNumWords() const { - return (BitWidth + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD; - } - - /// This function returns the number of active bits which is defined as the - /// bit width minus the number of leading zeros. This is used in several - /// computations to see how "wide" the value is. - /// @brief Compute the number of active bits in the value - INLINE uint32_t getActiveBits() const { - uint32_t bits = BitWidth - countLeadingZeros(); - return bits ? bits : 1; - } - - /// This method attempts to return the value of this ap_private as a zero - /// extended - /// uint64_t. The bitwidth must be <= 64 or the value must fit within a - /// uint64_t. Otherwise an assertion will result. - /// @brief Get zero extended value - INLINE uint64_t getZExtValue() const { - assert(getActiveBits() <= 64 && "Too many bits for uint64_t"); - return *pVal; - } - - /// This method attempts to return the value of this ap_private as a sign - /// extended - /// int64_t. The bit width must be <= 64 or the value must fit within an - /// int64_t. Otherwise an assertion will result. - /// @brief Get sign extended value - INLINE int64_t getSExtValue() const { - assert(getActiveBits() <= 64 && "Too many bits for int64_t"); - return int64_t(pVal[0]); - } - - /// This method determines how many bits are required to hold the ap_private - /// equivalent of the string given by \p str of length \p slen. - /// @brief Get bits required for string value. - INLINE static uint32_t getBitsNeeded(const char* str, uint32_t slen, - uint8_t radix) { - assert(str != 0 && "Invalid value string"); - assert(slen > 0 && "Invalid string length"); - - // Each computation below needs to know if its negative - uint32_t isNegative = str[0] == '-'; - if (isNegative) { - slen--; - str++; - } - // For radixes of power-of-two values, the bits required is accurately and - // easily computed - if (radix == 2) return slen + isNegative; - if (radix == 8) return slen * 3 + isNegative; - if (radix == 16) return slen * 4 + isNegative; - - // Otherwise it must be radix == 10, the hard case - assert(radix == 10 && "Invalid radix"); - - // Convert to the actual binary value. - // ap_private<_AP_W, _AP_S> tmp(sufficient, str, slen, radix); - - // Compute how many bits are required. - // return isNegative + tmp.logBase2() + 1; - return isNegative + slen * 4; - } - - /// countLeadingZeros - This function is an ap_private version of the - /// countLeadingZeros_{32,64} functions in MathExtras.h. It counts the number - /// of zeros from the most significant bit to the first one bit. - /// @returns BitWidth if the value is zero. - /// @returns the number of zeros from the most significant bit to the first - /// one bits. - INLINE uint32_t countLeadingZeros() const { - enum { - msw_bits = (BitWidth % APINT_BITS_PER_WORD) - ? 
(BitWidth % APINT_BITS_PER_WORD) - : APINT_BITS_PER_WORD, - excessBits = APINT_BITS_PER_WORD - msw_bits - }; - uint32_t Count = ap_private_ops::CountLeadingZeros_64(pVal[_AP_N - 1]); - if (Count >= excessBits) Count -= excessBits; - if (!pVal[_AP_N - 1]) { - for (int i = _AP_N - 1; i; --i) { - if (!pVal[i - 1]) - Count += APINT_BITS_PER_WORD; - else { - Count += ap_private_ops::CountLeadingZeros_64(pVal[i - 1]); - break; - } - } - } - return Count; - } - - /// countLeadingOnes - This function counts the number of contiguous 1 bits - /// in the high order bits. The count stops when the first 0 bit is reached. - /// @returns 0 if the high order bit is not set - /// @returns the number of 1 bits from the most significant to the least - /// @brief Count the number of leading one bits. - INLINE uint32_t countLeadingOnes() const { - if (isSingleWord()) - return countLeadingOnes_64(get_VAL(), APINT_BITS_PER_WORD - BitWidth); - - uint32_t highWordBits = BitWidth % APINT_BITS_PER_WORD; - uint32_t shift = - (highWordBits == 0 ? 0 : APINT_BITS_PER_WORD - highWordBits); - int i = _AP_N - 1; - uint32_t Count = countLeadingOnes_64(get_pVal(i), shift); - if (Count == highWordBits) { - for (i--; i >= 0; --i) { - if (get_pVal(i) == ~0ULL) - Count += APINT_BITS_PER_WORD; - else { - Count += countLeadingOnes_64(get_pVal(i), 0); - break; - } - } - } - return Count; - } - - /// countTrailingZeros - This function is an ap_private version of the - /// countTrailingZoers_{32,64} functions in MathExtras.h. It counts - /// the number of zeros from the least significant bit to the first set bit. - /// @returns BitWidth if the value is zero. - /// @returns the number of zeros from the least significant bit to the first - /// one bit. - /// @brief Count the number of trailing zero bits. - INLINE uint32_t countTrailingZeros() const { - uint32_t Count = 0; - uint32_t i = 0; - for (; i < _AP_N && get_pVal(i) == 0; ++i) Count += APINT_BITS_PER_WORD; - if (i < _AP_N) Count += ap_private_ops::CountTrailingZeros_64(get_pVal(i)); - return AESL_std::min(Count, BitWidth); - } - /// countPopulation - This function is an ap_private version of the - /// countPopulation_{32,64} functions in MathExtras.h. It counts the number - /// of 1 bits in the ap_private value. - /// @returns 0 if the value is zero. - /// @returns the number of set bits. - /// @brief Count the number of bits set. - INLINE uint32_t countPopulation() const { - uint32_t Count = 0; - for (int i = 0; i < _AP_N - 1; ++i) - Count += ap_private_ops::CountPopulation_64(pVal[i]); - Count += ap_private_ops::CountPopulation_64(pVal[_AP_N - 1] & mask); - return Count; - } - - /// @} - /// @name Conversion Functions - /// @ - - /// This is used internally to convert an ap_private to a string. - /// @brief Converts an ap_private to a std::string - INLINE std::string toString(uint8_t radix, bool wantSigned) const; - - /// Considers the ap_private to be unsigned and converts it into a string in - /// the - /// radix given. The radix can be 2, 8, 10 or 16. - /// @returns a character interpretation of the ap_private - /// @brief Convert unsigned ap_private to string representation. - INLINE std::string toStringUnsigned(uint8_t radix = 10) const { - return toString(radix, false); - } - - /// Considers the ap_private to be unsigned and converts it into a string in - /// the - /// radix given. The radix can be 2, 8, 10 or 16. - /// @returns a character interpretation of the ap_private - /// @brief Convert unsigned ap_private to string representation. 
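countLeadingZeros() above must correct for the unused high bits of the top word: a W-bit value stored in 64-bit words carries 64*ceil(W/64) - W excess zeros that must not be counted, and clearUnusedBits() guarantees those excess bits really are zero. A sketch of that correction (clz_w is an invented name; __builtin_clzll is the GCC/Clang builtin, undefined for zero, hence the guard):

#include <cassert>
#include <cstdint>

static unsigned clz64(uint64_t x) {           // define 64 for x == 0
    return x ? static_cast<unsigned>(__builtin_clzll(x)) : 64;
}

// Leading zeros of a W-bit value stored little-endian in n = ceil(W/64)
// words, excess bits of the top word assumed cleared.
static unsigned clz_w(const uint64_t* w, unsigned n, unsigned W) {
    unsigned msw_bits = W % 64 ? W % 64 : 64; // bits used in top word
    unsigned excess = 64 - msw_bits;
    unsigned count = clz64(w[n - 1]) - excess;
    if (w[n - 1] == 0) {                      // keep scanning downwards
        for (int i = static_cast<int>(n) - 2; i >= 0; --i) {
            if (w[i] == 0) { count += 64; continue; }
            count += clz64(w[i]);
            break;
        }
    }
    return count;                             // == W when the value is zero
}

int main() {
    uint64_t w[2] = { 0x1, 0x0 };             // 70-bit value 1
    assert(clz_w(w, 2, 70) == 69);
    uint64_t z[2] = { 0, 0 };
    assert(clz_w(z, 2, 70) == 70);
}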
- INLINE std::string toStringSigned(uint8_t radix = 10) const { - return toString(radix, true); - } - - /// @brief Converts this ap_private to a double value. - INLINE double roundToDouble(bool isSigned) const { - // Handle the simple case where the value is contained in one uint64_t. - if (isSingleWord() || getActiveBits() <= APINT_BITS_PER_WORD) { - uint64_t val = pVal[0]; - if (isSigned) { - int64_t sext = ((int64_t(val)) << (64 - BitWidth)) >> (64 - BitWidth); - return double(sext); - } else - return double(val); - } - - // Determine if the value is negative. - bool isNeg = isSigned ? (*this)[BitWidth - 1] : false; - - // Construct the absolute value if we're negative. - ap_private<_AP_W, _AP_S> Tmp(isNeg ? -(*this) : (*this)); - - // Figure out how many bits we're using. - uint32_t n = Tmp.getActiveBits(); - - // The exponent (without bias normalization) is just the number of bits - // we are using. Note that the sign bit is gone since we constructed the - // absolute value. - uint64_t exp = n; - - // Return infinity for exponent overflow - if (exp > 1023) { - if (!isSigned || !isNeg) - return std::numeric_limits::infinity(); - else - return -std::numeric_limits::infinity(); - } - exp += 1023; // Increment for 1023 bias - - // Number of bits in mantissa is 52. To obtain the mantissa value, we must - // extract the high 52 bits from the correct words in pVal. - uint64_t mantissa; - unsigned hiWord = whichWord(n - 1); - if (hiWord == 0) { - mantissa = Tmp.get_pVal(0); - if (n > 52) - (mantissa) >>= (n - 52); // shift down, we want the top 52 bits. - } else { - assert(hiWord > 0 && "High word is negative?"); - uint64_t hibits = (Tmp.get_pVal(hiWord)) - << (52 - n % APINT_BITS_PER_WORD); - uint64_t lobits = - (Tmp.get_pVal(hiWord - 1)) >> (11 + n % APINT_BITS_PER_WORD); - mantissa = hibits | lobits; - } - - // The leading bit of mantissa is implicit, so get rid of it. - uint64_t sign = isNeg ? (1ULL << (APINT_BITS_PER_WORD - 1)) : 0; - union { - double __D; - uint64_t __I; - } __T; - __T.__I = sign | ((exp) << 52) | mantissa; - return __T.__D; - } - - /// @brief Converts this unsigned ap_private to a double value. - INLINE double roundToDouble() const { return roundToDouble(false); } - - /// @brief Converts this signed ap_private to a double value. - INLINE double signedRoundToDouble() const { return roundToDouble(true); } - - /// The conversion does not do a translation from integer to double, it just - /// re-interprets the bits as a double. Note that it is valid to do this on - /// any bit width. Exactly 64 bits will be translated. - /// @brief Converts ap_private bits to a double - INLINE double bitsToDouble() const { - union { - uint64_t __I; - double __D; - } __T; - __T.__I = pVal[0]; - return __T.__D; - } - - /// The conversion does not do a translation from integer to float, it just - /// re-interprets the bits as a float. Note that it is valid to do this on - /// any bit width. Exactly 32 bits will be translated. - /// @brief Converts ap_private bits to a double - INLINE float bitsToFloat() const { - union { - uint32_t __I; - float __F; - } __T; - __T.__I = uint32_t(pVal[0]); - return __T.__F; - } - - /// The conversion does not do a translation from double to integer, it just - /// re-interprets the bits of the double. Note that it is valid to do this on - /// any bit width but bits from V may get truncated. - /// @brief Converts a double to ap_private bits. 
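roundToDouble() above assembles the IEEE-754 bit pattern by hand: a biased exponent derived from the active-bit count, the top bits of the value as the 52-bit mantissa, then a bit-for-bit reinterpretation (via a union there; std::memcpy, or std::bit_cast in C++20, is the well-defined spelling). A compressed sketch for values with at most 53 active bits, where no rounding is needed; the constants here account for dropping the implicit leading 1:

#include <cassert>
#include <cstdint>
#include <cstring>

// Build an IEEE-754 double from an unsigned integer. Assumes 1 <= v < 2^53.
static double u64_to_double_bits(uint64_t v) {
    unsigned n = 64 - static_cast<unsigned>(__builtin_clzll(v)); // active bits
    uint64_t exp = 1022 + n;                      // (n - 1) + 1023 bias
    uint64_t mantissa = (v << (53 - n)) & ((1ULL << 52) - 1);  // drop lead 1
    uint64_t bits = (exp << 52) | mantissa;       // sign bit stays 0
    double d;
    std::memcpy(&d, &bits, sizeof d);             // defined reinterpretation
    return d;
}

int main() {
    assert(u64_to_double_bits(1) == 1.0);
    assert(u64_to_double_bits(5) == 5.0);
    assert(u64_to_double_bits(1ULL << 40) == 1099511627776.0);
}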
- INLINE ap_private& doubleToBits(double __V) { - union { - uint64_t __I; - double __D; - } __T; - __T.__D = __V; - pVal[0] = __T.__I; - return *this; - } - - /// The conversion does not do a translation from float to integer, it just - /// re-interprets the bits of the float. Note that it is valid to do this on - /// any bit width but bits from V may get truncated. - /// @brief Converts a float to ap_private bits. - INLINE ap_private& floatToBits(float __V) { - union { - uint32_t __I; - float __F; - } __T; - __T.__F = __V; - pVal[0] = __T.__I; - } - - // Reduce operation - //----------------------------------------------------------- - INLINE bool and_reduce() const { return isMaxValue(); } - - INLINE bool nand_reduce() const { return isMinValue(); } - - INLINE bool or_reduce() const { return (bool)countPopulation(); } - - INLINE bool nor_reduce() const { return countPopulation() == 0; } - - INLINE bool xor_reduce() const { - unsigned int i = countPopulation(); - return (i % 2) ? true : false; - } - - INLINE bool xnor_reduce() const { - unsigned int i = countPopulation(); - return (i % 2) ? false : true; - } - INLINE std::string to_string(uint8_t radix = 16, bool sign = false) const { - return toString(radix, radix == 10 ? _AP_S : sign); - } -}; // End of class ap_private <_AP_W, _AP_S, false> - -namespace ap_private_ops { - -enum { APINT_BITS_PER_WORD = 64 }; -template -INLINE bool operator==(uint64_t V1, const ap_private<_AP_W, _AP_S>& V2) { - return V2 == V1; -} - -template -INLINE bool operator!=(uint64_t V1, const ap_private<_AP_W, _AP_S>& V2) { - return V2 != V1; -} - -template -INLINE bool get(const ap_private<_AP_W, _AP_S>& a) { - static const uint64_t mask = 1ULL << (index & 0x3f); - return ((mask & a.get_pVal((index) >> 6)) != 0); -} - -template -INLINE void set(ap_private<_AP_W, _AP_S>& a, - const ap_private& mark1 = 0, - const ap_private& mark2 = 0) { - enum { - APINT_BITS_PER_WORD = 64, - lsb_word = lsb_index / APINT_BITS_PER_WORD, - msb_word = msb_index / APINT_BITS_PER_WORD, - msb = msb_index % APINT_BITS_PER_WORD, - lsb = lsb_index % APINT_BITS_PER_WORD - }; - if (msb_word == lsb_word) { - const uint64_t mask = ~0ULL >> - (lsb) << (APINT_BITS_PER_WORD - msb + lsb - 1) >> - (APINT_BITS_PER_WORD - msb - 1); - // a.set_pVal(msb_word, a.get_pVal(msb_word) | mask); - a.get_pVal(msb_word) |= mask; - } else { - const uint64_t lsb_mask = ~0ULL >> (lsb) << (lsb); - const uint64_t msb_mask = ~0ULL << (APINT_BITS_PER_WORD - msb - 1) >> - (APINT_BITS_PER_WORD - msb - 1); - // a.set_pVal(lsb_word, a.get_pVal(lsb_word) | lsb_mask); - a.get_pVal(lsb_word) |= lsb_mask; - for (int i = lsb_word + 1; i < msb_word; i++) { - a.set_pVal(i, ~0ULL); - // a.get_pVal(i)=0; - } - // a.set_pVal(msb_word, a.get_pVal(msb_word) | msb_mask); - - a.get_pVal(msb_word) |= msb_mask; - } - a.clearUnusedBits(); -} - -template -INLINE void clear(ap_private<_AP_W, _AP_S>& a, - const ap_private& mark1 = 0, - const ap_private& mark2 = 0) { - enum { - APINT_BITS_PER_WORD = 64, - lsb_word = lsb_index / APINT_BITS_PER_WORD, - msb_word = msb_index / APINT_BITS_PER_WORD, - msb = msb_index % APINT_BITS_PER_WORD, - lsb = lsb_index % APINT_BITS_PER_WORD - }; - if (msb_word == lsb_word) { - const uint64_t mask = - ~(~0ULL >> (lsb) << (APINT_BITS_PER_WORD - msb + lsb - 1) >> - (APINT_BITS_PER_WORD - msb - 1)); - // a.set_pVal(msb_word, a.get_pVal(msb_word) & mask); - a.get_pVal(msb_word) &= mask; - } else { - const uint64_t lsb_mask = ~(~0ULL >> (lsb) << (lsb)); - const uint64_t msb_mask = ~(~0ULL << (APINT_BITS_PER_WORD - 
msb - 1) >> - (APINT_BITS_PER_WORD - msb - 1)); - // a.set_pVal(lsb_word, a.get_pVal(lsb_word) & lsb_mask); - a.get_pVal(lsb_word) &= lsb_mask; - for (int i = lsb_word + 1; i < msb_word; i++) { - // a.set_pVal(i, 0); - a.get_pVal(i) = 0; - } - // a.set_pVal(msb_word, a.get_pVal(msb_word) & msb_mask); - a.get_pVal(msb_word) &= msb_mask; - } - a.clearUnusedBits(); -} - -template -INLINE void set(ap_private<_AP_W, _AP_S>& a, - const ap_private& mark = 0) { - enum { APINT_BITS_PER_WORD = 64, word = index / APINT_BITS_PER_WORD }; - static const uint64_t mask = 1ULL << (index % APINT_BITS_PER_WORD); - // a.set_pVal(word, a.get_pVal(word) | mask); - a.get_pVal(word) |= mask; - a.clearUnusedBits(); -} - -template -INLINE void clear(ap_private<_AP_W, _AP_S>& a, - const ap_private& mark = 0) { - enum { APINT_BITS_PER_WORD = 64, word = index / APINT_BITS_PER_WORD }; - static const uint64_t mask = ~(1ULL << (index % APINT_BITS_PER_WORD)); - // a.set_pVal(word, a.get_pVal(word) & mask); - a.get_pVal(word) &= mask; - a.clearUnusedBits(); -} - -} // End of ap_private_ops namespace - -template -INLINE std::string ap_private<_AP_W, _AP_S, false>::toString( - uint8_t radix, bool wantSigned) const { - assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) && - "Radix should be 2, 8, 10, or 16!"); - static const char* digits[] = {"0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", "A", "B", "C", "D", "E", "F"}; - std::string result; - - if (radix != 10) { - // For the 2, 8 and 16 bit cases, we can just shift instead of divide - // because the number of bits per digit (1,3 and 4 respectively) divides - // equaly. We just shift until there value is zero. - - // First, check for a zero value and just short circuit the logic below. - if (*this == (uint64_t)(0)) - result = "0"; - else { - ap_private<_AP_W, false> tmp(*this); - size_t insert_at = 0; - bool leading_zero = true; - if (wantSigned && isNegative()) { - // They want to print the signed version and it is a negative value - // Flip the bits and add one to turn it into the equivalent positive - // value and put a '-' in the result. - tmp.flip(); - tmp++; - tmp.clearUnusedBitsToZero(); - result = "-"; - insert_at = 1; - leading_zero = false; - } - switch (radix) { - case 2: - result += "0b"; - break; - case 8: - result += "0o"; - break; - case 16: - result += "0x"; - break; - default: - assert("invalid radix" && 0); - } - insert_at += 2; - // Just shift tmp right for each digit width until it becomes zero - uint32_t shift = (radix == 16 ? 4 : (radix == 8 ? 3 : 1)); - uint64_t mask = radix - 1; - ap_private<_AP_W, false> zero(0); - unsigned bits = 0; - while (tmp.ne(zero)) { - uint64_t digit = tmp.get_VAL() & mask; - result.insert(insert_at, digits[digit]); - tmp = tmp.lshr(shift); - ++bits; - } - bits *= shift; - if (bits < _AP_W && leading_zero) result.insert(insert_at, digits[0]); - } - return result; - } - - ap_private<_AP_W, false> tmp(*this); - ap_private<_AP_W, false> divisor(radix); - ap_private<_AP_W, false> zero(0); - size_t insert_at = 0; - if (wantSigned && isNegative()) { - // They want to print the signed version and it is a negative value - // Flip the bits and add one to turn it into the equivalent positive - // value and put a '-' in the result. 
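ap_private_ops::set()/clear() above build the [msb:lsb] mask with a double-shift dance so that msb == 63 and lsb == 0 stay well defined. An equivalent construction, plus the writes it enables, as a sketch with invented names (bounds assumed 0 <= lsb <= msb <= 63):

#include <cassert>
#include <cstdint>

// Mask covering bits [msb:lsb] of one 64-bit word: msb - lsb + 1 ones,
// shifted up to lsb. Equivalent to the double-shift form above.
static uint64_t range_mask(unsigned msb, unsigned lsb) {
    uint64_t width_mask = ~0ULL >> (63 - (msb - lsb));
    return width_mask << lsb;
}

static void set_range(uint64_t& w, unsigned msb, unsigned lsb) {
    w |= range_mask(msb, lsb);
}

static void clear_range(uint64_t& w, unsigned msb, unsigned lsb) {
    w &= ~range_mask(msb, lsb);
}

int main() {
    assert(range_mask(7, 4) == 0xF0);
    uint64_t w = 0;
    set_range(w, 11, 4);
    assert(w == 0xFF0);
    clear_range(w, 7, 4);
    assert(w == 0xF00);
}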
- tmp.flip(); - tmp++; - tmp.clearUnusedBitsToZero(); - result = "-"; - insert_at = 1; - } - if (tmp == ap_private<_AP_W, false>(0)) - result = "0"; - else - while (tmp.ne(zero)) { - ap_private<_AP_W, false> APdigit(0); - ap_private<_AP_W, false> tmp2(0); - ap_private_ops::divide(tmp, tmp.getNumWords(), divisor, - divisor.getNumWords(), &tmp2, &APdigit); - uint64_t digit = APdigit.getZExtValue(); - assert(digit < radix && "divide failed"); - result.insert(insert_at, digits[digit]); - tmp = tmp2; - } - - return result; -} // End of ap_private<_AP_W, _AP_S, false>::toString() - -template -std::ostream &operator<<(std::ostream &os, const ap_private<_AP_W, _AP_S> &x) { - std::ios_base::fmtflags ff = std::cout.flags(); - if (ff & std::cout.hex) { - os << x.toString(16, false); // don't print sign - } else if (ff & std::cout.oct) { - os << x.toString(8, false); // don't print sign - } else { - os << x.toString(10, _AP_S); - } - return os; -} - -// ------------------------------------------------------------ // -// XXX moved here from ap_int_sim.h XXX // -// ------------------------------------------------------------ // - -/// Concatination reference. -/// Proxy class which allows concatination to be used as rvalue(for reading) and -/// lvalue(for writing) -// ---------------------------------------------------------------- -// template -// struct ap_concat_ref { -//#ifdef _MSC_VER -//#pragma warning(disable : 4521 4522) -//#endif -// enum { -// _AP_WR = _AP_W1 + _AP_W2, -// }; -// _AP_T1& mbv1; -// _AP_T2& mbv2; -// -// INLINE ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& -// ref) -// : mbv1(ref.mbv1), mbv2(ref.mbv2) {} -// -// INLINE ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {} -// -// template -// INLINE ap_concat_ref& operator=(const ap_private<_AP_W3, _AP_S3>& val) { -// ap_private<_AP_W1 + _AP_W2, false> vval(val); -// int W_ref1 = mbv1.length(); -// int W_ref2 = mbv2.length(); -// ap_private<_AP_W1, false> mask1(-1); -// mask1 >>= _AP_W1 - W_ref1; -// ap_private<_AP_W2, false> mask2(-1); -// mask2 >>= _AP_W2 - W_ref2; -// mbv1.set(ap_private<_AP_W1, false>((vval >> W_ref2) & mask1)); -// mbv2.set(ap_private<_AP_W2, false>(vval & mask2)); -// return *this; -// } -// -// INLINE ap_concat_ref& operator=(unsigned long long val) { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); -// return operator=(tmpVal); -// } -// -// template -// INLINE ap_concat_ref& operator=( -// const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); -// return operator=(tmpVal); -// } -// -// INLINE ap_concat_ref& operator=( -// const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); -// return operator=(tmpVal); -// } -// -// template -// INLINE ap_concat_ref& operator=(const _private_bit_ref<_AP_W3, _AP_S3>& -// val) { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); -// return operator=(tmpVal); -// } -// -// template -// INLINE ap_concat_ref& operator=(const _private_range_ref<_AP_W3, _AP_S3>& -// val) { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); -// return operator=(tmpVal); -// } -// -// template -// INLINE ap_concat_ref& operator=( -// const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) -// { -// return operator=((const ap_private<_AP_W3, false>)(val)); -// } -// -// template -// INLINE ap_concat_ref& operator=( -// const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& -// val) { -// return 
operator=(val.to_ap_private()); -// } -// -// template -// INLINE ap_concat_ref& operator=( -// const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { -// return operator=((unsigned long long)(bool)(val)); -// } -// -// INLINE operator ap_private<_AP_WR, false>() const { return get(); } -// -// INLINE operator unsigned long long() const { return get().to_uint64(); } -// -// template -// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, -// _private_range_ref<_AP_W3, _AP_S3> > -// operator,(const _private_range_ref<_AP_W3, _AP_S3> &a2) { -// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, -// _private_range_ref<_AP_W3, _AP_S3> >( -// *this, const_cast<_private_range_ref<_AP_W3, _AP_S3>&>(a2)); -// } -// -// template -// INLINE -// ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_private<_AP_W3, _AP_S3> -// > -// operator,(ap_private<_AP_W3, _AP_S3> &a2) { -// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, -// ap_private<_AP_W3, _AP_S3> >(*this, a2); -// } -// -// template -// INLINE -// ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_private<_AP_W3, _AP_S3> -// > -// operator,(const ap_private<_AP_W3, _AP_S3> &a2) { -// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, -// ap_private<_AP_W3, _AP_S3> >( -// *this, const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, 1, _private_bit_ref<_AP_W3, -// _AP_S3> > -// operator,(const _private_bit_ref<_AP_W3, _AP_S3> &a2) { -// return ap_concat_ref<_AP_WR, ap_concat_ref, 1, _private_bit_ref<_AP_W3, -// _AP_S3> >( -// *this, const_cast<_private_bit_ref<_AP_W3, _AP_S3>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, -// ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > -// operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { -// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, -// ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( -// *this, const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref< -// _AP_WR, ap_concat_ref, _AP_W3, -// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > -// operator,( -// const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) -// { -// return ap_concat_ref< -// _AP_WR, ap_concat_ref, _AP_W3, -// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( -// *this, -// const_cast< -// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, -// _AP_N3>&>(a2)); -// } -// -// template -// INLINE -// ap_concat_ref<_AP_WR, ap_concat_ref, 1, -// af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> -// > -// operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, -// _AP_N3> -// &a2) { -// return ap_concat_ref< -// _AP_WR, ap_concat_ref, 1, -// af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( -// *this, -// const_cast&>( -// a2)); -// } -// -// template -// INLINE ap_private operator&( -// const ap_private<_AP_W3, _AP_S3>& a2) { -// return get() & a2; -// } -// -// template -// INLINE ap_private operator|( -// const ap_private<_AP_W3, _AP_S3>& a2) { -// return get() | a2; -// } -// -// template -// INLINE ap_private operator^( -// const ap_private<_AP_W3, _AP_S3>& a2) { -// return ap_private(get() ^ a2); -// } -// -// INLINE const ap_private<_AP_WR, false> get() const { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal = -// ap_private<_AP_W1 + _AP_W2, false>(mbv1.get()); -// ap_private<_AP_W1 + _AP_W2, false> tmpVal2 = -// ap_private<_AP_W1 + _AP_W2, false>(mbv2.get()); -// int W_ref2 = mbv2.length(); -// tmpVal <<= 
W_ref2; -// tmpVal |= tmpVal2; -// return tmpVal; -// } -// -// INLINE const ap_private<_AP_WR, false> get() { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal = -// ap_private<_AP_W1 + _AP_W2, false>(mbv1.get()); -// ap_private<_AP_W1 + _AP_W2, false> tmpVal2 = -// ap_private<_AP_W1 + _AP_W2, false>(mbv2.get()); -// int W_ref2 = mbv2.length(); -// tmpVal <<= W_ref2; -// tmpVal |= tmpVal2; -// return tmpVal; -// } -// -// template -// INLINE void set(const ap_private<_AP_W3, false>& val) { -// ap_private<_AP_W1 + _AP_W2, false> vval(val); -// int W_ref1 = mbv1.length(); -// int W_ref2 = mbv2.length(); -// ap_private<_AP_W1, false> mask1(-1); -// mask1 >>= _AP_W1 - W_ref1; -// ap_private<_AP_W2, false> mask2(-1); -// mask2 >>= _AP_W2 - W_ref2; -// mbv1.set(ap_private<_AP_W1, false>((vval >> W_ref2) & mask1)); -// mbv2.set(ap_private<_AP_W2, false>(vval & mask2)); -// } -// -// INLINE int length() const { return mbv1.length() + mbv2.length(); } -// -// INLINE std::string to_string(uint8_t radix = 2) const { -// return get().to_string(radix); -// } -//}; // struct ap_concat_ref. - -/// Range(slice) reference -/// Proxy class, which allows part selection to be used as rvalue(for reading) -/// and lvalue(for writing) -//------------------------------------------------------------ -template -struct _private_range_ref { -#ifdef _MSC_VER -#pragma warning(disable : 4521 4522) -#endif - ap_private<_AP_W, _AP_S>& d_bv; - int l_index; - int h_index; - - public: - /// copy ctor. - INLINE _private_range_ref(const _private_range_ref<_AP_W, _AP_S>& ref) - : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} - - /// direct ctor. - INLINE _private_range_ref(ap_private<_AP_W, _AP_S>* bv, int h, int l) - : d_bv(*bv), l_index(l), h_index(h) { - _AP_WARNING(h < 0 || l < 0, - "Higher bound (%d) and lower bound (%d) cannot be " - "negative.", - h, l); - _AP_WARNING(h >= _AP_W || l >= _AP_W, - "Higher bound (%d) or lower bound (%d) out of range (%d).", h, l, - _AP_W); - } - - /// compound or assignment. - template - INLINE _private_range_ref<_AP_W, _AP_S>& operator|=( - const _private_range_ref<_AP_W2, _AP_S2>& ref) { - _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), - "Bitsize mismach for ap_private<>.range() &= " - "ap_private<>.range()."); - this->d_bv |= ref.d_bv; - return *this; - } - - /// compound or assignment with root type. - template - INLINE _private_range_ref<_AP_W, _AP_S>& operator|=( - const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { - _AP_WARNING((h_index - l_index + 1) != _AP_W2, - "Bitsize mismach for ap_private<>.range() |= _AP_ROOT_TYPE<>."); - this->d_bv |= ref.V; - return *this; - } - - /// compound and assignment. - template - INLINE _private_range_ref<_AP_W, _AP_S>& operator&=( - const _private_range_ref<_AP_W2, _AP_S2>& ref) { - _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), - "Bitsize mismach for ap_private<>.range() &= " - "ap_private<>.range()."); - this->d_bv &= ref.d_bv; - return *this; - }; - - /// compound and assignment with root type. - template - INLINE _private_range_ref<_AP_W, _AP_S>& operator&=( - const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { - _AP_WARNING((h_index - l_index + 1) != _AP_W2, - "Bitsize mismach for ap_private<>.range() &= _AP_ROOT_TYPE<>."); - this->d_bv &= ref.V; - return *this; - } - - /// compound xor assignment. 
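_private_range_ref above is the classic proxy pattern: it captures a reference to the underlying ap_private plus both bounds, so that v.range(h, l) works as both an rvalue (read) and an lvalue (write). A stripped-down single-word rendition of the idea (BitSlice is an invented name; reversed ranges and multi-word values are left out):

#include <cassert>
#include <cstdint>

// Minimal range proxy: reads extract the field, writes do a masked
// read-modify-write on the referenced word.
class BitSlice {
    uint64_t& w_;
    unsigned msb_, lsb_;
    uint64_t mask() const { return (~0ULL >> (63 - (msb_ - lsb_))) << lsb_; }
 public:
    BitSlice(uint64_t& w, unsigned msb, unsigned lsb)
        : w_(w), msb_(msb), lsb_(lsb) {}
    operator uint64_t() const { return (w_ & mask()) >> lsb_; }  // rvalue use
    BitSlice& operator=(uint64_t v) {                            // lvalue use
        w_ = (w_ & ~mask()) | ((v << lsb_) & mask());
        return *this;
    }
};

int main() {
    uint64_t w = 0;
    BitSlice(w, 15, 8) = 0xAB;          // write a byte-wide field
    assert(w == 0xAB00);
    assert(uint64_t(BitSlice(w, 15, 8)) == 0xAB);
    BitSlice(w, 15, 8) = 0x1CD;         // extra bits are masked off
    assert(w == 0xCD00);
}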
- template - INLINE _private_range_ref<_AP_W, _AP_S>& operator^=( - const _private_range_ref<_AP_W2, _AP_S2>& ref) { - _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), - "Bitsize mismach for ap_private<>.range() ^= " - "ap_private<>.range()."); - this->d_bv ^= ref.d_bv; - return *this; - }; - - /// compound xor assignment with root type. - template - INLINE _private_range_ref<_AP_W, _AP_S>& operator^=( - const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { - _AP_WARNING((h_index - l_index + 1) != _AP_W2, - "Bitsize mismach for ap_private<>.range() ^= _AP_ROOT_TYPE<>."); - this->d_bv ^= ref.V; - return *this; - } - - /// @name convertors. - // @{ - INLINE operator ap_private<_AP_W, false>() const { - ap_private<_AP_W, false> val(0); - if (h_index >= l_index) { - if (_AP_W > 64) { - val = d_bv; - ap_private<_AP_W, false> mask(-1); - mask >>= _AP_W - (h_index - l_index + 1); - val >>= l_index; - val &= mask; - } else { - const static uint64_t mask = (~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0)); - val = (d_bv >> l_index) & (mask >> (_AP_W - (h_index - l_index + 1))); - } - } else { - for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) - if ((d_bv)[j]) val.set(i); - } - return val; - } - - INLINE operator unsigned long long() const { return to_uint64(); } - // @} - - template - INLINE _private_range_ref& operator=(const ap_private<_AP_W2, _AP_S2>& val) { - ap_private<_AP_W, false> vval = ap_private<_AP_W, false>(val); - if (l_index > h_index) { - for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) - (vval)[i] ? d_bv.set(j) : d_bv.clear(j); - } else { - if (_AP_W > 64) { - ap_private<_AP_W, false> mask(-1); - if (l_index > 0) { - mask <<= l_index; - vval <<= l_index; - } - if (h_index < _AP_W - 1) { - ap_private<_AP_W, false> mask2(-1); - mask2 >>= _AP_W - h_index - 1; - mask &= mask2; - vval &= mask2; - } - mask.flip(); - d_bv &= mask; - d_bv |= vval; - } else { - unsigned shift = 64 - _AP_W; - uint64_t mask = ~0ULL >> (shift); - if (l_index > 0) { - vval = mask & vval << l_index; - mask = mask & mask << l_index; - } - if (h_index < _AP_W - 1) { - uint64_t mask2 = mask; - mask2 >>= (_AP_W - h_index - 1); - mask &= mask2; - vval &= mask2; - } - mask = ~mask; - d_bv &= mask; - d_bv |= vval; - } - } - return *this; - } // operator=(const ap_private<>&) - - INLINE _private_range_ref& operator=(unsigned long long val) { - const ap_private<_AP_W, _AP_S> vval = val; - return operator=(vval); - } - - template - INLINE _private_range_ref& operator=( - const _private_bit_ref<_AP_W2, _AP_S2>& val) { - return operator=((unsigned long long)(bool)val); - } - - template - INLINE _private_range_ref& operator=( - const _private_range_ref<_AP_W2, _AP_S2>& val) { - const ap_private<_AP_W, false> tmpVal(val); - return operator=(tmpVal); - } - -// template -// INLINE _private_range_ref& operator=( -// const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { -// const ap_private<_AP_W, false> tmpVal(val); -// return operator=(tmpVal); -// } - - // TODO from ap_int_base, ap_bit_ref and ap_range_ref. 
- - template - INLINE _private_range_ref& operator=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=(val.to_ap_int_base().V); - } - - template - INLINE _private_range_ref& operator=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=(val.operator ap_int_base<_AP_W2, false>().V); - } - - template - INLINE _private_range_ref& operator=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=((unsigned long long)(bool)val); - } - -// template -// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> > -// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> >( -// *this, const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, -// ap_private<_AP_W2, _AP_S2> > -// operator,(ap_private<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// INLINE -// ap_concat_ref<_AP_W, _private_range_ref, _AP_W, ap_private<_AP_W, _AP_S> > -// operator,(ap_private<_AP_W, _AP_S>& a2) { -// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W, -// ap_private<_AP_W, _AP_S> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, _private_range_ref, 1, -// _private_bit_ref<_AP_W2, _AP_S2> > -// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, _private_range_ref, 1, -// _private_bit_ref<_AP_W2, _AP_S2> >( -// *this, const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > -// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { -// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( -// *this, const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref< -// _AP_W, _private_range_ref, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,( -// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { -// return ap_concat_ref< -// _AP_W, _private_range_ref, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( -// *this, -// const_cast< -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); -// } -// -// template -// INLINE -// ap_concat_ref<_AP_W, _private_range_ref, 1, -// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> -// &a2) { -// return ap_concat_ref< -// _AP_W, _private_range_ref, 1, -// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( -// *this, -// const_cast&>( -// a2)); -// } - - template - INLINE bool operator==(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - ap_private<_AP_W, false> lhs = get(); - ap_private<_AP_W2, false> rhs = op2.get(); - return lhs == rhs; - } - - template - INLINE bool operator!=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - ap_private<_AP_W, false> lhs = get(); - ap_private<_AP_W2, false> rhs = op2.get(); - return lhs != rhs; - } - - template - INLINE bool operator>(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - ap_private<_AP_W, false> lhs = get(); - 
ap_private<_AP_W2, false> rhs = op2.get(); - return lhs > rhs; - } - - template - INLINE bool operator>=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - ap_private<_AP_W, false> lhs = get(); - ap_private<_AP_W2, false> rhs = op2.get(); - return lhs >= rhs; - } - - template - INLINE bool operator<(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - ap_private<_AP_W, false> lhs = get(); - ap_private<_AP_W2, false> rhs = op2.get(); - return lhs < rhs; - } - - template - INLINE bool operator<=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - ap_private<_AP_W, false> lhs = get(); - ap_private<_AP_W2, false> rhs = op2.get(); - return lhs <= rhs; - } - - template - INLINE void set(const ap_private<_AP_W2, false>& val) { - ap_private<_AP_W, _AP_S> vval = val; - if (l_index > h_index) { - for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) - (vval)[i] ? d_bv.set(j) : d_bv.clear(j); - } else { - if (_AP_W > 64) { - ap_private<_AP_W, _AP_S> mask(-1); - if (l_index > 0) { - ap_private<_AP_W, false> mask1(-1); - mask1 >>= _AP_W - l_index; - mask1.flip(); - mask = mask1; - // vval&=mask1; - vval <<= l_index; - } - if (h_index < _AP_W - 1) { - ap_private<_AP_W, false> mask2(-1); - mask2 <<= h_index + 1; - mask2.flip(); - mask &= mask2; - vval &= mask2; - } - mask.flip(); - d_bv &= mask; - d_bv |= vval; - } else { - uint64_t mask = ~0ULL >> (64 - _AP_W); - if (l_index > 0) { - uint64_t mask1 = mask; - mask1 = mask & (mask1 >> (_AP_W - l_index)); - vval = mask & (vval << l_index); - mask = ~mask1 & mask; - // vval&=mask1; - } - if (h_index < _AP_W - 1) { - uint64_t mask2 = ~0ULL >> (64 - _AP_W); - mask2 = mask & (mask2 << (h_index + 1)); - mask &= ~mask2; - vval &= ~mask2; - } - d_bv &= (~mask & (~0ULL >> (64 - _AP_W))); - d_bv |= vval; - } - } - } - - INLINE ap_private<_AP_W, false> get() const { - ap_private<_AP_W, false> val(0); - if (h_index < l_index) { - for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) - if ((d_bv)[j]) val.set(i); - } else { - val = d_bv; - val >>= l_index; - if (h_index < _AP_W - 1) { - if (_AP_W <= 64) { - const static uint64_t mask = - (~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0)); - val &= (mask >> (_AP_W - (h_index - l_index + 1))); - } else { - ap_private<_AP_W, false> mask(-1); - mask >>= _AP_W - (h_index - l_index + 1); - val &= mask; - } - } - } - return val; - } - - INLINE ap_private<_AP_W, false> get() { - ap_private<_AP_W, false> val(0); - if (h_index < l_index) { - for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) - if ((d_bv)[j]) val.set(i); - } else { - val = d_bv; - val >>= l_index; - if (h_index < _AP_W - 1) { - if (_AP_W <= 64) { - static const uint64_t mask = ~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0); - return val &= ((mask) >> (_AP_W - (h_index - l_index + 1))); - } else { - ap_private<_AP_W, false> mask(-1); - mask >>= _AP_W - (h_index - l_index + 1); - val &= mask; - } - } - } - return val; - } - - INLINE int length() const { - return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1; - } - - INLINE int to_int() const { - ap_private<_AP_W, false> val = get(); - return val.to_int(); - } - - INLINE unsigned int to_uint() const { - ap_private<_AP_W, false> val = get(); - return val.to_uint(); - } - - INLINE long to_long() const { - ap_private<_AP_W, false> val = get(); - return val.to_long(); - } - - INLINE unsigned long to_ulong() const { - ap_private<_AP_W, false> val = get(); - return val.to_ulong(); - } - - INLINE ap_slong to_int64() const { - ap_private<_AP_W, false> val = get(); - return val.to_int64(); - } - - INLINE ap_ulong to_uint64() const { - ap_private<_AP_W, false> val = get(); - return val.to_uint64(); - } - - INLINE std::string to_string(uint8_t radix = 2) const { - return get().to_string(radix); - } - - INLINE bool and_reduce() { - bool ret = true; - bool reverse = l_index > h_index; - unsigned low = reverse ? h_index : l_index; - unsigned high = reverse ? l_index : h_index; - for (unsigned i = low; i != high; ++i) ret &= d_bv[i]; - return ret; - } - - INLINE bool or_reduce() { - bool ret = false; - bool reverse = l_index > h_index; - unsigned low = reverse ? h_index : l_index; - unsigned high = reverse ? l_index : h_index; - for (unsigned i = low; i != high; ++i) ret |= d_bv[i]; - return ret; - } - - INLINE bool xor_reduce() { - bool ret = false; - bool reverse = l_index > h_index; - unsigned low = reverse ? h_index : l_index; - unsigned high = reverse ? l_index : h_index; - for (unsigned i = low; i != high; ++i) ret ^= d_bv[i]; - return ret; - } -}; // struct _private_range_ref. - -/// Bit reference -/// Proxy class, which allows bit selection to be used as rvalue(for reading) -/// and lvalue(for writing) -//-------------------------------------------------------------- -template -struct _private_bit_ref { -#ifdef _MSC_VER -#pragma warning(disable : 4521 4522) -#endif - ap_private<_AP_W, _AP_S>& d_bv; - int d_index; - - public: - // copy ctor. - INLINE _private_bit_ref(const _private_bit_ref<_AP_W, _AP_S>& ref) - : d_bv(ref.d_bv), d_index(ref.d_index) {} - - // director ctor. 
- INLINE _private_bit_ref(ap_private<_AP_W, _AP_S>& bv, int index = 0) - : d_bv(bv), d_index(index) { - _AP_WARNING(d_index < 0, "Index of bit vector (%d) cannot be negative.\n", - d_index); - _AP_WARNING(d_index >= _AP_W, - "Index of bit vector (%d) out of range (%d).\n", d_index, _AP_W); - } - - INLINE operator bool() const { return d_bv.get_bit(d_index); } - - INLINE bool to_bool() const { return operator bool(); } - - template - INLINE _private_bit_ref& operator=(const T& val) { - if (!!val) - d_bv.set(d_index); - else - d_bv.clear(d_index); - return *this; - } - -// template -// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2, ap_private<_AP_W2, -// _AP_S2> > -// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<1, _private_bit_ref, _AP_W2, ap_private<_AP_W2, -// _AP_S2> >( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), a2); -// } -// -// template -// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2, -// _private_range_ref<_AP_W2, -// _AP_S2> > -// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<1, _private_bit_ref, _AP_W2, -// _private_range_ref<_AP_W2, -// _AP_S2> >( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), -// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref<_AP_W2, -// _AP_S2> > operator,( -// const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<1, _private_bit_ref, 1, -// _private_bit_ref<_AP_W2, _AP_S2> >( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), -// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// INLINE ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref> -// operator,( -// const _private_bit_ref &a2) const { -// return ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref>( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), -// const_cast<_private_bit_ref&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > -// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { -// return ap_concat_ref<1, _private_bit_ref, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), -// const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref< -// 1, _private_bit_ref, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, -// _AP_N2> -// &a2) const { -// return ap_concat_ref< -// 1, _private_bit_ref, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), -// const_cast< -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, -// _AP_N2>&>(a2)); -// } -// -// template -// INLINE -// ap_concat_ref<1, _private_bit_ref, 1, -// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, -// _AP_N2> > -// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, -// _AP_N2> -// &a2) const { -// return ap_concat_ref<1, _private_bit_ref, 1, af_bit_ref<_AP_W2, -// _AP_I2, _AP_S2, -// _AP_Q2, _AP_O2, -// _AP_N2> >( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), -// const_cast&>( -// a2)); -// } - - template - INLINE bool operator==(const _private_bit_ref<_AP_W2, _AP_S2>& op) const { - return get() == op.get(); - } - - template - INLINE bool operator!=(const _private_bit_ref<_AP_W2, _AP_S2>& 
op) const { - return get() != op.get(); - } - - INLINE bool get() const { return operator bool(); } - - // template - // INLINE void set(const ap_private<_AP_W3, false>& val) { - // operator=(val); - // } - - // INLINE bool operator~() const { - // bool bit = (d_bv)[d_index]; - // return bit ? false : true; - // } - - INLINE int length() const { return 1; } - - // INLINE std::string to_string() const { - // bool val = get(); - // return val ? "1" : "0"; - // } - -}; // struct _private_bit_ref. - -// char a[100]; -// char* ptr = a; -// ap_int<2> n = 3; -// char* ptr2 = ptr + n*2; -// avoid ambiguous errors -#define OP_BIN_MIX_PTR(BIN_OP) \ - template \ - INLINE PTR_TYPE* operator BIN_OP(PTR_TYPE* i_op, \ - const ap_private<_AP_W, _AP_S>& op) { \ - typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ - return i_op BIN_OP op2; \ - } \ - template \ - INLINE PTR_TYPE* operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, \ - PTR_TYPE* i_op) { \ - typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ - return op2 BIN_OP i_op; \ - } - -OP_BIN_MIX_PTR(+) -OP_BIN_MIX_PTR(-) -#undef OP_BIN_MIX_PTR - -// float OP ap_int -// when ap_int's width > 64, then trunc ap_int to ap_int<64> -#define OP_BIN_MIX_FLOAT(BIN_OP, C_TYPE) \ - template \ - INLINE C_TYPE operator BIN_OP(C_TYPE i_op, \ - const ap_private<_AP_W, _AP_S>& op) { \ - typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ - return i_op BIN_OP op2; \ - } \ - template \ - INLINE C_TYPE operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, \ - C_TYPE i_op) { \ - typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ - return op2 BIN_OP i_op; \ - } - -#define OPS_MIX_FLOAT(C_TYPE) \ - OP_BIN_MIX_FLOAT(*, C_TYPE) \ - OP_BIN_MIX_FLOAT(/, C_TYPE) \ - OP_BIN_MIX_FLOAT(+, C_TYPE) \ - OP_BIN_MIX_FLOAT(-, C_TYPE) - -OPS_MIX_FLOAT(float) -OPS_MIX_FLOAT(double) -#undef OP_BIN_MIX_FLOAT -#undef OPS_MIX_FLOAT - -/// Operators mixing Integers with AP_Int -// ---------------------------------------------------------------- - -// partially specialize template argument _AP_C in order that: -// for _AP_W > 64, we will explicitly convert operand with native data type -// into corresponding ap_private -// for _AP_W <= 64, we will implicitly convert operand with ap_private into -// (unsigned) long long -#define OP_BIN_MIX_INT(BIN_OP, C_TYPE, _AP_WI, _AP_SI, RTYPE) \ - template \ - INLINE \ - typename ap_private<_AP_WI, _AP_SI>::template RType<_AP_W, _AP_S>::RTYPE \ - operator BIN_OP(C_TYPE i_op, const ap_private<_AP_W, _AP_S>& op) { \ - return ap_private<_AP_WI, _AP_SI>(i_op).operator BIN_OP(op); \ - } \ - template \ - INLINE \ - typename ap_private<_AP_W, _AP_S>::template RType<_AP_WI, _AP_SI>::RTYPE \ - operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, C_TYPE i_op) { \ - return op.operator BIN_OP(ap_private<_AP_WI, _AP_SI>(i_op)); \ - } - -#define OP_REL_MIX_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE bool operator REL_OP(const ap_private<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return op.operator REL_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ - } \ - template \ - INLINE bool operator REL_OP(C_TYPE op2, \ - const ap_private<_AP_W, _AP_S, false>& op) { \ - return ap_private<_AP_W2, _AP_S2>(op2).operator REL_OP(op); \ - } - -#define OP_ASSIGN_MIX_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ - ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ - return op.operator ASSIGN_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ - } - -#define OP_BIN_SHIFT_INT(BIN_OP, C_TYPE, _AP_WI, _AP_SI, RTYPE) \ - template \ 
- C_TYPE operator BIN_OP(C_TYPE i_op, \ - const ap_private<_AP_W, _AP_S, false>& op) { \ - return i_op BIN_OP(op.get_VAL()); \ - } \ - template \ - INLINE \ - typename ap_private<_AP_W, _AP_S>::template RType<_AP_WI, _AP_SI>::RTYPE \ - operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, C_TYPE i_op) { \ - return op.operator BIN_OP(i_op); \ - } - -#define OP_ASSIGN_RSHIFT_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ - ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ - op = op.operator>>(op2); \ - return op; \ - } - -#define OP_ASSIGN_LSHIFT_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ - ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ - op = op.operator<<(op2); \ - return op; \ - } - -#define OPS_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ - OP_BIN_MIX_INT(*, C_TYPE, (_AP_W2), (_AP_S2), mult) \ - OP_BIN_MIX_INT(+, C_TYPE, (_AP_W2), (_AP_S2), plus) \ - OP_BIN_MIX_INT(-, C_TYPE, (_AP_W2), (_AP_S2), minus) \ - OP_BIN_MIX_INT(/, C_TYPE, (_AP_W2), (_AP_S2), div) \ - OP_BIN_MIX_INT(%, C_TYPE, (_AP_W2), (_AP_S2), mod) \ - OP_BIN_MIX_INT(&, C_TYPE, (_AP_W2), (_AP_S2), logic) \ - OP_BIN_MIX_INT(|, C_TYPE, (_AP_W2), (_AP_S2), logic) \ - OP_BIN_MIX_INT (^, C_TYPE, (_AP_W2), (_AP_S2), logic) \ - OP_BIN_SHIFT_INT(>>, C_TYPE, (_AP_W2), (_AP_S2), arg1) \ - OP_BIN_SHIFT_INT(<<, C_TYPE, (_AP_W2), (_AP_S2), arg1) \ - \ - OP_ASSIGN_MIX_INT(+=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(-=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(*=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(/=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(%=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(&=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(|=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(^=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_RSHIFT_INT(>>=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_LSHIFT_INT(<<=, C_TYPE, (_AP_W2), (_AP_S2)) \ - \ - OP_REL_MIX_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_REL_MIX_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_REL_MIX_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_REL_MIX_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_REL_MIX_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_REL_MIX_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) - -OPS_MIX_INT(bool, 1, false) -OPS_MIX_INT(char, 8, CHAR_IS_SIGNED) -OPS_MIX_INT(signed char, 8, true) -OPS_MIX_INT(unsigned char, 8, false) -OPS_MIX_INT(short, sizeof(short) * 8, true) -OPS_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) -OPS_MIX_INT(int, sizeof(int) * 8, true) -OPS_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) -OPS_MIX_INT(long, sizeof(long) * 8, true) -OPS_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) -OPS_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) -OPS_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) - -#undef OP_BIN_MIX_INT -#undef OP_BIN_SHIFT_INT -#undef OP_ASSIGN_MIX_INT -#undef OP_ASSIGN_RSHIFT_INT -#undef OP_ASSIGN_LSHIFT_INT -#undef OP_REL_MIX_INT -#undef OPS_MIX_INT - -#define OP_BIN_MIX_RANGE(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(const _private_range_ref<_AP_W1, _AP_S1>& op1, \ - const ap_private<_AP_W2, _AP_S2>& op2) { \ - return ap_private<_AP_W1, false>(op1).operator BIN_OP(op2); \ - } \ - template \ - INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ - const 
_private_range_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator BIN_OP(ap_private<_AP_W2, false>(op2)); \ - } - -#define OP_ASSIGN_MIX_RANGE(ASSIGN_OP) \ - template \ - INLINE ap_private<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - ap_private<_AP_W1, _AP_S1>& op1, \ - const _private_range_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator ASSIGN_OP(ap_private<_AP_W2, false>(op2)); \ - } \ - template \ - INLINE _private_range_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - _private_range_ref<_AP_W1, _AP_S1>& op1, \ - ap_private<_AP_W2, _AP_S2>& op2) { \ - ap_private<_AP_W1, false> tmp(op1); \ - tmp.operator ASSIGN_OP(op2); \ - op1 = tmp; \ - return op1; \ - } - -#define OP_REL_MIX_RANGE(REL_OP) \ - template \ - INLINE bool operator REL_OP(const _private_range_ref<_AP_W1, _AP_S1>& op1, \ - const ap_private<_AP_W2, _AP_S2>& op2) { \ - return ap_private<_AP_W1, false>(op1).operator REL_OP(op2); \ - } \ - template \ - INLINE bool operator REL_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ - const _private_range_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator REL_OP(op2.operator ap_private<_AP_W2, false>()); \ - } - -OP_BIN_MIX_RANGE(+, plus) -OP_BIN_MIX_RANGE(-, minus) -OP_BIN_MIX_RANGE(*, mult) -OP_BIN_MIX_RANGE(/, div) -OP_BIN_MIX_RANGE(%, mod) -OP_BIN_MIX_RANGE(&, logic) -OP_BIN_MIX_RANGE(|, logic) -OP_BIN_MIX_RANGE(^, logic) -OP_BIN_MIX_RANGE(>>, arg1) -OP_BIN_MIX_RANGE(<<, arg1) -#undef OP_BIN_MIX_RANGE - -OP_ASSIGN_MIX_RANGE(+=) -OP_ASSIGN_MIX_RANGE(-=) -OP_ASSIGN_MIX_RANGE(*=) -OP_ASSIGN_MIX_RANGE(/=) -OP_ASSIGN_MIX_RANGE(%=) -OP_ASSIGN_MIX_RANGE(&=) -OP_ASSIGN_MIX_RANGE(|=) -OP_ASSIGN_MIX_RANGE(^=) -OP_ASSIGN_MIX_RANGE(>>=) -OP_ASSIGN_MIX_RANGE(<<=) -#undef OP_ASSIGN_MIX_RANGE - -OP_REL_MIX_RANGE(>) -OP_REL_MIX_RANGE(<) -OP_REL_MIX_RANGE(>=) -OP_REL_MIX_RANGE(<=) -OP_REL_MIX_RANGE(==) -OP_REL_MIX_RANGE(!=) -#undef OP_REL_MIX_RANGE - -#define OP_BIN_MIX_BIT(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_private<1, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP(const _private_bit_ref<_AP_W1, _AP_S1>& op1, \ - const ap_private<_AP_W2, _AP_S2>& op2) { \ - return ap_private<1, false>(op1).operator BIN_OP(op2); \ - } \ - template \ - INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<1, false>::RTYPE \ - operator BIN_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ - const _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator BIN_OP(ap_private<1, false>(op2)); \ - } - -#define OP_ASSIGN_MIX_BIT(ASSIGN_OP) \ - template \ - INLINE ap_private<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - ap_private<_AP_W1, _AP_S1>& op1, \ - _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator ASSIGN_OP(ap_private<1, false>(op2)); \ - } \ - template \ - INLINE _private_bit_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - _private_bit_ref<_AP_W1, _AP_S1>& op1, \ - ap_private<_AP_W2, _AP_S2>& op2) { \ - ap_private<1, false> tmp(op1); \ - tmp.operator ASSIGN_OP(op2); \ - op1 = tmp; \ - return op1; \ - } - -#define OP_REL_MIX_BIT(REL_OP) \ - template \ - INLINE bool operator REL_OP(const _private_bit_ref<_AP_W1, _AP_S1>& op1, \ - const ap_private<_AP_W2, _AP_S2>& op2) { \ - return ap_private<_AP_W1, false>(op1).operator REL_OP(op2); \ - } \ - template \ - INLINE bool operator REL_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ - const _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator REL_OP(ap_private<1, false>(op2)); \ - } - -OP_ASSIGN_MIX_BIT(+=) -OP_ASSIGN_MIX_BIT(-=) -OP_ASSIGN_MIX_BIT(*=) -OP_ASSIGN_MIX_BIT(/=) -OP_ASSIGN_MIX_BIT(%=) -OP_ASSIGN_MIX_BIT(&=) 
-OP_ASSIGN_MIX_BIT(|=) -OP_ASSIGN_MIX_BIT(^=) -OP_ASSIGN_MIX_BIT(>>=) -OP_ASSIGN_MIX_BIT(<<=) -#undef OP_ASSIGN_MIX_BIT - -OP_BIN_MIX_BIT(+, plus) -OP_BIN_MIX_BIT(-, minus) -OP_BIN_MIX_BIT(*, mult) -OP_BIN_MIX_BIT(/, div) -OP_BIN_MIX_BIT(%, mod) -OP_BIN_MIX_BIT(&, logic) -OP_BIN_MIX_BIT(|, logic) -OP_BIN_MIX_BIT(^, logic) -OP_BIN_MIX_BIT(>>, arg1) -OP_BIN_MIX_BIT(<<, arg1) -#undef OP_BIN_MIX_BIT - -OP_REL_MIX_BIT(>) -OP_REL_MIX_BIT(<) -OP_REL_MIX_BIT(<=) -OP_REL_MIX_BIT(>=) -OP_REL_MIX_BIT(==) -OP_REL_MIX_BIT(!=) -#undef OP_REL_MIX_BIT - -#define REF_REL_OP_MIX_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE bool operator REL_OP(const _private_range_ref<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return (ap_private<_AP_W, false>(op)) \ - . \ - operator REL_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ - } \ - template \ - INLINE bool operator REL_OP(C_TYPE op2, \ - const _private_range_ref<_AP_W, _AP_S>& op) { \ - return ap_private<_AP_W2, _AP_S2>(op2).operator REL_OP( \ - ap_private<_AP_W, false>(op)); \ - } \ - template \ - INLINE bool operator REL_OP(const _private_bit_ref<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return (bool(op))REL_OP op2; \ - } \ - template \ - INLINE bool operator REL_OP(C_TYPE op2, \ - const _private_bit_ref<_AP_W, _AP_S>& op) { \ - return op2 REL_OP(bool(op)); \ - } - -#define REF_REL_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ - REF_REL_OP_MIX_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_REL_OP_MIX_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_REL_OP_MIX_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_REL_OP_MIX_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_REL_OP_MIX_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_REL_OP_MIX_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) - -REF_REL_MIX_INT(bool, 1, false) -REF_REL_MIX_INT(char, 8, CHAR_IS_SIGNED) -REF_REL_MIX_INT(signed char, 8, true) -REF_REL_MIX_INT(unsigned char, 8, false) -REF_REL_MIX_INT(short, sizeof(short) * 8, true) -REF_REL_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) -REF_REL_MIX_INT(int, sizeof(int) * 8, true) -REF_REL_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) -REF_REL_MIX_INT(long, sizeof(long) * 8, true) -REF_REL_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) -REF_REL_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) -REF_REL_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) -#undef REF_REL_OP_MIX_INT -#undef REF_REL_MIX_INT - -#define REF_BIN_OP_MIX_INT(BIN_OP, RTYPE, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE \ - typename ap_private<_AP_W, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP(const _private_range_ref<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return (ap_private<_AP_W, false>(op)) \ - . 
\ - operator BIN_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ - } \ - template \ - INLINE \ - typename ap_private<_AP_W2, _AP_S2>::template RType<_AP_W, false>::RTYPE \ - operator BIN_OP(C_TYPE op2, \ - const _private_range_ref<_AP_W, _AP_S>& op) { \ - return ap_private<_AP_W2, _AP_S2>(op2).operator BIN_OP( \ - ap_private<_AP_W, false>(op)); \ - } - -#define REF_BIN_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ - REF_BIN_OP_MIX_INT(+, plus, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(-, minus, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(*, mult, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(/, div, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(%, mod, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(&, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(|, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(^, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(>>, arg1, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(<<, arg1, C_TYPE, (_AP_W2), (_AP_S2)) - -REF_BIN_MIX_INT(bool, 1, false) -REF_BIN_MIX_INT(char, 8, CHAR_IS_SIGNED) -REF_BIN_MIX_INT(signed char, 8, true) -REF_BIN_MIX_INT(unsigned char, 8, false) -REF_BIN_MIX_INT(short, sizeof(short) * 8, true) -REF_BIN_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) -REF_BIN_MIX_INT(int, sizeof(int) * 8, true) -REF_BIN_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) -REF_BIN_MIX_INT(long, sizeof(long) * 8, true) -REF_BIN_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) -REF_BIN_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) -REF_BIN_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) -#undef REF_BIN_OP_MIX_INT -#undef REF_BIN_MIX_INT - -#define REF_BIN_OP(BIN_OP, RTYPE) \ - template \ - INLINE \ - typename ap_private<_AP_W, false>::template RType<_AP_W2, false>::RTYPE \ - operator BIN_OP(const _private_range_ref<_AP_W, _AP_S>& lhs, \ - const _private_range_ref<_AP_W2, _AP_S2>& rhs) { \ - return ap_private<_AP_W, false>(lhs).operator BIN_OP( \ - ap_private<_AP_W2, false>(rhs)); \ - } - -REF_BIN_OP(+, plus) -REF_BIN_OP(-, minus) -REF_BIN_OP(*, mult) -REF_BIN_OP(/, div) -REF_BIN_OP(%, mod) -REF_BIN_OP(&, logic) -REF_BIN_OP(|, logic) -REF_BIN_OP(^, logic) -REF_BIN_OP(>>, arg1) -REF_BIN_OP(<<, arg1) -#undef REF_BIN_OP - -//************************************************************************ -// Implement -// ap_private = ap_concat_ref OP ap_concat_ref -// for operators +, -, *, /, %, >>, <<, &, |, ^ -// Without these operators the operands are converted to int64 and -// larger results lose informations (higher order bits). -// -// operand OP -// / | -// left-concat right-concat -// / | / | -// -// -// _AP_LW1, _AP_LT1 (width and type of left-concat's left side) -// _AP_LW2, _AP_LT2 (width and type of left-concat's right side) -// Similarly for RHS of operand OP: _AP_RW1, AP_RW2, _AP_RT1, _AP_RT2 -// -// In Verilog 2001 result of concatenation is always unsigned even -// when both sides are signed. -//************************************************************************ - -#endif // ifndef __AP_PRIVATE_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_PRIVATE_H__ +#define __AP_PRIVATE_H__ + +// common macros and type declarations are now defined in ap_common.h, and +// ap_private becomes part of it. +#ifndef __AP_COMMON_H__ +#error "etc/ap_private.h cannot be included directly." +#endif + +// forward declarations +//template +//class ap_private; // moved to ap_common.h +template +struct _private_range_ref; +template +struct _private_bit_ref; + +// TODO clean up this part. +#ifndef LLVM_SUPPORT_MATHEXTRAS_H +#define LLVM_SUPPORT_MATHEXTRAS_H + +#ifdef _MSC_VER +#if _MSC_VER <= 1500 +typedef __int8 int8_t; +typedef unsigned __int8 uint8_t; +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#else +#include +#endif +#else +#include +#endif + +#ifndef INLINE +#define INLINE inline +// Enable to debug ap_int/ap_fixed +// #define INLINE __attribute__((weak)) +#endif + +// NOTE: The following support functions use the _32/_64 extensions instead of +// type overloading so that signed and unsigned integers can be used without +// ambiguity. +namespace AESL_std { +template +DataType INLINE min(DataType a, DataType b) { + return (a >= b) ? b : a; +} + +template +DataType INLINE max(DataType a, DataType b) { + return (a >= b) ? a : b; +} +} // namespace AESL_std + +// TODO clean up included headers. +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ap_private_ops { +/// Hi_32 - This function returns the high 32 bits of a 64 bit value. +static INLINE uint32_t Hi_32(uint64_t Value) { + return static_cast(Value >> 32); +} + +/// Lo_32 - This function returns the low 32 bits of a 64 bit value. +static INLINE uint32_t Lo_32(uint64_t Value) { + return static_cast(Value); +} + +template +INLINE bool isNegative(const ap_private<_AP_W, false>& a) { + return false; +} + +template +INLINE bool isNegative(const ap_private<_AP_W, true>& a) { + enum { + APINT_BITS_PER_WORD = 64, + _AP_N = (_AP_W + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD + }; + static const uint64_t sign_mask = 1ULL << ((_AP_W - 1) % APINT_BITS_PER_WORD); + return (sign_mask & a.get_pVal(_AP_N - 1)) != 0; +} + +/// CountLeadingZeros_32 - this function performs the platform optimal form of +/// counting the number of zeros from the most significant bit to the first one +/// bit. Ex. CountLeadingZeros_32(0x00F000FF) == 8. +/// Returns 32 if the word is zero. 
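// Illustration (not text from the header): a worked trace of the bisection
// fallback below on the example above, 0x00F000FF:
//   Shift 16: 0x00F000FF >> 16 == 0x00F0 (nonzero) -> Value = 0x00F0
//   Shift  8: 0x00F0 >> 8 == 0 (zero)              -> Count |= 8
//   Shift  4: 0x00F0 >> 4 == 0xF (nonzero)         -> Value = 0xF
//   Shift  2: 0xF >> 2 == 0x3 (nonzero)            -> Value = 0x3
//   Shift  1: 0x3 >> 1 == 0x1 (nonzero)            -> Value = 0x1
// leaving Count == 8, the number of leading zero bits in 0x00F000FF.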
+static INLINE unsigned CountLeadingZeros_32(uint32_t Value) {
+  unsigned Count; // result
+#if __GNUC__ >= 4
+// PowerPC is defined for __builtin_clz(0)
+#if !defined(__ppc__) && !defined(__ppc64__)
+  if (Value == 0) return 32;
+#endif
+  Count = __builtin_clz(Value);
+#else
+  if (Value == 0) return 32;
+  Count = 0;
+  // bisection method for count leading zeros
+  for (unsigned Shift = 32 >> 1; Shift; Shift >>= 1) {
+    uint32_t Tmp = (Value) >> (Shift);
+    if (Tmp) {
+      Value = Tmp;
+    } else {
+      Count |= Shift;
+    }
+  }
+#endif
+  return Count;
+}
+
+/// CountLeadingZeros_64 - This function performs the platform optimal form
+/// of counting the number of zeros from the most significant bit to the first
+/// one bit (64 bit edition.)
+/// Returns 64 if the word is zero.
+static INLINE unsigned CountLeadingZeros_64(uint64_t Value) {
+  unsigned Count; // result
+#if __GNUC__ >= 4
+// PowerPC is defined for __builtin_clzll(0)
+#if !defined(__ppc__) && !defined(__ppc64__)
+  if (!Value) return 64;
+#endif
+  Count = __builtin_clzll(Value);
+#else
+  if (sizeof(long) == sizeof(int64_t)) {
+    if (!Value) return 64;
+    Count = 0;
+    // bisection method for count leading zeros
+    for (unsigned Shift = 64 >> 1; Shift; Shift >>= 1) {
+      uint64_t Tmp = (Value) >> (Shift);
+      if (Tmp) {
+        Value = Tmp;
+      } else {
+        Count |= Shift;
+      }
+    }
+  } else {
+    // get hi portion
+    uint32_t Hi = Hi_32(Value);
+
+    // if some bits in hi portion
+    if (Hi) {
+      // leading zeros in hi portion plus all bits in lo portion
+      Count = CountLeadingZeros_32(Hi);
+    } else {
+      // get lo portion
+      uint32_t Lo = Lo_32(Value);
+      // same as 32 bit value
+      Count = CountLeadingZeros_32(Lo) + 32;
+    }
+  }
+#endif
+  return Count;
+}
+
+/// CountTrailingZeros_64 - This function performs the platform optimal form
+/// of counting the number of zeros from the least significant bit to the first
+/// one bit (64 bit edition.)
+/// Returns 64 if the word is zero.
+static INLINE unsigned CountTrailingZeros_64(uint64_t Value) {
+#if __GNUC__ >= 4
+  return (Value != 0) ? __builtin_ctzll(Value) : 64;
+#else
+  static const unsigned Mod67Position[] = {
+      64, 0,  1,  39, 2,  15, 40, 23, 3,  12, 16, 59, 41, 19, 24, 54, 4,
+      64, 13, 10, 17, 62, 60, 28, 42, 30, 20, 51, 25, 44, 55, 47, 5,  32,
+      65, 38, 14, 22, 11, 58, 18, 53, 63, 9,  61, 27, 29, 50, 43, 46, 31,
+      37, 21, 57, 52, 8,  26, 49, 45, 36, 56, 7,  48, 35, 6,  34, 33, 0};
+  return Mod67Position[(uint64_t)(-(int64_t)Value & (int64_t)Value) % 67];
+#endif
+}
+
+/// CountPopulation_64 - this function counts the number of set bits in a value,
+/// (64 bit edition.)
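// Illustration (not text from the header): both the GCC builtin path and the
// SWAR fallback below yield, for instance,
//   CountPopulation_64(0x00F000FFULL) == 12  // 4 set bits in 0x00F00000, 8 in 0xFF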
+static INLINE unsigned CountPopulation_64(uint64_t Value) { +#if __GNUC__ >= 4 + return __builtin_popcountll(Value); +#else + uint64_t v = Value - (((Value) >> 1) & 0x5555555555555555ULL); + v = (v & 0x3333333333333333ULL) + (((v) >> 2) & 0x3333333333333333ULL); + v = (v + ((v) >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); +#endif +} + +static INLINE uint32_t countLeadingOnes_64(uint64_t __V, uint32_t skip) { + uint32_t Count = 0; + if (skip) (__V) <<= (skip); + while (__V && (__V & (1ULL << 63))) { + Count++; + (__V) <<= 1; + } + return Count; +} + +static INLINE std::string oct2Bin(char oct) { + switch (oct) { + case '\0': { + return ""; + } + case '.': { + return "."; + } + case '0': { + return "000"; + } + case '1': { + return "001"; + } + case '2': { + return "010"; + } + case '3': { + return "011"; + } + case '4': { + return "100"; + } + case '5': { + return "101"; + } + case '6': { + return "110"; + } + case '7': { + return "111"; + } + } + assert(0 && "Invalid character in digit string"); + return ""; +} + +static INLINE std::string hex2Bin(char hex) { + switch (hex) { + case '\0': { + return ""; + } + case '.': { + return "."; + } + case '0': { + return "0000"; + } + case '1': { + return "0001"; + } + case '2': { + return "0010"; + } + case '3': { + return "0011"; + } + case '4': { + return "0100"; + } + case '5': { + return "0101"; + } + case '6': { + return "0110"; + } + case '7': { + return "0111"; + } + case '8': { + return "1000"; + } + case '9': { + return "1001"; + } + case 'A': + case 'a': { + return "1010"; + } + case 'B': + case 'b': { + return "1011"; + } + case 'C': + case 'c': { + return "1100"; + } + case 'D': + case 'd': { + return "1101"; + } + case 'E': + case 'e': { + return "1110"; + } + case 'F': + case 'f': { + return "1111"; + } + } + assert(0 && "Invalid character in digit string"); + return ""; +} + +static INLINE uint32_t decode_digit(char cdigit, int radix) { + uint32_t digit = 0; + if (radix == 16) { +#define isxdigit(c) \ + (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') || \ + ((c) >= 'A' && (c) <= 'F')) +#define isdigit(c) ((c) >= '0' && (c) <= '9') + if (!isxdigit(cdigit)) assert(0 && "Invalid hex digit in string"); + if (isdigit(cdigit)) + digit = cdigit - '0'; + else if (cdigit >= 'a') + digit = cdigit - 'a' + 10; + else if (cdigit >= 'A') + digit = cdigit - 'A' + 10; + else + assert(0 && "huh? we shouldn't get here"); + } else if (isdigit(cdigit)) { + digit = cdigit - '0'; + } else { + assert(0 && "Invalid character in digit string"); + } +#undef isxdigit +#undef isdigit + return digit; +} + +// Determine the radix of "val". +static INLINE std::string parseString(const std::string& input, unsigned char& radix) { + size_t len = input.length(); + if (len == 0) { + if (radix == 0) radix = 10; + return input; + } + + size_t startPos = 0; + // Trim whitespace + while (input[startPos] == ' ' && startPos < len) startPos++; + while (input[len - 1] == ' ' && startPos < len) len--; + + std::string val = input.substr(startPos, len - startPos); + // std::cout << "val = " << val << "\n"; + len = val.length(); + startPos = 0; + + // If the length of the string is less than 2, then radix + // is decimal and there is no exponent. 
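  // Worked examples of the scan below (illustration, not text from the
  // header), calling with radix == 0 so that it is auto-detected:
  //   parseString("123", r)   -> "123" with r == 10
  //   parseString("1.5e2", r) -> "150" with r == 10 (the exponent is folded
  //                              into the mantissa)
  //   parseString("0x1f", r)  -> "00011111" with r == 2 (hex and octal are
  //                              expanded digit-by-digit into binary)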
+ if (len < 2) { + if (radix == 0) radix = 10; + return val; + } + + bool isNegative = false; + std::string ans; + + // First check to see if we start with a sign indicator + if (val[0] == '-') { + ans = "-"; + ++startPos; + isNegative = true; + } else if (val[0] == '+') + ++startPos; + + if (len - startPos < 2) { + if (radix == 0) radix = 10; + return val; + } + + if (val.substr(startPos, 2) == "0x" || val.substr(startPos, 2) == "0X") { + // If we start with "0x", then the radix is hex. + radix = 16; + startPos += 2; + } else if (val.substr(startPos, 2) == "0b" || + val.substr(startPos, 2) == "0B") { + // If we start with "0b", then the radix is binary. + radix = 2; + startPos += 2; + } else if (val.substr(startPos, 2) == "0o" || + val.substr(startPos, 2) == "0O") { + // If we start with "0o", then the radix is octal. + radix = 8; + startPos += 2; + } else if (radix == 0) { + radix = 10; + } + + int exp = 0; + if (radix == 10) { + // If radix is decimal, then see if there is an + // exponent indicator. + size_t expPos = val.find('e'); + bool has_exponent = true; + if (expPos == std::string::npos) expPos = val.find('E'); + if (expPos == std::string::npos) { + // No exponent indicator, so the mantissa goes to the end. + expPos = len; + has_exponent = false; + } + // std::cout << "startPos = " << startPos << " " << expPos << "\n"; + + ans += val.substr(startPos, expPos - startPos); + if (has_exponent) { + // Parse the exponent. + std::istringstream iss(val.substr(expPos + 1, len - expPos - 1)); + iss >> exp; + } + } else { + // Check for a binary exponent indicator. + size_t expPos = val.find('p'); + bool has_exponent = true; + if (expPos == std::string::npos) expPos = val.find('P'); + if (expPos == std::string::npos) { + // No exponent indicator, so the mantissa goes to the end. + expPos = len; + has_exponent = false; + } + + // std::cout << "startPos = " << startPos << " " << expPos << "\n"; + + assert(startPos <= expPos); + // Convert to binary as we go. + for (size_t i = startPos; i < expPos; ++i) { + if (radix == 16) { + ans += hex2Bin(val[i]); + } else if (radix == 8) { + ans += oct2Bin(val[i]); + } else { // radix == 2 + ans += val[i]; + } + } + // End in binary + radix = 2; + if (has_exponent) { + // Parse the exponent. + std::istringstream iss(val.substr(expPos + 1, len - expPos - 1)); + iss >> exp; + } + } + if (exp == 0) return ans; + + size_t decPos = ans.find('.'); + if (decPos == std::string::npos) decPos = ans.length(); + if ((int)decPos + exp >= (int)ans.length()) { + int i = decPos; + for (; i < (int)ans.length() - 1; ++i) ans[i] = ans[i + 1]; + for (; i < (int)ans.length(); ++i) ans[i] = '0'; + for (; i < (int)decPos + exp; ++i) ans += '0'; + return ans; + } else if ((int)decPos + exp < (int)isNegative) { + std::string dupAns = "0."; + if (ans[0] == '-') dupAns = "-0."; + for (int i = 0; i < isNegative - (int)decPos - exp; ++i) dupAns += '0'; + for (size_t i = isNegative; i < ans.length(); ++i) + if (ans[i] != '.') dupAns += ans[i]; + return dupAns; + } + + if (exp > 0) + for (size_t i = decPos; i < decPos + exp; ++i) ans[i] = ans[i + 1]; + else { + if (decPos == ans.length()) ans += ' '; + for (int i = decPos; i > (int)decPos + exp; --i) ans[i] = ans[i - 1]; + } + ans[decPos + exp] = '.'; + return ans; +} + +/// sub_1 - This function subtracts a single "digit" (64-bit word), y, from +/// the multi-digit integer array, x[], propagating the borrowed 1 value until +/// no further borrowing is neeeded or it runs out of "digits" in x. 
The result +/// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted. +/// In other words, if y > x then this function returns 1, otherwise 0. +/// @returns the borrow out of the subtraction +static INLINE bool sub_1(uint64_t x[], uint32_t len, uint64_t y) { + for (uint32_t i = 0; i < len; ++i) { + uint64_t __X = x[i]; + x[i] -= y; + if (y > __X) + y = 1; // We have to "borrow 1" from next "digit" + else { + y = 0; // No need to borrow + break; // Remaining digits are unchanged so exit early + } + } + return (y != 0); +} + +/// add_1 - This function adds a single "digit" integer, y, to the multiple +/// "digit" integer array, x[]. x[] is modified to reflect the addition and +/// 1 is returned if there is a carry out, otherwise 0 is returned. +/// @returns the carry of the addition. +static INLINE bool add_1(uint64_t dest[], uint64_t x[], uint32_t len, + uint64_t y) { + for (uint32_t i = 0; i < len; ++i) { + dest[i] = y + x[i]; + if (dest[i] < y) + y = 1; // Carry one to next digit. + else { + y = 0; // No need to carry so exit early + break; + } + } + return (y != 0); +} + +/// add - This function adds the integer array x to the integer array Y and +/// places the result in dest. +/// @returns the carry out from the addition +/// @brief General addition of 64-bit integer arrays +static INLINE bool add(uint64_t* dest, const uint64_t* x, const uint64_t* y, + uint32_t destlen, uint32_t xlen, uint32_t ylen, + bool xsigned, bool ysigned) { + bool carry = false; + uint32_t len = AESL_std::min(xlen, ylen); + uint32_t i; + for (i = 0; i < len && i < destlen; ++i) { + uint64_t limit = + AESL_std::min(x[i], y[i]); // must come first in case dest == x + dest[i] = x[i] + y[i] + carry; + carry = dest[i] < limit || (carry && dest[i] == limit); + } + if (xlen > ylen) { + const uint64_t yext = ysigned && int64_t(y[ylen - 1]) < 0 ? -1 : 0; + for (i = ylen; i < xlen && i < destlen; i++) { + uint64_t limit = AESL_std::min(x[i], yext); + dest[i] = x[i] + yext + carry; + carry = (dest[i] < limit) || (carry && dest[i] == limit); + } + } else if (ylen > xlen) { + const uint64_t xext = xsigned && int64_t(x[xlen - 1]) < 0 ? -1 : 0; + for (i = xlen; i < ylen && i < destlen; i++) { + uint64_t limit = AESL_std::min(xext, y[i]); + dest[i] = xext + y[i] + carry; + carry = (dest[i] < limit) || (carry && dest[i] == limit); + } + } + return carry; +} + +/// @returns returns the borrow out. +/// @brief Generalized subtraction of 64-bit integer arrays. +static INLINE bool sub(uint64_t* dest, const uint64_t* x, const uint64_t* y, + uint32_t destlen, uint32_t xlen, uint32_t ylen, + bool xsigned, bool ysigned) { + bool borrow = false; + uint32_t i; + uint32_t len = AESL_std::min(xlen, ylen); + for (i = 0; i < len && i < destlen; ++i) { + uint64_t x_tmp = borrow ? x[i] - 1 : x[i]; + borrow = y[i] > x_tmp || (borrow && x[i] == 0); + dest[i] = x_tmp - y[i]; + } + if (xlen > ylen) { + const uint64_t yext = ysigned && int64_t(y[ylen - 1]) < 0 ? -1 : 0; + for (i = ylen; i < xlen && i < destlen; i++) { + uint64_t x_tmp = borrow ? x[i] - 1 : x[i]; + borrow = yext > x_tmp || (borrow && x[i] == 0); + dest[i] = x_tmp - yext; + } + } else if (ylen > xlen) { + const uint64_t xext = xsigned && int64_t(x[xlen - 1]) < 0 ? -1 : 0; + for (i = xlen; i < ylen && i < destlen; i++) { + uint64_t x_tmp = borrow ? 
xext - 1 : xext;
+      borrow = y[i] > x_tmp || (borrow && xext == 0);
+      dest[i] = x_tmp - y[i];
+    }
+  }
+  return borrow;
+}
+
+/// Subtracts the RHS ap_private from this ap_private
+/// @returns this, after subtraction
+/// @brief Subtraction assignment operator.
+
+/// Multiplies an integer array, x, by a uint64_t integer and places the result
+/// into dest.
+/// @returns the carry out of the multiplication.
+/// @brief Multiply a multi-digit ap_private by a single digit (64-bit) integer.
+static INLINE uint64_t mul_1(uint64_t dest[], const uint64_t x[], uint32_t len,
+                             uint64_t y) {
+  // Split y into high 32-bit part (hy) and low 32-bit part (ly)
+  uint64_t ly = y & 0xffffffffULL, hy = (y) >> 32;
+  uint64_t carry = 0;
+  static const uint64_t two_power_32 = 1ULL << 32;
+  // For each digit of x.
+  for (uint32_t i = 0; i < len; ++i) {
+    // Split x into high and low words
+    uint64_t lx = x[i] & 0xffffffffULL;
+    uint64_t hx = (x[i]) >> 32;
+    // hasCarry - A flag to indicate if there is a carry to the next digit.
+    // hasCarry == 0, no carry
+    // hasCarry == 1, has carry
+    // hasCarry == 2, no carry and the calculation result == 0.
+    uint8_t hasCarry = 0;
+    dest[i] = carry + lx * ly;
+    // Determine if the add above introduces carry.
+    hasCarry = (dest[i] < carry) ? 1 : 0;
+    carry = hx * ly + ((dest[i]) >> 32) + (hasCarry ? two_power_32 : 0);
+    // The upper limit of carry can be (2^32 - 1)(2^32 - 1) +
+    // (2^32 - 1) + 2^32 = 2^64.
+    hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0);
+
+    carry += (lx * hy) & 0xffffffffULL;
+    dest[i] = ((carry) << 32) | (dest[i] & 0xffffffffULL);
+    carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? two_power_32 : 0) +
+            ((carry) >> 32) + ((lx * hy) >> 32) + hx * hy;
+  }
+  return carry;
+}
+
+/// Multiplies integer array x by integer array y and stores the result into
+/// the integer array dest. Note that dest's size must be >= xlen + ylen in
+/// order to do a full precision computation. If it is not, then only the
+/// low-order words are returned.
+/// @brief Generalized multiplication of integer arrays.
+static INLINE void mul(uint64_t dest[], const uint64_t x[], uint32_t xlen,
+                       const uint64_t y[], uint32_t ylen, uint32_t destlen) {
+  assert(xlen > 0);
+  assert(ylen > 0);
+  assert(destlen >= xlen + ylen);
+  if (xlen < destlen) dest[xlen] = mul_1(dest, x, xlen, y[0]);
+  for (uint32_t i = 1; i < ylen; ++i) {
+    uint64_t ly = y[i] & 0xffffffffULL, hy = (y[i]) >> 32;
+    uint64_t carry = 0, lx = 0, hx = 0;
+    for (uint32_t j = 0; j < xlen; ++j) {
+      lx = x[j] & 0xffffffffULL;
+      hx = (x[j]) >> 32;
+      // hasCarry - A flag to indicate if has carry.
+      // hasCarry == 0, no carry
+      // hasCarry == 1, has carry
+      // hasCarry == 2, no carry and the calculation result == 0.
+      uint8_t hasCarry = 0;
+      uint64_t resul = carry + lx * ly;
+      hasCarry = (resul < carry) ? 1 : 0;
+      carry = (hasCarry ? (1ULL << 32) : 0) + hx * ly + ((resul) >> 32);
+      hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0);
+      carry += (lx * hy) & 0xffffffffULL;
+      resul = ((carry) << 32) | (resul & 0xffffffffULL);
+      if (i + j < destlen) dest[i + j] += resul;
+      carry =
+          (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0) +
+          ((carry) >> 32) + (dest[i + j] < resul ? 1 : 0) + ((lx * hy) >> 32) +
+          hx * hy;
+    }
+    if (i + xlen < destlen) dest[i + xlen] = carry;
+  }
+}
+
+/// Implementation of Knuth's Algorithm D (Division of nonnegative integers)
+/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The
+/// variables here have the same names as in the algorithm. Comments explain
+/// the algorithm and any deviation from it.
+static INLINE void KnuthDiv(uint32_t* u, uint32_t* v, uint32_t* q, uint32_t* r,
+                            uint32_t m, uint32_t n) {
+  assert(u && "Must provide dividend");
+  assert(v && "Must provide divisor");
+  assert(q && "Must provide quotient");
+  assert(u != v && u != q && v != q && "Must use different memory");
+  assert(n > 1 && "n must be > 1");
+
+  // Knuth uses the value b as the base of the number system. In our case b
+  // is 2^32, i.e. uint64_t(1) << 32.
+  uint64_t b = uint64_t(1) << 32;
+
+  // DEBUG(cerr << "KnuthDiv: m=" << m << " n=" << n << '\n');
+  // DEBUG(cerr << "KnuthDiv: original:");
+  // DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) <<
+  // u[i]);
+  // DEBUG(cerr << " by");
+  // DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) <<
+  // v[i-1]);
+  // DEBUG(cerr << '\n');
+  // D1. [Normalize.] Set d = b / (v[n-1] + 1) and multiply all the digits of
+  // u and v by d. Note that we have taken Knuth's advice here to use a power
+  // of 2 value for d such that d * v[n-1] >= b/2 (b is the base). A power of
+  // 2 allows us to shift instead of multiply and it is easy to determine the
+  // shift amount from the leading zeros. We are basically normalizing the u
+  // and v so that its high bits are shifted to the top of v's range without
+  // overflow. Note that this can require an extra word in u so that u must
+  // be of length m+n+1.
+  uint32_t shift = CountLeadingZeros_32(v[n - 1]);
+  uint32_t v_carry = 0;
+  uint32_t u_carry = 0;
+  if (shift) {
+    for (uint32_t i = 0; i < m + n; ++i) {
+      uint32_t u_tmp = (u[i]) >> (32 - shift);
+      u[i] = ((u[i]) << (shift)) | u_carry;
+      u_carry = u_tmp;
+    }
+    for (uint32_t i = 0; i < n; ++i) {
+      uint32_t v_tmp = (v[i]) >> (32 - shift);
+      v[i] = ((v[i]) << (shift)) | v_carry;
+      v_carry = v_tmp;
+    }
+  }
+  u[m + n] = u_carry;
+  // DEBUG(cerr << "KnuthDiv: normal:");
+  // DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) <<
+  // u[i]);
+  // DEBUG(cerr << " by");
+  // DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) <<
+  // v[i-1]);
+  // DEBUG(cerr << '\n');
+
+  // D2. [Initialize j.] Set j to m. This is the loop counter over the places.
+  int j = m;
+  do {
+    // DEBUG(cerr << "KnuthDiv: quotient digit #" << j << '\n');
+    // D3. [Calculate q'.].
+    //     Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q')
+    //     Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r')
+    // Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease
+    // qp by 1, increase rp by v[n-1], and repeat this test if rp < b. The test
+    // on v[n-2] determines at high speed most of the cases in which the trial
+    // value qp is one too large, and it eliminates all cases where qp is two
+    // too large.
+    uint64_t dividend = ((uint64_t(u[j + n]) << 32) + u[j + n - 1]);
+    // DEBUG(cerr << "KnuthDiv: dividend == " << dividend << '\n');
+    uint64_t qp = dividend / v[n - 1];
+    uint64_t rp = dividend % v[n - 1];
+    if (qp == b || qp * v[n - 2] > b * rp + u[j + n - 2]) {
+      qp--;
+      rp += v[n - 1];
+      if (rp < b && (qp == b || qp * v[n - 2] > b * rp + u[j + n - 2])) qp--;
+    }
+    // DEBUG(cerr << "KnuthDiv: qp == " << qp << ", rp == " << rp << '\n');
+
+    // D4. [Multiply and subtract.] Replace (u[j+n]u[j+n-1]...u[j]) with
+    // (u[j+n]u[j+n-1]..u[j]) - qp * (v[n-1]...v[1]v[0]). This computation
+    // consists of a simple multiplication by a one-place number, combined with
+    // a subtraction.
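    // A base-10 illustration of the D3 adjustment just performed (not text
    // from the header): with v = 53 and the current window of u holding the
    // digits 3, 1, 4, the trial digit is qp = 31 / 5 = 6 with rp = 1. The
    // test 6 * 3 = 18 > 10 * 1 + 4 = 14 flags the overestimate, so qp drops
    // to 5 and rp grows to 6; the retest 5 * 3 = 15 > 10 * 6 + 4 = 64 fails,
    // and qp = 5 is the true digit, since floor(314 / 53) = 5.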
+ bool isNeg = false; + for (uint32_t i = 0; i < n; ++i) { + uint64_t u_tmp = uint64_t(u[j + i]) | ((uint64_t(u[j + i + 1])) << 32); + uint64_t subtrahend = uint64_t(qp) * uint64_t(v[i]); + bool borrow = subtrahend > u_tmp; + /*DEBUG(cerr << "KnuthDiv: u_tmp == " << u_tmp + << ", subtrahend == " << subtrahend + << ", borrow = " << borrow << '\n');*/ + + uint64_t result = u_tmp - subtrahend; + uint32_t k = j + i; + u[k++] = (uint32_t)(result & (b - 1)); // subtract low word + u[k++] = (uint32_t)((result) >> 32); // subtract high word + while (borrow && k <= m + n) { // deal with borrow to the left + borrow = u[k] == 0; + u[k]--; + k++; + } + isNeg |= borrow; + /*DEBUG(cerr << "KnuthDiv: u[j+i] == " << u[j+i] << ", u[j+i+1] == " << + u[j+i+1] << '\n');*/ + } + /*DEBUG(cerr << "KnuthDiv: after subtraction:"); + DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]); + DEBUG(cerr << '\n');*/ + // The digits (u[j+n]...u[j]) should be kept positive; if the result of + // this step is actually negative, (u[j+n]...u[j]) should be left as the + // true value plus b**(n+1), namely as the b's complement of + // the true value, and a "borrow" to the left should be remembered. + // + if (isNeg) { + bool carry = true; // true because b's complement is "complement + 1" + for (uint32_t i = 0; i <= m + n; ++i) { + u[i] = ~u[i] + carry; // b's complement + carry = carry && u[i] == 0; + } + } + /*DEBUG(cerr << "KnuthDiv: after complement:"); + DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]); + DEBUG(cerr << '\n');*/ + + // D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was + // negative, go to step D6; otherwise go on to step D7. + q[j] = (uint32_t)qp; + if (isNeg) { + // D6. [Add back]. The probability that this step is necessary is very + // small, on the order of only 2/b. Make sure that test data accounts for + // this possibility. Decrease q[j] by 1 + q[j]--; + // and add (0v[n-1]...v[1]v[0]) to (u[j+n]u[j+n-1]...u[j+1]u[j]). + // A carry will occur to the left of u[j+n], and it should be ignored + // since it cancels with the borrow that occurred in D4. + bool carry = false; + for (uint32_t i = 0; i < n; i++) { + uint32_t limit = AESL_std::min(u[j + i], v[i]); + u[j + i] += v[i] + carry; + carry = u[j + i] < limit || (carry && u[j + i] == limit); + } + u[j + n] += carry; + } + /*DEBUG(cerr << "KnuthDiv: after correction:"); + DEBUG(for (int i = m+n; i >=0; i--) cerr <<" " << u[i]); + DEBUG(cerr << "\nKnuthDiv: digit result = " << q[j] << '\n');*/ + + // D7. [Loop on j.] Decrease j by one. Now if j >= 0, go back to D3. + } while (--j >= 0); + + /*DEBUG(cerr << "KnuthDiv: quotient:"); + DEBUG(for (int i = m; i >=0; i--) cerr <<" " << q[i]); + DEBUG(cerr << '\n');*/ + + // D8. [Unnormalize]. Now q[...] is the desired quotient, and the desired + // remainder may be obtained by dividing u[...] by d. If r is non-null we + // compute the remainder (urem uses this). + if (r) { + // The value d is expressed by the "shift" value above since we avoided + // multiplication by d by using a shift left. So, all we have to do is + // shift right here. 
+    if (shift) {
+      uint32_t carry = 0;
+      // DEBUG(cerr << "KnuthDiv: remainder:");
+      for (int i = n - 1; i >= 0; i--) {
+        r[i] = ((u[i]) >> (shift)) | carry;
+        carry = (u[i]) << (32 - shift);
+        // DEBUG(cerr << " " << r[i]);
+      }
+    } else {
+      for (int i = n - 1; i >= 0; i--) {
+        r[i] = u[i];
+        // DEBUG(cerr << " " << r[i]);
+      }
+    }
+    // DEBUG(cerr << '\n');
+  }
+  // DEBUG(cerr << std::setbase(10) << '\n');
+}
+
+template <int _AP_W, bool _AP_S>
+void divide(const ap_private<_AP_W, _AP_S>& LHS, uint32_t lhsWords,
+            const ap_private<_AP_W, _AP_S>& RHS, uint32_t rhsWords,
+            ap_private<_AP_W, _AP_S>* Quotient,
+            ap_private<_AP_W, _AP_S>* Remainder) {
+  assert(lhsWords >= rhsWords && "Fractional result");
+  enum { APINT_BITS_PER_WORD = 64 };
+  // First, compose the values into an array of 32-bit words instead of
+  // 64-bit words. This is a necessity of both the "short division" algorithm
+  // and the Knuth "classical algorithm" which requires there to be native
+  // operations for +, -, and * on an m bit value with an m*2 bit result. We
+  // can't use 64-bit operands here because we don't have native results of
+  // 128-bits. Furthermore, casting the 64-bit values to 32-bit values won't
+  // work on big-endian machines.
+  uint64_t mask = ~0ull >> (sizeof(uint32_t) * 8);
+  uint32_t n = rhsWords * 2;
+  uint32_t m = (lhsWords * 2) - n;
+
+  // Allocate space for the temporary values we need either on the stack, if
+  // it will fit, or on the heap if it won't.
+  uint32_t SPACE[128];
+  uint32_t* __U = 0;
+  uint32_t* __V = 0;
+  uint32_t* __Q = 0;
+  uint32_t* __R = 0;
+  if ((Remainder ? 4 : 3) * n + 2 * m + 1 <= 128) {
+    __U = &SPACE[0];
+    __V = &SPACE[m + n + 1];
+    __Q = &SPACE[(m + n + 1) + n];
+    if (Remainder) __R = &SPACE[(m + n + 1) + n + (m + n)];
+  } else {
+    __U = new uint32_t[m + n + 1];
+    __V = new uint32_t[n];
+    __Q = new uint32_t[m + n];
+    if (Remainder) __R = new uint32_t[n];
+  }
+
+  // Initialize the dividend
+  memset(__U, 0, (m + n + 1) * sizeof(uint32_t));
+  for (unsigned i = 0; i < lhsWords; ++i) {
+    uint64_t tmp = LHS.get_pVal(i);
+    __U[i * 2] = (uint32_t)(tmp & mask);
+    __U[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8);
+  }
+  __U[m + n] = 0; // this extra word is for "spill" in the Knuth algorithm.
+
+  // Initialize the divisor
+  memset(__V, 0, (n) * sizeof(uint32_t));
+  for (unsigned i = 0; i < rhsWords; ++i) {
+    uint64_t tmp = RHS.get_pVal(i);
+    __V[i * 2] = (uint32_t)(tmp & mask);
+    __V[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8);
+  }
+
+  // initialize the quotient and remainder
+  memset(__Q, 0, (m + n) * sizeof(uint32_t));
+  if (Remainder) memset(__R, 0, n * sizeof(uint32_t));
+
+  // Now, adjust m and n for the Knuth division. n is the number of words in
+  // the divisor. m is the number of words by which the dividend exceeds the
+  // divisor (i.e. m+n is the length of the dividend). These sizes must not
+  // contain any zero words or the Knuth algorithm fails.
+  for (unsigned i = n; i > 0 && __V[i - 1] == 0; i--) {
+    n--;
+    m++;
+  }
+  for (unsigned i = m + n; i > 0 && __U[i - 1] == 0; i--) m--;
+
+  // If we're left with only a single word for the divisor, Knuth doesn't work
+  // so we implement the short division algorithm here. This is much simpler
+  // and faster because we are certain that we can divide a 64-bit quantity
+  // by a 32-bit quantity at hardware speed and short division is simply a
+  // series of such operations. This is just like doing short division but we
+  // are using base 2^32 instead of base 10.
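  // The loop below is ordinary short division, just in base 2^32
  // (illustration, not text from the header). In base 10, 742 / 3 runs:
  //   7 -> q 2, remainder 1;  14 -> q 4, remainder 2;  22 -> q 7, remainder 1
  // giving 247 remainder 1. Here each partial dividend is likewise formed
  // from the running remainder and the next 32-bit digit:
  //   (uint64_t(remainder) << 32) | __U[i]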
+ assert(n != 0 && "Divide by zero?"); + if (n == 1) { + uint32_t divisor = __V[0]; + uint32_t remainder = 0; + for (int i = m + n - 1; i >= 0; i--) { + uint64_t partial_dividend = (uint64_t(remainder)) << 32 | __U[i]; + if (partial_dividend == 0) { + __Q[i] = 0; + remainder = 0; + } else if (partial_dividend < divisor) { + __Q[i] = 0; + remainder = (uint32_t)partial_dividend; + } else if (partial_dividend == divisor) { + __Q[i] = 1; + remainder = 0; + } else { + __Q[i] = (uint32_t)(partial_dividend / divisor); + remainder = (uint32_t)(partial_dividend - (__Q[i] * divisor)); + } + } + if (__R) __R[0] = remainder; + } else { + // Now we're ready to invoke the Knuth classical divide algorithm. In this + // case n > 1. + KnuthDiv(__U, __V, __Q, __R, m, n); + } + + // If the caller wants the quotient + if (Quotient) { + // Set up the Quotient value's memory. + if (Quotient->BitWidth != LHS.BitWidth) { + if (Quotient->isSingleWord()) Quotient->set_VAL(0); + } else + Quotient->clear(); + + // The quotient is in Q. Reconstitute the quotient into Quotient's low + // order words. + if (lhsWords == 1) { + uint64_t tmp = + uint64_t(__Q[0]) | ((uint64_t(__Q[1])) << (APINT_BITS_PER_WORD / 2)); + Quotient->set_VAL(tmp); + } else { + assert(!Quotient->isSingleWord() && + "Quotient ap_private not large enough"); + for (unsigned i = 0; i < lhsWords; ++i) + Quotient->set_pVal( + i, uint64_t(__Q[i * 2]) | + ((uint64_t(__Q[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Quotient->clearUnusedBits(); + } + + // If the caller wants the remainder + if (Remainder) { + // Set up the Remainder value's memory. + if (Remainder->BitWidth != RHS.BitWidth) { + if (Remainder->isSingleWord()) Remainder->set_VAL(0); + } else + Remainder->clear(); + + // The remainder is in R. Reconstitute the remainder into Remainder's low + // order words. + if (rhsWords == 1) { + uint64_t tmp = + uint64_t(__R[0]) | ((uint64_t(__R[1])) << (APINT_BITS_PER_WORD / 2)); + Remainder->set_VAL(tmp); + } else { + assert(!Remainder->isSingleWord() && + "Remainder ap_private not large enough"); + for (unsigned i = 0; i < rhsWords; ++i) + Remainder->set_pVal( + i, uint64_t(__R[i * 2]) | + ((uint64_t(__R[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Remainder->clearUnusedBits(); + } + + // Clean up the memory we allocated. + if (__U != &SPACE[0]) { + delete[] __U; + delete[] __V; + delete[] __Q; + delete[] __R; + } +} + +template +void divide(const ap_private<_AP_W, _AP_S>& LHS, uint32_t lhsWords, + uint64_t RHS, ap_private<_AP_W, _AP_S>* Quotient, + ap_private<_AP_W, _AP_S>* Remainder) { + uint32_t rhsWords = 1; + assert(lhsWords >= rhsWords && "Fractional result"); + enum { APINT_BITS_PER_WORD = 64 }; + // First, compose the values into an array of 32-bit words instead of + // 64-bit words. This is a necessity of both the "short division" algorithm + // and the the Knuth "classical algorithm" which requires there to be native + // operations for +, -, and * on an m bit value with an m*2 bit result. We + // can't use 64-bit operands here because we don't have native results of + // 128-bits. Furthremore, casting the 64-bit values to 32-bit values won't + // work on large-endian machines. + uint64_t mask = ~0ull >> (sizeof(uint32_t) * 8); + uint32_t n = 2; + uint32_t m = (lhsWords * 2) - n; + + // Allocate space for the temporary values we need either on the stack, if + // it will fit, or on the heap if it won't. + uint32_t SPACE[128]; + uint32_t* __U = 0; + uint32_t* __V = 0; + uint32_t* __Q = 0; + uint32_t* __R = 0; + if ((Remainder ? 
4 : 3) * n + 2 * m + 1 <= 128) { + __U = &SPACE[0]; + __V = &SPACE[m + n + 1]; + __Q = &SPACE[(m + n + 1) + n]; + if (Remainder) __R = &SPACE[(m + n + 1) + n + (m + n)]; + } else { + __U = new uint32_t[m + n + 1]; + __V = new uint32_t[n]; + __Q = new uint32_t[m + n]; + if (Remainder) __R = new uint32_t[n]; + } + + // Initialize the dividend + memset(__U, 0, (m + n + 1) * sizeof(uint32_t)); + for (unsigned i = 0; i < lhsWords; ++i) { + uint64_t tmp = LHS.get_pVal(i); + __U[i * 2] = tmp & mask; + __U[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8); + } + __U[m + n] = 0; // this extra word is for "spill" in the Knuth algorithm. + + // Initialize the divisor + memset(__V, 0, (n) * sizeof(uint32_t)); + __V[0] = RHS & mask; + __V[1] = (RHS) >> (sizeof(uint32_t) * 8); + + // initialize the quotient and remainder + memset(__Q, 0, (m + n) * sizeof(uint32_t)); + if (Remainder) memset(__R, 0, n * sizeof(uint32_t)); + + // Now, adjust m and n for the Knuth division. n is the number of words in + // the divisor. m is the number of words by which the dividend exceeds the + // divisor (i.e. m+n is the length of the dividend). These sizes must not + // contain any zero words or the Knuth algorithm fails. + for (unsigned i = n; i > 0 && __V[i - 1] == 0; i--) { + n--; + m++; + } + for (unsigned i = m + n; i > 0 && __U[i - 1] == 0; i--) m--; + + // If we're left with only a single word for the divisor, Knuth doesn't work + // so we implement the short division algorithm here. This is much simpler + // and faster because we are certain that we can divide a 64-bit quantity + // by a 32-bit quantity at hardware speed and short division is simply a + // series of such operations. This is just like doing short division but we + // are using base 2^32 instead of base 10. + assert(n != 0 && "Divide by zero?"); + if (n == 1) { + uint32_t divisor = __V[0]; + uint32_t remainder = 0; + for (int i = m + n - 1; i >= 0; i--) { + uint64_t partial_dividend = (uint64_t(remainder)) << 32 | __U[i]; + if (partial_dividend == 0) { + __Q[i] = 0; + remainder = 0; + } else if (partial_dividend < divisor) { + __Q[i] = 0; + remainder = partial_dividend; + } else if (partial_dividend == divisor) { + __Q[i] = 1; + remainder = 0; + } else { + __Q[i] = partial_dividend / divisor; + remainder = partial_dividend - (__Q[i] * divisor); + } + } + if (__R) __R[0] = remainder; + } else { + // Now we're ready to invoke the Knuth classical divide algorithm. In this + // case n > 1. + KnuthDiv(__U, __V, __Q, __R, m, n); + } + + // If the caller wants the quotient + if (Quotient) { + // Set up the Quotient value's memory. + if (Quotient->BitWidth != LHS.BitWidth) { + if (Quotient->isSingleWord()) Quotient->set_VAL(0); + } else + Quotient->clear(); + + // The quotient is in Q. Reconstitute the quotient into Quotient's low + // order words. + if (lhsWords == 1) { + uint64_t tmp = + uint64_t(__Q[0]) | ((uint64_t(__Q[1])) << (APINT_BITS_PER_WORD / 2)); + Quotient->set_VAL(tmp); + } else { + assert(!Quotient->isSingleWord() && + "Quotient ap_private not large enough"); + for (unsigned i = 0; i < lhsWords; ++i) + Quotient->set_pVal( + i, uint64_t(__Q[i * 2]) | + ((uint64_t(__Q[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Quotient->clearUnusedBits(); + } + + // If the caller wants the remainder + if (Remainder) { + // Set up the Remainder value's memory. + if (Remainder->BitWidth != 64 /* RHS.BitWidth */) { + if (Remainder->isSingleWord()) Remainder->set_VAL(0); + } else + Remainder->clear(); + + // The remainder is in __R. 
Reconstitute the remainder into Remainder's low + // order words. + if (rhsWords == 1) { + uint64_t tmp = + uint64_t(__R[0]) | ((uint64_t(__R[1])) << (APINT_BITS_PER_WORD / 2)); + Remainder->set_VAL(tmp); + } else { + assert(!Remainder->isSingleWord() && + "Remainder ap_private not large enough"); + for (unsigned i = 0; i < rhsWords; ++i) + Remainder->set_pVal( + i, uint64_t(__R[i * 2]) | + ((uint64_t(__R[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Remainder->clearUnusedBits(); + } + + // Clean up the memory we allocated. + if (__U != &SPACE[0]) { + delete[] __U; + delete[] __V; + delete[] __Q; + delete[] __R; + } +} + +/// @brief Logical right-shift function. +template +INLINE ap_private<_AP_W, _AP_S, _AP_C> lshr( + const ap_private<_AP_W, _AP_S, _AP_C>& LHS, uint32_t shiftAmt) { + return LHS.lshr(shiftAmt); +} + +/// Left-shift the ap_private by shiftAmt. +/// @brief Left-shift function. +template +INLINE ap_private<_AP_W, _AP_S, _AP_C> shl( + const ap_private<_AP_W, _AP_S, _AP_C>& LHS, uint32_t shiftAmt) { + return LHS.shl(shiftAmt); +} + +} // namespace ap_private_ops + +#endif // LLVM_SUPPORT_MATHEXTRAS_H + +/// This enumeration just provides for internal constants used in this +/// translation unit. +enum { + MIN_INT_BITS = 1, ///< Minimum number of bits that can be specified + ///< Note that this must remain synchronized with IntegerType::MIN_INT_BITS + MAX_INT_BITS = (1 << 23) - 1 ///< Maximum number of bits that can be specified + ///< Note that this must remain synchronized with IntegerType::MAX_INT_BITS +}; + +//===----------------------------------------------------------------------===// +// ap_private Class +//===----------------------------------------------------------------------===// + +/// ap_private - This class represents arbitrary precision constant integral +/// values. +/// It is a functional replacement for common case unsigned integer type like +/// "unsigned", "unsigned long" or "uint64_t", but also allows non-byte-width +/// integer sizes and large integer value types such as 3-bits, 15-bits, or more +/// than 64-bits of precision. ap_private provides a variety of arithmetic +/// operators +/// and methods to manipulate integer values of any bit-width. It supports both +/// the typical integer arithmetic and comparison operations as well as bitwise +/// manipulation. +/// +/// The class has several invariants worth noting: +/// * All bit, byte, and word positions are zero-based. +/// * Once the bit width is set, it doesn't change except by the Truncate, +/// SignExtend, or ZeroExtend operations. +/// * All binary operators must be on ap_private instances of the same bit +/// width. +/// Attempting to use these operators on instances with different bit +/// widths will yield an assertion. +/// * The value is stored canonically as an unsigned value. For operations +/// where it makes a difference, there are both signed and unsigned variants +/// of the operation. For example, sdiv and udiv. However, because the bit +/// widths must be the same, operations such as Mul and Add produce the same +/// results regardless of whether the values are interpreted as signed or +/// not. +/// * In general, the class tries to follow the style of computation that LLVM +/// uses in its IR. This simplifies its use for LLVM. +/// +/// @brief Class for arbitrary precision integers. 
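+///
+/// A minimal usage sketch (illustrative only; the widths and values below
+/// are arbitrary choices, not requirements of the class):
+/// \code
+///   ap_private<96, false> a("0x0123456789abcdef01234567", 16);
+///   ap_private<96, false> b(1000);
+///   ap_private<96, false> q = a / b;   // unsigned divide via divide() above
+///   ap_private<8, true> c(-1);
+///   bool neg = c.isNegative();         // true: interpreted as signed 8-bit
+/// \endcode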
+ +#if defined(_MSC_VER) +#if _MSC_VER < 1400 && !defined(for) +#define for if (0); else for +#endif +typedef unsigned __int64 ap_ulong; +typedef signed __int64 ap_slong; +#else +typedef unsigned long long ap_ulong; +typedef signed long long ap_slong; +#endif +template +struct valtype; + +template +struct valtype<_AP_N8, false> { + typedef uint64_t Type; +}; + +template +struct valtype<_AP_N8, true> { + typedef int64_t Type; +}; + +template <> +struct valtype<1, false> { + typedef unsigned char Type; +}; +template <> +struct valtype<2, false> { + typedef unsigned short Type; +}; +template <> +struct valtype<3, false> { + typedef unsigned int Type; +}; +template <> +struct valtype<4, false> { + typedef unsigned int Type; +}; +template <> +struct valtype<1, true> { + typedef signed char Type; +}; +template <> +struct valtype<2, true> { + typedef short Type; +}; +template <> +struct valtype<3, true> { + typedef int Type; +}; +template <> +struct valtype<4, true> { + typedef int Type; +}; + +template +struct ap_private_enable_if {}; +template <> +struct ap_private_enable_if { + static const bool isValid = true; +}; + +// When bitwidth < 64 +template +class ap_private<_AP_W, _AP_S, true> { + // SFINAE pattern. Only consider this class when _AP_W <= 64 + const static bool valid = ap_private_enable_if<_AP_W <= 64>::isValid; + +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + public: + typedef typename valtype<(_AP_W + 7) / 8, _AP_S>::Type ValType; + typedef ap_private<_AP_W, _AP_S> Type; + template + struct RType { + enum { + mult_w = _AP_W + _AP_W2, + mult_s = _AP_S || _AP_S2, + plus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + plus_s = _AP_S || _AP_S2, + minus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + minus_s = true, + div_w = _AP_W + _AP_S2, + div_s = _AP_S || _AP_S2, + mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)), + mod_s = _AP_S, + logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)), + logic_s = _AP_S || _AP_S2 + }; + typedef ap_private mult; + typedef ap_private plus; + typedef ap_private minus; + typedef ap_private logic; + typedef ap_private div; + typedef ap_private mod; + typedef ap_private<_AP_W, _AP_S> arg1; + typedef bool reduce; + }; + enum { APINT_BITS_PER_WORD = sizeof(uint64_t) * 8 }; + enum { + excess_bits = (_AP_W % APINT_BITS_PER_WORD) + ? APINT_BITS_PER_WORD - (_AP_W % APINT_BITS_PER_WORD) + : 0 + }; + static const uint64_t mask = ((uint64_t)~0ULL >> (excess_bits)); + static const uint64_t not_mask = ~mask; + static const uint64_t sign_bit_mask = 1ULL << (APINT_BITS_PER_WORD - 1); + template + struct sign_ext_mask { + static const uint64_t mask = ~0ULL << _AP_W1; + }; + static const int width = _AP_W; + + enum { + BitWidth = _AP_W, + _AP_N = 1, + }; + ValType VAL; ///< Used to store the <= 64 bits integer value. 
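+  // Concretely (illustrative example): for ap_private<12, true>,
+  // (_AP_W + 7) / 8 == 2, so valtype<2, true> selects short as ValType and
+  // the 12-bit value lives in its low bits; excess_bits is 52 relative to
+  // the 64-bit word, and clearUnusedBits() keeps the padding bits
+  // sign-extended. For the result traits above, mixing this type with
+  // ap_private<20, false> gives mult_w == 32 (12 + 20) and
+  // plus_w == 22 (AP_MAX(12 + 0, 20 + 1) + 1), both signed.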
+#ifdef AP_CANARY + ValType CANARY; + void check_canary() { assert(CANARY == (ValType)0xDEADBEEFDEADBEEF); } + void set_canary() { CANARY = (ValType)0xDEADBEEFDEADBEEF; } +#else + void check_canary() {} + void set_canary() {} +#endif + + INLINE ValType& get_VAL(void) { return VAL; } + INLINE ValType get_VAL(void) const { return VAL; } + INLINE ValType get_VAL(void) const volatile { return VAL; } + INLINE void set_VAL(uint64_t value) { VAL = (ValType)value; } + INLINE ValType& get_pVal(int i) { return VAL; } + INLINE ValType get_pVal(int i) const { return VAL; } + INLINE const uint64_t* get_pVal() const { + assert(0 && "invalid usage"); + return 0; + } + INLINE ValType get_pVal(int i) const volatile { return VAL; } + INLINE uint64_t* get_pVal() const volatile { + assert(0 && "invalid usage"); + return 0; + } + INLINE void set_pVal(int i, uint64_t value) { VAL = (ValType)value; } + + INLINE uint32_t getBitWidth() const { return BitWidth; } + + template + ap_private<_AP_W, _AP_S>& operator=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + ap_private<_AP_W, _AP_S>& operator=( + const volatile ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(RHS.get_VAL()); // TODO check here about ap_private + clearUnusedBits(); + return *this; + } + + void operator=(const ap_private& RHS) volatile { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + } + + ap_private& operator=(const ap_private& RHS) { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + return *this; + } + + void operator=(const volatile ap_private& RHS) volatile { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + } + + ap_private& operator=(const volatile ap_private& RHS) { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + *this = ap_private<_AP_W2, false>(op2); + return *this; + } + +#define ASSIGN_OP_FROM_INT(C_TYPE) \ + INLINE ap_private& operator=(const C_TYPE v) { \ + set_canary(); \ + this->VAL = (ValType)v; \ + clearUnusedBits(); \ + check_canary(); \ + return *this; \ + } + +ASSIGN_OP_FROM_INT(bool) +ASSIGN_OP_FROM_INT(char) +ASSIGN_OP_FROM_INT(signed char) +ASSIGN_OP_FROM_INT(unsigned char) +ASSIGN_OP_FROM_INT(short) +ASSIGN_OP_FROM_INT(unsigned short) +ASSIGN_OP_FROM_INT(int) +ASSIGN_OP_FROM_INT(unsigned int) +ASSIGN_OP_FROM_INT(long) +ASSIGN_OP_FROM_INT(unsigned long) +ASSIGN_OP_FROM_INT(ap_slong) +ASSIGN_OP_FROM_INT(ap_ulong) +#if 0 +ASSIGN_OP_FROM_INT(half) +ASSIGN_OP_FROM_INT(float) +ASSIGN_OP_FROM_INT(double) +#endif +#undef ASSIGN_OP_FROM_INT + + // XXX This is a must to prevent pointer being converted to bool. + INLINE ap_private& operator=(const char* s) { + ap_private tmp(s); // XXX direct-initialization, as ctor is explicit. 
+    operator=(tmp);
+    return *this;
+  }
+
+ private:
+  explicit INLINE ap_private(uint64_t* val) : VAL(val[0]) {
+    set_canary();
+    clearUnusedBits();
+    check_canary();
+  }
+
+  INLINE bool isSingleWord() const { return true; }
+
+ public:
+  INLINE void fromString(const char* strStart, uint32_t slen, uint8_t radix) {
+    bool isNeg = strStart[0] == '-';
+    if (isNeg) {
+      strStart++;
+      slen--;
+    }
+
+    if (strStart[0] == '0' && (strStart[1] == 'b' || strStart[1] == 'B')) {
+      // if (radix == 0) radix = 2;
+      _AP_WARNING(radix != 2, "%s seems to have base %d, but %d given.", strStart, 2, radix);
+      strStart += 2;
+      slen -= 2;
+    } else if (strStart[0] == '0' && (strStart[1] == 'o' || strStart[1] == 'O')) {
+      // if (radix == 0) radix = 8;
+      _AP_WARNING(radix != 8, "%s seems to have base %d, but %d given.", strStart, 8, radix);
+      strStart += 2;
+      slen -= 2;
+    } else if (strStart[0] == '0' && (strStart[1] == 'x' || strStart[1] == 'X')) {
+      // if (radix == 0) radix = 16;
+      _AP_WARNING(radix != 16, "%s seems to have base %d, but %d given.", strStart, 16, radix);
+      strStart += 2;
+      slen -= 2;
+    } else if (strStart[0] == '0' && (strStart[1] == 'd' || strStart[1] == 'D')) {
+      // if (radix == 0) radix = 10;
+      _AP_WARNING(radix != 10, "%s seems to have base %d, but %d given.", strStart, 10, radix);
+      strStart += 2;
+      slen -= 2;
+    } else if (radix == 0) {
+      // radix = 2; // XXX default value
+    }
+
+    // Check our assumptions here
+    assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
+           "Radix should be 2, 8, 10, or 16!");
+    assert(strStart && "String is null?");
+
+    // Clear bits.
+    uint64_t tmpVAL = VAL = 0;
+
+    switch (radix) {
+      case 2:
+        // sscanf(strStart,"%b",&VAL);
+        // tmpVAL = *strStart =='1' ? ~0ULL : 0;
+        for (; *strStart; ++strStart) {
+          assert((*strStart == '0' || *strStart == '1') &&
+                 ("Wrong binary number"));
+          tmpVAL <<= 1;
+          tmpVAL |= (*strStart - '0');
+        }
+        break;
+      case 8:
+#ifdef _MSC_VER
+        sscanf_s(strStart, "%llo", &tmpVAL, slen + 1);
+#else
+#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__)
+        sscanf(strStart, "%lo", &tmpVAL);
+#else
+        sscanf(strStart, "%llo", &tmpVAL);
+#endif //__x86_64__
+#endif //_MSC_VER
+        break;
+      case 10:
+#ifdef _MSC_VER
+        sscanf_s(strStart, "%llu", &tmpVAL, slen + 1);
+#else
+#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__)
+        sscanf(strStart, "%lu", &tmpVAL);
+#else
+        sscanf(strStart, "%llu", &tmpVAL);
+#endif //__x86_64__
+#endif //_MSC_VER
+        break;
+      case 16:
+#ifdef _MSC_VER
+        sscanf_s(strStart, "%llx", &tmpVAL, slen + 1);
+#else
+#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__)
+        sscanf(strStart, "%lx", &tmpVAL);
+#else
+        sscanf(strStart, "%llx", &tmpVAL);
+#endif //__x86_64__
+#endif //_MSC_VER
+        break;
+      default:
+        assert(0 && "Unknown radix");
+        // error
+    }
+    VAL = isNeg ?
(ValType)(-tmpVAL) : (ValType)(tmpVAL); + + clearUnusedBits(); + } + + private: + INLINE ap_private(const std::string& val, uint8_t radix = 2) : VAL(0) { + assert(!val.empty() && "String empty?"); + set_canary(); + fromString(val.c_str(), val.size(), radix); + check_canary(); + } + + INLINE ap_private(const char strStart[], uint32_t slen, uint8_t radix) + : VAL(0) { + set_canary(); + fromString(strStart, slen, radix); + check_canary(); + } + + INLINE ap_private(uint32_t numWords, const uint64_t bigVal[]) + : VAL(bigVal[0]) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + public: + INLINE ap_private() { + set_canary(); + clearUnusedBits(); + check_canary(); + } + +#define CTOR(TYPE) \ + INLINE ap_private(TYPE v) : VAL((ValType)v) { \ + set_canary(); \ + clearUnusedBits(); \ + check_canary(); \ + } + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#if 0 + CTOR(half) + CTOR(float) + CTOR(double) +#endif +#undef CTOR + + template + INLINE ap_private(const ap_private<_AP_W1, _AP_S1, _AP_OPT>& that) + : VAL((ValType)that.get_VAL()) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + template + INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, _AP_OPT>& that) + : VAL((ValType)that.get_VAL()) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + explicit INLINE ap_private(const char* val) { + set_canary(); + unsigned char radix = 10; + std::string str = ap_private_ops::parseString(val, radix); // will set radix. + std::string::size_type pos = str.find('.'); + // trunc all fraction part + if (pos != std::string::npos) str = str.substr(pos); + + ap_private<_AP_W, _AP_S> ap_private_val(str, radix); + operator=(ap_private_val); + check_canary(); + } + + INLINE ap_private(const char* val, signed char rd) { + set_canary(); + unsigned char radix = rd; + std::string str = ap_private_ops::parseString(val, radix); // will set radix. + std::string::size_type pos = str.find('.'); + // trunc all fraction part + if (pos != std::string::npos) str = str.substr(pos); + + ap_private<_AP_W, _AP_S> ap_private_val(str, radix); + operator=(ap_private_val); + check_canary(); + } + + INLINE ~ap_private() { check_canary(); } + + INLINE bool isNegative() const { + static const uint64_t sign_mask = 1ULL << (_AP_W - 1); + return _AP_S && (sign_mask & VAL); + } + + INLINE bool isPositive() const { return !isNegative(); } + + INLINE bool isStrictlyPositive() const { return !isNegative() && VAL != 0; } + + INLINE bool isAllOnesValue() const { return (mask & VAL) == mask; } + + INLINE bool operator==(const ap_private<_AP_W, _AP_S>& RHS) const { + return VAL == RHS.get_VAL(); + } + INLINE bool operator==(const ap_private<_AP_W, !_AP_S>& RHS) const { + return (uint64_t)VAL == (uint64_t)RHS.get_VAL(); + } + + INLINE bool operator==(uint64_t Val) const { return ((uint64_t)VAL == Val); } + INLINE bool operator!=(uint64_t Val) const { return ((uint64_t)VAL != Val); } + INLINE bool operator!=(const ap_private<_AP_W, _AP_S>& RHS) const { + return VAL != RHS.get_VAL(); + } + INLINE bool operator!=(const ap_private<_AP_W, !_AP_S>& RHS) const { + return (uint64_t)VAL != (uint64_t)RHS.get_VAL(); + } + + /// postfix increment. + const ap_private operator++(int) { + ap_private orig(*this); + VAL++; + clearUnusedBits(); + return orig; + } + + /// prefix increment. 
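+  /// (Illustrative note: both increment forms wrap at the bit width because
+  /// clearUnusedBits() runs after the update; e.g. an unsigned 4-bit value
+  /// holding 15 becomes 0 after ++.)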
+ const ap_private operator++() { + ++VAL; + clearUnusedBits(); + return *this; + } + + /// postfix decrement. + const ap_private operator--(int) { + ap_private orig(*this); + --VAL; + clearUnusedBits(); + return orig; + } + + /// prefix decrement. + const ap_private operator--() { + --VAL; + clearUnusedBits(); + return *this; + } + + /// one's complement. + INLINE ap_private<_AP_W + !_AP_S, true> operator~() const { + ap_private<_AP_W + !_AP_S, true> Result(*this); + Result.flip(); + return Result; + } + + /// two's complement. + INLINE typename RType<1, false>::minus operator-() const { + return ap_private<1, false>(0) - (*this); + } + + /// logic negation. + INLINE bool operator!() const { return !VAL; } + + INLINE std::string toString(uint8_t radix, bool wantSigned) const; + INLINE std::string toStringUnsigned(uint8_t radix = 10) const { + return toString(radix, false); + } + INLINE std::string toStringSigned(uint8_t radix = 10) const { + return toString(radix, true); + } + INLINE void clear() { VAL = 0; } + INLINE ap_private& clear(uint32_t bitPosition) { + VAL &= ~(1ULL << (bitPosition)); + clearUnusedBits(); + return *this; + } + + INLINE ap_private ashr(uint32_t shiftAmt) const { + if (_AP_S) + return ap_private((shiftAmt == BitWidth) ? 0 + : ((int64_t)VAL) >> (shiftAmt)); + else + return ap_private((shiftAmt == BitWidth) ? 0 + : ((uint64_t)VAL) >> (shiftAmt)); + } + + INLINE ap_private lshr(uint32_t shiftAmt) const { + return ap_private((shiftAmt == BitWidth) + ? ap_private(0) + : ap_private((VAL & mask) >> (shiftAmt))); + } + + INLINE ap_private shl(uint32_t shiftAmt) const +// just for clang compiler +#if defined(__clang__) && !defined(__CLANG_3_1__) + __attribute__((no_sanitize("undefined"))) +#endif + { + if (shiftAmt > BitWidth) { + if (!isNegative()) + return ap_private(0); + else + return ap_private(-1); + } + if (shiftAmt == BitWidth) + return ap_private(0); + else + return ap_private((VAL) << (shiftAmt)); + // return ap_private((shiftAmt == BitWidth) ? 
ap_private(0ULL) : + // ap_private(VAL << shiftAmt)); + } + + INLINE int64_t getSExtValue() const { return VAL; } + + // XXX XXX this function is used in CBE + INLINE uint64_t getZExtValue() const { return VAL & mask; } + + template + INLINE ap_private(const _private_range_ref<_AP_W2, _AP_S2>& ref) { + set_canary(); + *this = ref.get(); + check_canary(); + } + + template + INLINE ap_private(const _private_bit_ref<_AP_W2, _AP_S2>& ref) { + set_canary(); + *this = ((uint64_t)(bool)ref); + check_canary(); + } + +// template +// INLINE ap_private(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { +// set_canary(); +// *this = ref.get(); +// check_canary(); +// } +// +// template +// INLINE ap_private( +// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { +// set_canary(); +// *this = ((val.operator ap_private<_AP_W2, false>())); +// check_canary(); +// } +// +// template +// INLINE ap_private( +// const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { +// set_canary(); +// *this = (uint64_t)(bool)val; +// check_canary(); +// } + + INLINE void write(const ap_private<_AP_W, _AP_S>& op2) volatile { + *this = (op2); + } + + // Explicit conversions to C interger types + //----------------------------------------------------------- + INLINE operator ValType() const { return get_VAL(); } + + INLINE int to_uchar() const { return (unsigned char)get_VAL(); } + + INLINE int to_char() const { return (signed char)get_VAL(); } + + INLINE int to_ushort() const { return (unsigned short)get_VAL(); } + + INLINE int to_short() const { return (short)get_VAL(); } + + INLINE int to_int() const { + // ap_private<64 /* _AP_W */, _AP_S> res(V); + return (int)get_VAL(); + } + + INLINE unsigned to_uint() const { return (unsigned)get_VAL(); } + + INLINE long to_long() const { return (long)get_VAL(); } + + INLINE unsigned long to_ulong() const { return (unsigned long)get_VAL(); } + + INLINE ap_slong to_int64() const { return (ap_slong)get_VAL(); } + + INLINE ap_ulong to_uint64() const { return (ap_ulong)get_VAL(); } + + INLINE double to_double() const { + if (isNegative()) + return roundToDouble(true); + else + return roundToDouble(false); + } + + INLINE unsigned length() const { return _AP_W; } + + INLINE bool isMinValue() const { return VAL == 0; } + template + INLINE ap_private& operator&=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) & RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator|=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) | RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator^=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) ^ RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator*=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) * RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator+=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) + RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator-=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(((uint64_t)VAL) - RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::logic operator&( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { + 
typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) & + RHS.get_VAL()); + return Ret; + } else { + typename RType<_AP_W1, _AP_S1>::logic Ret = *this; + return Ret & RHS; + } + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::logic operator^( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { + typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) ^ + RHS.get_VAL()); + return Ret; + } else { + typename RType<_AP_W1, _AP_S1>::logic Ret = *this; + return Ret ^ RHS; + } + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::logic operator|( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { + typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) | + RHS.get_VAL()); + return Ret; + } else { + typename RType<_AP_W1, _AP_S1>::logic Ret = *this; + return Ret | RHS; + } + } + + INLINE ap_private And(const ap_private& RHS) const { + return ap_private(VAL & RHS.get_VAL()); + } + + INLINE ap_private Or(const ap_private& RHS) const { + return ap_private(VAL | RHS.get_VAL()); + } + + INLINE ap_private Xor(const ap_private& RHS) const { + return ap_private(VAL ^ RHS.get_VAL()); + } +#if 1 + template + INLINE typename RType<_AP_W1, _AP_S1>::mult operator*( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::mult_w <= 64) { + typename RType<_AP_W1, _AP_S1>::mult Result(((uint64_t)VAL) * + RHS.get_VAL()); + return Result; + } else { + typename RType<_AP_W1, _AP_S1>::mult Result(*this); + Result *= RHS; + return Result; + } + } +#endif + INLINE ap_private Mul(const ap_private& RHS) const { + return ap_private(VAL * RHS.get_VAL()); + } + + INLINE ap_private Add(const ap_private& RHS) const { + return ap_private(VAL + RHS.get_VAL()); + } + + INLINE ap_private Sub(const ap_private& RHS) const { + return ap_private(VAL - RHS.get_VAL()); + } + + INLINE ap_private& operator&=(uint64_t RHS) { + VAL &= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator|=(uint64_t RHS) { + VAL |= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator^=(uint64_t RHS) { + VAL ^= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator*=(uint64_t RHS) { + VAL *= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator+=(uint64_t RHS) { + VAL += (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator-=(uint64_t RHS) { + VAL -= (ValType)RHS; + clearUnusedBits(); + return *this; + } + + INLINE bool isMinSignedValue() const { + static const uint64_t min_mask = ~(~0ULL << (_AP_W - 1)); + return BitWidth == 1 ? VAL == 1 + : (ap_private_ops::isNegative<_AP_W>(*this) && + ((min_mask & VAL) == 0)); + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::plus operator+( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::plus_w <= 64) + return typename RType<_AP_W1, _AP_S1>::plus( + RType<_AP_W1, _AP_S1>::plus_s + ? 
int64_t(((uint64_t)VAL) + RHS.get_VAL()) + : uint64_t(((uint64_t)VAL) + RHS.get_VAL())); + typename RType<_AP_W1, _AP_S1>::plus Result = RHS; + Result += VAL; + return Result; + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::minus operator-( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::minus_w <= 64) + return typename RType<_AP_W1, _AP_S1>::minus( + int64_t(((uint64_t)VAL) - RHS.get_VAL())); + typename RType<_AP_W1, _AP_S1>::minus Result = *this; + Result -= RHS; + return Result; + } + + INLINE uint32_t countPopulation() const { + return ap_private_ops::CountPopulation_64(VAL); + } + INLINE uint32_t countLeadingZeros() const { + int remainder = BitWidth % 64; + int excessBits = (64 - remainder) % 64; + uint32_t Count = ap_private_ops::CountLeadingZeros_64(VAL); + if (Count) Count -= excessBits; + return AESL_std::min(Count, (uint32_t)_AP_W); + } + + /// HiBits - This function returns the high "numBits" bits of this ap_private. + INLINE ap_private<_AP_W, _AP_S> getHiBits(uint32_t numBits) const { + ap_private<_AP_W, _AP_S> ret(*this); + ret = (ret) >> (BitWidth - numBits); + return ret; + } + + /// LoBits - This function returns the low "numBits" bits of this ap_private. + INLINE ap_private<_AP_W, _AP_S> getLoBits(uint32_t numBits) const { + ap_private<_AP_W, _AP_S> ret(((uint64_t)VAL) << (BitWidth - numBits)); + ret = (ret) >> (BitWidth - numBits); + return ret; + // return ap_private(numBits, (VAL << (BitWidth - numBits))>> (BitWidth - + // numBits)); + } + + INLINE ap_private<_AP_W, _AP_S>& set(uint32_t bitPosition) { + VAL |= (1ULL << (bitPosition)); + clearUnusedBits(); + return *this; // clearUnusedBits(); + } + + INLINE void set() { + VAL = (ValType)~0ULL; + clearUnusedBits(); + } + + template + INLINE void set(const ap_private<_AP_W3, false>& val) { + operator=(ap_private<_AP_W3, _AP_S>(val)); + } + + INLINE void set(const ap_private& val) { operator=(val); } + + INLINE void clearUnusedBits(void) volatile +// just for clang compiler +#if defined(__clang__) && !defined(__CLANG_3_1__) + __attribute__((no_sanitize("undefined"))) +#endif + { + enum { excess_bits = (_AP_W % 64) ? 64 - _AP_W % 64 : 0 }; + VAL = (ValType)( + _AP_S + ? ((((int64_t)VAL) << (excess_bits)) >> (excess_bits)) + : (excess_bits ? (((uint64_t)VAL) << (excess_bits)) >> (excess_bits) + : (uint64_t)VAL)); + } + + INLINE void clearUnusedBitsToZero(void) { + enum { excess_bits = (_AP_W % 64) ? 64 - _AP_W % 64 : 0 }; + static uint64_t mask = ~0ULL >> (excess_bits); + VAL &= mask; + } + + INLINE ap_private udiv(const ap_private& RHS) const { + return ap_private((uint64_t)VAL / RHS.get_VAL()); + } + + /// Signed divide this ap_private by ap_private RHS. + /// @brief Signed division function for ap_private. + INLINE ap_private sdiv(const ap_private& RHS) const { + if (isNegative()) + if (RHS.isNegative()) + return ((uint64_t)(0 - (*this))) / (uint64_t)(0 - RHS); + else + return 0 - ((uint64_t)(0 - (*this)) / (uint64_t)(RHS)); + else if (RHS.isNegative()) + return 0 - (this->udiv((ap_private)(0 - RHS))); + return this->udiv(RHS); + } + + template + INLINE ap_private urem(const ap_private<_AP_W, _AP_S2>& RHS) const { + assert(RHS.get_VAL() != 0 && "Divide by 0"); + return ap_private(((uint64_t)VAL) % ((uint64_t)RHS.get_VAL())); + } + + /// Signed remainder operation on ap_private. + /// @brief Function for signed remainder operation. 
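+  /// As with C's % on built-in types, the sign of the result follows the
+  /// dividend (illustrative values): for 8-bit signed operands,
+  /// ap_private<8, true>(-7).srem(ap_private<8, true>(3)) yields -1, while
+  /// ap_private<8, true>(7).srem(ap_private<8, true>(-3)) yields 1.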
+ template + INLINE ap_private srem(const ap_private<_AP_W, _AP_S2>& RHS) const { + if (isNegative()) { + ap_private lhs = 0 - (*this); + if (RHS.isNegative()) { + ap_private rhs = 0 - RHS; + return 0 - (lhs.urem(rhs)); + } else + return 0 - (lhs.urem(RHS)); + } else if (RHS.isNegative()) { + ap_private rhs = 0 - RHS; + return this->urem(rhs); + } + return this->urem(RHS); + } + + template + INLINE bool eq(const ap_private<_AP_W1, _AP_S1>& RHS) const { + return (*this) == RHS; + } + + template + INLINE bool ne(const ap_private<_AP_W1, _AP_S1>& RHS) const { + return !((*this) == RHS); + } + + /// Regards both *this and RHS as unsigned quantities and compares them for + /// the validity of the less-than relationship. + /// @returns true if *this < RHS when both are considered unsigned. + /// @brief Unsigned less than comparison + template + INLINE bool ult(const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (_AP_W1 <= 64) { + uint64_t lhsZext = ((uint64_t(VAL)) << (64 - _AP_W)) >> (64 - _AP_W); + uint64_t rhsZext = + ((uint64_t(RHS.get_VAL())) << (64 - _AP_W1)) >> (64 - _AP_W1); + return lhsZext < rhsZext; + } else + return RHS.uge(*this); + } + + /// Regards both *this and RHS as signed quantities and compares them for + /// validity of the less-than relationship. + /// @returns true if *this < RHS when both are considered signed. + /// @brief Signed less than comparison + template + INLINE bool slt(const ap_private<_AP_W1, _AP_S1>& RHS) const +// just for clang compiler +#if defined(__clang__) && !defined(__CLANG_3_1__) + __attribute__((no_sanitize("undefined"))) +#endif + { + if (_AP_W1 <= 64) { + int64_t lhsSext = ((int64_t(VAL)) << (64 - _AP_W)) >> (64 - _AP_W); + int64_t rhsSext = + ((int64_t(RHS.get_VAL())) << (64 - _AP_W1)) >> (64 - _AP_W1); + return lhsSext < rhsSext; + } else + return RHS.sge(*this); + } + + /// Regards both *this and RHS as unsigned quantities and compares them for + /// validity of the less-or-equal relationship. + /// @returns true if *this <= RHS when both are considered unsigned. + /// @brief Unsigned less or equal comparison + template + INLINE bool ule(const ap_private<_AP_W1, _AP_S1>& RHS) const { + return ult(RHS) || eq(RHS); + } + + /// Regards both *this and RHS as signed quantities and compares them for + /// validity of the less-or-equal relationship. + /// @returns true if *this <= RHS when both are considered signed. + /// @brief Signed less or equal comparison + template + INLINE bool sle(const ap_private<_AP_W1, _AP_S1>& RHS) const { + return slt(RHS) || eq(RHS); + } + + /// Regards both *this and RHS as unsigned quantities and compares them for + /// the validity of the greater-than relationship. + /// @returns true if *this > RHS when both are considered unsigned. + /// @brief Unsigned greather than comparison + template + INLINE bool ugt(const ap_private<_AP_W1, _AP_S1>& RHS) const { + return !ult(RHS) && !eq(RHS); + } + + /// Regards both *this and RHS as signed quantities and compares them for + /// the validity of the greater-than relationship. + /// @returns true if *this > RHS when both are considered signed. + /// @brief Signed greather than comparison + template + INLINE bool sgt(const ap_private<_AP_W1, _AP_S1>& RHS) const { + return !slt(RHS) && !eq(RHS); + } + + /// Regards both *this and RHS as unsigned quantities and compares them for + /// validity of the greater-or-equal relationship. + /// @returns true if *this >= RHS when both are considered unsigned. 
+ /// @brief Unsigned greater or equal comparison + template + INLINE bool uge(const ap_private<_AP_W1, _AP_S1>& RHS) const { + return !ult(RHS); + } + + /// Regards both *this and RHS as signed quantities and compares them for + /// validity of the greater-or-equal relationship. + /// @returns true if *this >= RHS when both are considered signed. + /// @brief Signed greather or equal comparison + template + INLINE bool sge(const ap_private<_AP_W1, _AP_S1>& RHS) const { + return !slt(RHS); + } + + INLINE ap_private abs() const { + if (isNegative()) return -(*this); + return *this; + } + + INLINE ap_private<_AP_W, false> get() const { + ap_private<_AP_W, false> ret(*this); + return ret; + } + + INLINE static uint32_t getBitsNeeded(const char* str, uint32_t slen, + uint8_t radix) { + return _AP_W; + } + + INLINE uint32_t getActiveBits() const { + uint32_t bits = _AP_W - countLeadingZeros(); + return bits ? bits : 1; + } + + INLINE double roundToDouble(bool isSigned = false) const { + return isSigned ? double((int64_t)VAL) : double((uint64_t)VAL); + } + + /*Reverse the contents of ap_private instance. I.e. LSB becomes MSB and vise + * versa*/ + INLINE ap_private& reverse() { + for (int i = 0; i < _AP_W / 2; ++i) { + bool tmp = operator[](i); + if (operator[](_AP_W - 1 - i)) + set(i); + else + clear(i); + if (tmp) + set(_AP_W - 1 - i); + else + clear(_AP_W - 1 - i); + } + clearUnusedBits(); + return *this; + } + + /*Return true if the value of ap_private instance is zero*/ + INLINE bool iszero() const { return isMinValue(); } + + INLINE bool to_bool() const { return !iszero(); } + + /* x < 0 */ + INLINE bool sign() const { + if (isNegative()) return true; + return false; + } + + /* x[i] = !x[i] */ + INLINE void invert(int i) { + assert(i >= 0 && "Attempting to read bit with negative index"); + assert(i < _AP_W && "Attempting to read bit beyond MSB"); + flip(i); + } + + /* x[i] */ + INLINE bool test(int i) const { + assert(i >= 0 && "Attempting to read bit with negative index"); + assert(i < _AP_W && "Attempting to read bit beyond MSB"); + return operator[](i); + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_private object n places to the left + INLINE void lrotate(int n) { + assert(n >= 0 && "Attempting to shift negative index"); + assert(n < _AP_W && "Shift value larger than bit width"); + operator=(shl(n) | lshr(_AP_W - n)); + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_private object n places to the right + INLINE void rrotate(int n) { + assert(n >= 0 && "Attempting to shift negative index"); + assert(n < _AP_W && "Shift value larger than bit width"); + operator=(lshr(n) | shl(_AP_W - n)); + } + + // Set the ith bit into v + INLINE void set(int i, bool v) { + assert(i >= 0 && "Attempting to write bit with negative index"); + assert(i < _AP_W && "Attempting to write bit beyond MSB"); + v ? set(i) : clear(i); + } + + // Set the ith bit into v + INLINE void set_bit(int i, bool v) { + assert(i >= 0 && "Attempting to write bit with negative index"); + assert(i < _AP_W && "Attempting to write bit beyond MSB"); + v ? set(i) : clear(i); + } + + // Get the value of ith bit + INLINE bool get_bit(int i) const { + assert(i >= 0 && "Attempting to read bit with negative index"); + assert(i < _AP_W && "Attempting to read bit beyond MSB"); + return (((1ULL << i) & VAL) != 0); + } + + /// Toggle all bits. 
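+  /// (Illustrative example: for an unsigned 4-bit value, flip() turns
+  /// 0b0101 into 0b1010; bits above the 4-bit width stay cleared by mask.)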
+ INLINE ap_private& flip() { + VAL = (ValType)((~0ULL ^ VAL) & mask); + clearUnusedBits(); + return *this; + } + + /// Toggles a given bit to its opposite value. + INLINE ap_private& flip(uint32_t bitPosition) { + assert(bitPosition < BitWidth && "Out of the bit-width range!"); + set_bit(bitPosition, !get_bit(bitPosition)); + return *this; + } + + // complements every bit + INLINE void b_not() { flip(); } + +// Binary Arithmetic +//----------------------------------------------------------- +#define OP_BIN_AP(Sym, Rty, Fun) \ + template \ + INLINE typename RType<_AP_W2, _AP_S2>::Rty operator Sym( \ + const ap_private<_AP_W2, _AP_S2>& op) const { \ + typename RType<_AP_W2, _AP_S2>::Rty lhs(*this); \ + typename RType<_AP_W2, _AP_S2>::Rty rhs(op); \ + return lhs.Fun(rhs); \ + } + +/// Bitwise and, or, xor +// OP_BIN_AP(&,logic, And) +// OP_BIN_AP(|,logic, Or) +// OP_BIN_AP(^,logic, Xor) +#undef OP_BIN_AP + + template + INLINE typename RType<_AP_W2, _AP_S2>::div operator/( + const ap_private<_AP_W2, _AP_S2>& op) const { + ap_private _AP_W2 ? _AP_S + : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> + lhs = *this; + ap_private _AP_W2 ? _AP_S + : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> + rhs = op; + return typename RType<_AP_W2, _AP_S2>::div( + (_AP_S || _AP_S2) ? lhs.sdiv(rhs) : lhs.udiv(rhs)); + } + + template + INLINE typename RType<_AP_W2, _AP_S2>::mod operator%( + const ap_private<_AP_W2, _AP_S2>& op) const { + ap_private _AP_W2 ? _AP_S + : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> + lhs = *this; + ap_private _AP_W2 ? _AP_S + : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> + rhs = op; + typename RType<_AP_W2, _AP_S2>::mod res = + typename RType<_AP_W2, _AP_S2>::mod(_AP_S ? lhs.srem(rhs) + : lhs.urem(rhs)); + return res; + } + +#define OP_ASSIGN_AP_2(Sym) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator Sym##=( \ + const ap_private<_AP_W2, _AP_S2>& op) { \ + *this = operator Sym(op); \ + return *this; \ + } + + OP_ASSIGN_AP_2(/) + OP_ASSIGN_AP_2(%) +#undef OP_ASSIGN_AP_2 + +/// Bitwise assign: and, or, xor +//------------------------------------------------------------- +// OP_ASSIGN_AP(&) +// OP_ASSIGN_AP(^) +// OP_ASSIGN_AP(|) + +#define OP_LEFT_SHIFT_CTYPE(TYPE, SIGNED) \ + INLINE ap_private operator<<(const TYPE op) const { \ + if (op >= _AP_W) return ap_private(0); \ + if (SIGNED && op < 0) return *this >> (0 - op); \ + return shl(op); \ + } + + // OP_LEFT_SHIFT_CTYPE(bool, false) + OP_LEFT_SHIFT_CTYPE(char, CHAR_IS_SIGNED) + OP_LEFT_SHIFT_CTYPE(signed char, true) + OP_LEFT_SHIFT_CTYPE(unsigned char, false) + OP_LEFT_SHIFT_CTYPE(short, true) + OP_LEFT_SHIFT_CTYPE(unsigned short, false) + OP_LEFT_SHIFT_CTYPE(int, true) + OP_LEFT_SHIFT_CTYPE(unsigned int, false) + OP_LEFT_SHIFT_CTYPE(long, true) + OP_LEFT_SHIFT_CTYPE(unsigned long, false) + OP_LEFT_SHIFT_CTYPE(long long, true) + OP_LEFT_SHIFT_CTYPE(unsigned long long, false) +#if 0 + OP_LEFT_SHIFT_CTYPE(half, false) + OP_LEFT_SHIFT_CTYPE(float, false) + OP_LEFT_SHIFT_CTYPE(double, false) +#endif + +#undef OP_LEFT_SHIFT_CTYPE + + template + INLINE ap_private operator<<(const ap_private<_AP_W2, _AP_S2>& op2) const { + if (_AP_S2 == false) { + uint32_t sh = op2.to_uint(); + return *this << sh; + } else { + int sh = op2.to_int(); + return *this << sh; + } + } + +#define OP_RIGHT_SHIFT_CTYPE(TYPE, SIGNED) \ + INLINE ap_private operator>>(const TYPE op) const { \ + if (op >= _AP_W) { \ + if (isNegative()) \ + return ap_private(-1); \ + else \ + return ap_private(0); \ + } \ + if ((SIGNED) && op < 0) return *this << (0 - op); 
\ + if (_AP_S) \ + return ashr(op); \ + else \ + return lshr(op); \ + } + + // OP_RIGHT_SHIFT_CTYPE(bool, false) + OP_RIGHT_SHIFT_CTYPE(char, CHAR_IS_SIGNED) + OP_RIGHT_SHIFT_CTYPE(signed char, true) + OP_RIGHT_SHIFT_CTYPE(unsigned char, false) + OP_RIGHT_SHIFT_CTYPE(short, true) + OP_RIGHT_SHIFT_CTYPE(unsigned short, false) + OP_RIGHT_SHIFT_CTYPE(int, true) + OP_RIGHT_SHIFT_CTYPE(unsigned int, false) + OP_RIGHT_SHIFT_CTYPE(long, true) + OP_RIGHT_SHIFT_CTYPE(unsigned long, false) + OP_RIGHT_SHIFT_CTYPE(unsigned long long, false) + OP_RIGHT_SHIFT_CTYPE(long long, true) +#if 0 + OP_RIGHT_SHIFT_CTYPE(half, false) + OP_RIGHT_SHIFT_CTYPE(float, false) + OP_RIGHT_SHIFT_CTYPE(double, false) +#endif + +#undef OP_RIGHT_SHIFT_CTYPE + + template + INLINE ap_private operator>>(const ap_private<_AP_W2, _AP_S2>& op2) const { + if (_AP_S2 == false) { + uint32_t sh = op2.to_uint(); + return *this >> sh; + } else { + int sh = op2.to_int(); + return *this >> sh; + } + } + + /// Shift assign + //----------------------------------------------------------------- + + //INLINE const ap_private& operator<<=(uint32_t shiftAmt) { + // VAL <<= shiftAmt; + // clearUnusedBits(); + // return *this; + //} + +#define OP_ASSIGN_AP(Sym) \ + template \ + INLINE ap_private& operator Sym##=(int op) { \ + *this = operator Sym(op); \ + clearUnusedBits(); \ + return *this; \ + } \ + INLINE ap_private& operator Sym##=(unsigned int op) { \ + *this = operator Sym(op); \ + clearUnusedBits(); \ + return *this; \ + } \ + template \ + INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \ + *this = operator Sym(op); \ + clearUnusedBits(); \ + return *this; \ + } + + OP_ASSIGN_AP(>>) + OP_ASSIGN_AP(<<) +#undef OP_ASSIGN_AP + + /// Comparisons + //----------------------------------------------------------------- + template + INLINE bool operator==(const ap_private<_AP_W1, _AP_S1>& op) const { + enum { _AP_MAX_W = AP_MAX(AP_MAX(_AP_W, _AP_W1), 32) }; + ap_private<_AP_MAX_W, false> lhs(*this); + ap_private<_AP_MAX_W, false> rhs(op); + if (_AP_MAX_W <= 64) { + return (uint64_t)lhs.get_VAL() == (uint64_t)rhs.get_VAL(); + } else + return lhs == rhs; + } + + template + INLINE bool operator!=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this == op); + } + + template + INLINE bool operator>(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) + }; + ap_private<_AP_MAX_W, _AP_S> lhs(*this); + ap_private<_AP_MAX_W, _AP_S2> rhs(op); + // this will follow gcc rule for comparison + // between different bitwidth and signness + if (_AP_S == _AP_S2) + return _AP_S ? lhs.sgt(rhs) : lhs.ugt(rhs); + else if (_AP_W < 32 && _AP_W2 < 32) + // different signness but both bitwidth is less than 32 + return lhs.sgt(rhs); + else + // different signness but bigger bitwidth + // is greater or equal to 32 + if (_AP_S) + if (_AP_W2 >= _AP_W) + return lhs.ugt(rhs); + else + return lhs.sgt(rhs); + else if (_AP_W >= _AP_W2) + return lhs.ugt(rhs); + else + return lhs.sgt(rhs); + } + + template + INLINE bool operator<=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this > op); + } + + template + INLINE bool operator<(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) + }; + ap_private<_AP_MAX_W, _AP_S> lhs(*this); + ap_private<_AP_MAX_W, _AP_S2> rhs(op); + if (_AP_S == _AP_S2) + return _AP_S ? 
lhs.slt(rhs) : lhs.ult(rhs); + else if (_AP_W < 32 && _AP_W2 < 32) + return lhs.slt(rhs); + else if (_AP_S) + if (_AP_W2 >= _AP_W) + return lhs.ult(rhs); + else + return lhs.slt(rhs); + else if (_AP_W >= _AP_W2) + return lhs.ult(rhs); + else + return lhs.slt(rhs); + } + + template + INLINE bool operator>=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this < op); + } + + /// Bit and Part Select + //-------------------------------------------------------------- + // FIXME now _private_range_ref refs to _AP_ROOT_TYPE(struct ssdm_int). + INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { + return _private_range_ref<_AP_W, _AP_S>( + const_cast*>(this), Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { + return _private_range_ref<_AP_W, _AP_S>( + (const_cast*>(this)), Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + INLINE _private_bit_ref<_AP_W, _AP_S> operator[](int index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index); + } + + template + INLINE _private_bit_ref<_AP_W, _AP_S> operator[]( + const ap_private<_AP_W2, _AP_S2>& index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); + } + + INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](int index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index); + } + + template + INLINE const _private_bit_ref<_AP_W, _AP_S> operator[]( + const ap_private<_AP_W2, _AP_S2>& index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index.to_int()); + } + + INLINE _private_bit_ref<_AP_W, _AP_S> bit(int index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index); + } + + template + INLINE _private_bit_ref<_AP_W, _AP_S> bit(const ap_private<_AP_W2, _AP_S2>& index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); + } + + INLINE const _private_bit_ref<_AP_W, _AP_S> bit(int index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index); + } + + template + INLINE const _private_bit_ref<_AP_W, _AP_S> bit( + const ap_private<_AP_W2, _AP_S2>& index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index.to_int()); + } + +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// concat(const ap_private<_AP_W2, _AP_S2>& a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// concat(ap_private<_AP_W2, _AP_S2>& a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(const ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(const ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> 
>( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(_private_range_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(_private_bit_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, +// a2); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) const { +// return ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast&>(*this), +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, +// a2); +// } +// +// template +// 
INLINE
+//   ap_concat_ref<_AP_W, ap_private, 1,
+//                 af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
+//   operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>
+//                 &a2) const {
+//     return ap_concat_ref<
+//         _AP_W, ap_private, 1,
+//         af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(
+//         const_cast<ap_private<_AP_W, _AP_S>&>(*this),
+//         const_cast<af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2,
+//                               _AP_N2>&>(a2));
+//   }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE
+//     ap_concat_ref<_AP_W, ap_private, 1,
+//                   af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
+//     operator,(
+//         af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) {
+//   return ap_concat_ref<
+//       _AP_W, ap_private, 1,
+//       af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2);
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator&(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this & a2.get();
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator|(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this | a2.get();
+// }
+//
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private operator^(
+//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
+//   return *this ^ a2.get();
+// }
+
+  // Reduce operation
+  //-----------------------------------------------------------
+  INLINE bool and_reduce() const { return (VAL & mask) == mask; }
+
+  INLINE bool nand_reduce() const { return (VAL & mask) != mask; }
+
+  INLINE bool or_reduce() const { return (bool)VAL; }
+
+  INLINE bool nor_reduce() const { return VAL == 0; }
+
+  INLINE bool xor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? true : false;
+  }
+
+  INLINE bool xnor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? false : true;
+  }
+
+  INLINE std::string to_string(uint8_t radix = 2, bool sign = false) const {
+    return toString(radix, radix == 10 ? _AP_S : sign);
+  }
+}; // End of class ap_private <_AP_W, _AP_S, true>
+
+template <int _AP_W, bool _AP_S>
+std::string ap_private<_AP_W, _AP_S, true>::toString(uint8_t radix,
+                                                     bool wantSigned) const {
+  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
+         "Radix should be 2, 8, 10, or 16!");
+  static const char* digits[] = {"0", "1", "2", "3", "4", "5", "6", "7",
+                                 "8", "9", "a", "b", "c", "d", "e", "f"};
+  std::string result;
+  if (radix != 10) {
+    // For radix 2, 8 and 16 we can just shift instead of divide, because the
+    // number of bits per digit (1, 3 and 4 respectively) is fixed. We just
+    // shift until the value is zero.
+
+    // First, check for a zero value and just short circuit the logic below.
+    if (*this == (uint64_t)(0)) {
+      // Always generate a radix indicator because fixed-point
+      // formats require it.
+      switch (radix) {
+        case 2:
+          result = "0b0";
+          break;
+        case 8:
+          result = "0o0";
+          break;
+        case 16:
+          result = "0x0";
+          break;
+        default:
+          assert("invalid radix" && 0);
+      }
+    } else {
+      ap_private<_AP_W, false, true> tmp(*this);
+      size_t insert_at = 0;
+      bool leading_zero = true;
+      if (wantSigned && isNegative()) {
+        // They want to print the signed version and it is a negative value.
+        // Flip the bits and add one to turn it into the equivalent positive
+        // value and put a '-' in the result.
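+        // (Illustrative example: for an 8-bit value 0xF5, i.e. -11, the
+        // flip() below gives 0x0A and the increment gives 0x0B == 11, which
+        // is then printed after the '-'.)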
+ tmp.flip(); + tmp++; + result = "-"; + insert_at = 1; + leading_zero = false; + } + switch (radix) { + case 2: + result += "0b"; + break; + case 8: + result += "0o"; + break; + case 16: + result += "0x"; + break; + default: + assert("invalid radix" && 0); + } + insert_at += 2; + + // Just shift tmp right for each digit width until it becomes zero + uint32_t shift = (radix == 16 ? 4 : (radix == 8 ? 3 : 1)); + uint64_t mask = radix - 1; + ap_private<_AP_W, false, true> zero(0); + unsigned bits = 0; + bool msb = false; + while (tmp.ne(zero)) { + unsigned digit = (unsigned)(tmp.get_VAL() & mask); + result.insert(insert_at, digits[digit]); + tmp = tmp.lshr(shift); + bits++; + msb = (digit >> (shift - 1)) == 1; + } + bits *= shift; + if (bits < _AP_W && leading_zero && msb) + result.insert(insert_at, digits[0]); + } + return result; + } + + ap_private<_AP_W, false, true> tmp(*this); + ap_private<6, false, true> divisor(radix); + ap_private<_AP_W, _AP_S, true> zero(0); + size_t insert_at = 0; + if (wantSigned && isNegative()) { + // They want to print the signed version and it is a negative value + // Flip the bits and add one to turn it into the equivalent positive + // value and put a '-' in the result. + tmp.flip(); + tmp++; + result = "-"; + insert_at = 1; + } + if (tmp == ap_private<_AP_W, false, true>(0ULL)) + result = "0"; + else + while (tmp.ne(zero)) { + ap_private<_AP_W, false, true> APdigit = tmp % divisor; + ap_private<_AP_W, false, true> tmp2 = tmp / divisor; + uint32_t digit = (uint32_t)(APdigit.getZExtValue()); + assert(digit < radix && "divide failed"); + result.insert(insert_at, digits[digit]); + tmp = tmp2; + } + return result; + +} // End of ap_private<_AP_W, _AP_S, true>::toString() + +// bitwidth > 64 +template +class ap_private<_AP_W, _AP_S, false> { + // SFINAE pattern. Only consider this class when _AP_W > 64 + const static bool valid = ap_private_enable_if<(_AP_W > 64)>::isValid; + +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + public: + enum { BitWidth = _AP_W, _AP_N = (_AP_W + 63) / 64 }; + static const int width = _AP_W; + + private: + /// This constructor is used only internally for speed of construction of + /// temporaries. It is unsafe for general use so it is not public. + + /* Constructors */ + /// Note that numWords can be smaller or larger than the corresponding bit + /// width but any extraneous bits will be dropped. + /// @param numWords the number of words in bigVal + /// @param bigVal a sequence of words to form the initial value of the + /// ap_private + /// @brief Construct an ap_private, initialized as bigVal[]. + INLINE ap_private(uint32_t numWords, const uint64_t bigVal[]) { + set_canary(); + assert(bigVal && "Null pointer detected!"); + { + // Get memory, cleared to 0 + memset(pVal, 0, _AP_N * sizeof(uint64_t)); + + // Calculate the number of words to copy + uint32_t words = AESL_std::min(numWords, _AP_N); + // Copy the words from bigVal to pVal + memcpy(pVal, bigVal, words * APINT_WORD_SIZE); + if (words >= _AP_W) clearUnusedBits(); + // Make sure unused high bits are cleared + } + check_canary(); + } + + /// This constructor interprets Val as a string in the given radix. The + /// interpretation stops when the first charater that is not suitable for the + /// radix is encountered. Acceptable radix values are 2, 8, 10 and 16. It is + /// an error for the value implied by the string to require more bits than + /// numBits. 
+ /// @param val the string to be interpreted
+ /// @param radix the radix of Val to use for the interpretation
+ /// @brief Construct an ap_private from a string representation.
+ INLINE ap_private(const std::string& val, uint8_t radix = 2) {
+ set_canary();
+ assert(!val.empty() && "The input string is empty.");
+ const char* c_str = val.c_str();
+ fromString(c_str, val.size(), radix);
+ check_canary();
+ }
+
+ /// This constructor interprets the slen characters starting at StrStart as
+ /// a string in the given radix. The interpretation stops when the first
+ /// character that is not suitable for the radix is encountered. Acceptable
+ /// radix values are 2, 8, 10 and 16. It is an error for the value implied by
+ /// the string to require more bits than numBits.
+ /// @param strStart the start of the string to be interpreted
+ /// @param slen the maximum number of characters to interpret
+ /// @param radix the radix to use for the conversion
+ /// @brief Construct an ap_private from a string representation.
+ /// This method does not consider whether it is negative or not.
+ INLINE ap_private(const char strStart[], uint32_t slen, uint8_t radix) {
+ set_canary();
+ fromString(strStart, slen, radix);
+ check_canary();
+ }
+
+ INLINE void report() {
+ _AP_ERROR(_AP_W > MAX_MODE(AP_INT_MAX_W) * 1024,
+ "ap_%sint<%d>: Bitwidth exceeds the "
+ "default max value %d. Please use macro "
+ "AP_INT_MAX_W to set a larger max value.",
+ _AP_S ? "" : "u", _AP_W, MAX_MODE(AP_INT_MAX_W) * 1024);
+ }
+ /// For bit-widths greater than 64 the value is stored in the pVal array
+ /// of 64-bit words; the single-word VAL member of the <= 64-bit
+ /// specialization is not needed here.
+ // uint64_t VAL; ///< Used to store the <= 64 bits integer value.
+ uint64_t pVal[_AP_N]; ///< Used to store the >64 bits integer value.
+#ifdef AP_CANARY
+ uint64_t CANARY;
+ INLINE void check_canary() { assert(CANARY == (uint64_t)0xDEADBEEFDEADBEEF); }
+ INLINE void set_canary() { CANARY = (uint64_t)0xDEADBEEFDEADBEEF; }
+#else
+ INLINE void check_canary() {}
+ INLINE void set_canary() {}
+#endif
+
+ public:
+ typedef typename valtype<8, _AP_S>::Type ValType;
+ typedef ap_private<_AP_W, _AP_S> Type;
+ // FIXME remove friend type?
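+ // --- Editor's sketch (illustrative, not part of the original header): how
+ // a wide value maps onto the pVal word array declared above. Assumes a
+ // hypothetical _AP_W = 100, so _AP_N = (100 + 63) / 64 = 2 words and
+ // excess_bits = 64 - (100 % 64) = 28 unused bits in the top word, which
+ // clearUnusedBits() masks off after every write:
+ //
+ // #include <cassert>
+ // #include <cstdint>
+ // #include <cstring>
+ // int main() {
+ //   uint64_t pVal[2];                       // 2 x 64 bits >= 100 bits
+ //   std::memset(pVal, ~0, sizeof(pVal));    // all-ones pattern
+ //   const uint64_t mask = ~0ULL >> 28;      // keep only bits 64..99
+ //   pVal[1] &= mask;                        // unsigned clearUnusedBits()
+ //   assert(pVal[1] == (1ULL << 36) - 1);    // 36 valid bits remain set
+ //   return 0;
+ // }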
+ template + friend struct ap_fixed_base; + /// return type of variety of operations + //---------------------------------------------------------- + template + struct RType { + enum { + mult_w = _AP_W + _AP_W2, + mult_s = _AP_S || _AP_S2, + plus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + plus_s = _AP_S || _AP_S2, + minus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + minus_s = true, + div_w = _AP_W + _AP_S2, + div_s = _AP_S || _AP_S2, + mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)), + mod_s = _AP_S, + logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)), + logic_s = _AP_S || _AP_S2 + }; + typedef ap_private mult; + typedef ap_private plus; + typedef ap_private minus; + typedef ap_private logic; + typedef ap_private div; + typedef ap_private mod; + typedef ap_private<_AP_W, _AP_S> arg1; + typedef bool reduce; + }; + + INLINE uint64_t& get_VAL(void) { return pVal[0]; } + INLINE uint64_t get_VAL(void) const { return pVal[0]; } + INLINE uint64_t get_VAL(void) const volatile { return pVal[0]; } + INLINE void set_VAL(uint64_t value) { pVal[0] = value; } + INLINE uint64_t& get_pVal(int index) { return pVal[index]; } + INLINE uint64_t* get_pVal() { return pVal; } + INLINE const uint64_t* get_pVal() const { return pVal; } + INLINE uint64_t get_pVal(int index) const { return pVal[index]; } + INLINE uint64_t* get_pVal() const volatile { return pVal; } + INLINE uint64_t get_pVal(int index) const volatile { return pVal[index]; } + INLINE void set_pVal(int i, uint64_t value) { pVal[i] = value; } + + /// This enum is used to hold the constants we needed for ap_private. + enum { + APINT_BITS_PER_WORD = sizeof(uint64_t) * 8, ///< Bits in a word + APINT_WORD_SIZE = sizeof(uint64_t) ///< Byte size of a word + }; + + enum { + excess_bits = (_AP_W % APINT_BITS_PER_WORD) + ? APINT_BITS_PER_WORD - (_AP_W % APINT_BITS_PER_WORD) + : 0 + }; + static const uint64_t mask = ((uint64_t)~0ULL >> (excess_bits)); + + public: + // NOTE changed to explicit to be consistent with ap_private + explicit INLINE ap_private(const char* val) { + set_canary(); + unsigned char radix = 10; + std::string str = ap_private_ops::parseString(val, radix); // determine radix. + std::string::size_type pos = str.find('.'); + if (pos != std::string::npos) str = str.substr(pos); + ap_private ap_private_val(str, radix); + operator=(ap_private_val); + report(); + check_canary(); + } + + INLINE ap_private(const char* val, unsigned char rd) { + set_canary(); + unsigned char radix = rd; + std::string str = ap_private_ops::parseString(val, radix); // determine radix. 
+ std::string::size_type pos = str.find('.'); + if (pos != std::string::npos) str = str.substr(pos); + ap_private ap_private_val(str, radix); + operator=(ap_private_val); + report(); + + report(); + check_canary(); + } + + template + INLINE ap_private(const _private_range_ref<_AP_W2, _AP_S2>& ref) { + set_canary(); + *this = ref.get(); + report(); + check_canary(); + } + + template + INLINE ap_private(const _private_bit_ref<_AP_W2, _AP_S2>& ref) { + set_canary(); + *this = ((uint64_t)(bool)ref); + report(); + check_canary(); + } + +// template +// INLINE ap_private(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { +// set_canary(); +// *this = ref.get(); +// report(); +// check_canary(); +// } +// +// template +// INLINE ap_private( +// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { +// set_canary(); +// *this = ((val.operator ap_private<_AP_W2, false>())); +// report(); +// check_canary(); +// } +// +// template +// INLINE ap_private( +// const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { +// set_canary(); +// *this = (uint64_t)(bool)val; +// report(); +// check_canary(); +// } + + /// Simply makes *this a copy of that. + /// @brief Copy Constructor. + INLINE ap_private(const ap_private& that) { + set_canary(); + memcpy(pVal, that.get_pVal(), _AP_N * APINT_WORD_SIZE); + clearUnusedBits(); + check_canary(); + } + + template + INLINE ap_private(const ap_private<_AP_W1, _AP_S1, false>& that) { + set_canary(); + operator=(that); + check_canary(); + } + + template + INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, false>& that) { + set_canary(); + operator=(const_cast&>(that)); + check_canary(); + } + + template + INLINE ap_private(const ap_private<_AP_W1, _AP_S1, true>& that) { + set_canary(); + static const uint64_t that_sign_ext_mask = + (_AP_W1 == APINT_BITS_PER_WORD) + ? 0 + : ~0ULL >> (_AP_W1 % APINT_BITS_PER_WORD) + << (_AP_W1 % APINT_BITS_PER_WORD); + if (that.isNegative()) { + pVal[0] = that.get_VAL() | that_sign_ext_mask; + memset(pVal + 1, ~0, sizeof(uint64_t) * (_AP_N - 1)); + } else { + pVal[0] = that.get_VAL(); + memset(pVal + 1, 0, sizeof(uint64_t) * (_AP_N - 1)); + } + clearUnusedBits(); + check_canary(); + } + + template + INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, true>& that) { + set_canary(); + operator=(const_cast&>(that)); + check_canary(); + } + + /// @brief Destructor. + // virtual ~ap_private() {} + INLINE ~ap_private() { check_canary(); } + + /// @name Constructors + /// @{ + + /// Default constructor that creates an uninitialized ap_private. This is + /// useful + /// for object deserialization (pair this with the static method Read). + INLINE ap_private() { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + INLINE ap_private(uint64_t* val, uint32_t bits = _AP_W) { assert(0); } + INLINE ap_private(const uint64_t* const val, uint32_t bits) { assert(0); } + +/// If isSigned is true then val is treated as if it were a signed value +/// (i.e. as an int64_t) and the appropriate sign extension to the bit width +/// will be done. Otherwise, no sign extension occurs (high order bits beyond +/// the range of val are zero filled). +/// @param numBits the bit width of the constructed ap_private +/// @param val the initial value of the ap_private +/// @param isSigned how to treat signedness of val +/// @brief Create a new ap_private of numBits width, initialized as val. 
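+ // --- Editor's sketch (illustrative, not part of the original header): the
+ // CTOR macro below zero- or sign-extends the initializer across the high
+ // words. A plain-array analogue for a hypothetical 2-word value built
+ // from int(-1):
+ //
+ // #include <cassert>
+ // #include <cstdint>
+ // #include <cstring>
+ // int main() {
+ //   uint64_t pVal[2];
+ //   int val = -1;
+ //   pVal[0] = (uint64_t)(int64_t)val;               // low word: 0xFF..FF
+ //   if ((int64_t)pVal[0] < 0)                       // negative and signed?
+ //     std::memset(pVal + 1, ~0, sizeof(uint64_t));  // sign-fill high word
+ //   else
+ //     std::memset(pVal + 1, 0, sizeof(uint64_t));
+ //   assert(pVal[1] == ~0ULL);                       // -1 spans both words
+ //   return 0;
+ // }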
+#define CTOR(TYPE, SIGNED) \ + INLINE ap_private(TYPE val, bool isSigned = SIGNED) { \ + set_canary(); \ + pVal[0] = (ValType)val; \ + if (isSigned && int64_t(pVal[0]) < 0) { \ + memset(pVal + 1, ~0, sizeof(uint64_t) * (_AP_N - 1)); \ + } else { \ + memset(pVal + 1, 0, sizeof(uint64_t) * (_AP_N - 1)); \ + } \ + clearUnusedBits(); \ + check_canary(); \ + } + + CTOR(bool, false) + CTOR(char, CHAR_IS_SIGNED) + CTOR(signed char, true) + CTOR(unsigned char, false) + CTOR(short, true) + CTOR(unsigned short, false) + CTOR(int, true) + CTOR(unsigned int, false) + CTOR(long, true) + CTOR(unsigned long, false) + CTOR(ap_slong, true) + CTOR(ap_ulong, false) +#if 0 + CTOR(half, false) + CTOR(float, false) + CTOR(double, false) +#endif +#undef CTOR + + /// @returns true if the number of bits <= 64, false otherwise. + /// @brief Determine if this ap_private just has one word to store value. + INLINE bool isSingleWord() const { return false; } + + /// @returns the word position for the specified bit position. + /// @brief Determine which word a bit is in. + static INLINE uint32_t whichWord(uint32_t bitPosition) { + // return bitPosition / APINT_BITS_PER_WORD; + return (bitPosition) >> 6; + } + + /// @returns the bit position in a word for the specified bit position + /// in the ap_private. + /// @brief Determine which bit in a word a bit is in. + static INLINE uint32_t whichBit(uint32_t bitPosition) { + // return bitPosition % APINT_BITS_PER_WORD; + return bitPosition & 0x3f; + } + + /// bit at a specific bit position. This is used to mask the bit in the + /// corresponding word. + /// @returns a uint64_t with only bit at "whichBit(bitPosition)" set + /// @brief Get a single bit mask. + static INLINE uint64_t maskBit(uint32_t bitPosition) { + return 1ULL << (whichBit(bitPosition)); + } + + /// @returns the corresponding word for the specified bit position. + /// @brief Get the word corresponding to a bit position + INLINE uint64_t getWord(uint32_t bitPosition) const { + return pVal[whichWord(bitPosition)]; + } + + /// This method is used internally to clear the to "N" bits in the high order + /// word that are not used by the ap_private. This is needed after the most + /// significant word is assigned a value to ensure that those bits are + /// zero'd out. + /// @brief Clear unused high order bits + INLINE void clearUnusedBits(void) volatile +// just for clang compiler +#if defined(__clang__) && !defined(__CLANG_3_1__) + __attribute__((no_sanitize("undefined"))) +#endif + { + pVal[_AP_N - 1] = + _AP_S ? ((((int64_t)pVal[_AP_N - 1]) << (excess_bits)) >> excess_bits) + : (excess_bits + ? ((pVal[_AP_N - 1]) << (excess_bits)) >> (excess_bits) + : pVal[_AP_N - 1]); + } + + INLINE void clearUnusedBitsToZero(void) { pVal[_AP_N - 1] &= mask; } + + INLINE void clearUnusedBitsToOne(void) { pVal[_AP_N - 1] |= mask; } + + /// This is used by the constructors that take string arguments. 
+ /// @brief Convert a char array into an ap_private + INLINE void fromString(const char* str, uint32_t slen, uint8_t radix) { + enum { numbits = _AP_W }; + bool isNeg = str[0] == '-'; + if (isNeg) { + str++; + slen--; + } + + if (str[0] == '0' && (str[1] == 'b' || str[1] == 'B')) { + //if(radix == 0) radix = 2; + _AP_WARNING(radix != 2, "%s seems to have base %d, but %d given.", str, 2, radix); + str += 2; + slen -=2; + } else if (str[0] == '0' && (str[1] == 'o' || str[1] == 'O')) { + //if (radix == 0) radix = 8; + _AP_WARNING(radix != 8, "%s seems to have base %d, but %d given.", str, 8, radix); + str += 2; + slen -=2; + } else if (str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) { + //if (radix == 0) radix = 16; + _AP_WARNING(radix != 16, "%s seems to have base %d, but %d given.", str, 16, radix); + str += 2; + slen -=2; + } else if (str[0] == '0' && (str[1] == 'd' || str[1] == 'D')) { + //if (radix == 0) radix = 10; + _AP_WARNING(radix != 10, "%s seems to have base %d, but %d given.", str, 10, radix); + str += 2; + slen -=2; + } else if (radix == 0) { + //radix = 2; // XXX default value + } + + // Check our assumptions here + assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) && + "Radix should be 2, 8, 10, or 16!"); + assert(str && "String is null?"); + + // skip any leading zero + while (*str == '0' && *(str + 1) != '\0') { + str++; + slen--; + } + assert((slen <= numbits || radix != 2) && "Insufficient bit width"); + assert(((slen - 1) * 3 <= numbits || radix != 8) && + "Insufficient bit width"); + assert(((slen - 1) * 4 <= numbits || radix != 16) && + "Insufficient bit width"); + assert((((slen - 1) * 64) / 22 <= numbits || radix != 10) && + "Insufficient bit width"); + + // clear bits + memset(pVal, 0, _AP_N * sizeof(uint64_t)); + + // Figure out if we can shift instead of multiply + uint32_t shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0); + + // Set up an ap_private for the digit to add outside the loop so we don't + // constantly construct/destruct it. + uint64_t bigVal[_AP_N]; + memset(bigVal, 0, _AP_N * sizeof(uint64_t)); + ap_private<_AP_W, _AP_S> apdigit(getBitWidth(), bigVal); + ap_private<_AP_W, _AP_S> apradix(radix); + + // Enter digit traversal loop + for (unsigned i = 0; i < slen; i++) { + // Get a digit + uint32_t digit = 0; + char cdigit = str[i]; + if (radix == 16) { +#define isxdigit(c) \ + (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') || \ + ((c) >= 'A' && (c) <= 'F')) +#define isdigit(c) ((c) >= '0' && (c) <= '9') + if (!isxdigit(cdigit)) assert(0 && "Invalid hex digit in string"); + if (isdigit(cdigit)) + digit = cdigit - '0'; + else if (cdigit >= 'a') + digit = cdigit - 'a' + 10; + else if (cdigit >= 'A') + digit = cdigit - 'A' + 10; + else + assert(0 && "huh? 
we shouldn't get here"); + } else if (isdigit(cdigit)) { + digit = cdigit - '0'; + } else if (cdigit != '\0') { + assert(0 && "Invalid character in digit string"); + } +#undef isxdigit +#undef isdigit + // Shift or multiply the value by the radix + if (shift) + *this <<= shift; + else + *this *= apradix; + + // Add in the digit we just interpreted + apdigit.set_VAL(digit); + *this += apdigit; + } + // If its negative, put it in two's complement form + if (isNeg) { + (*this)--; + this->flip(); + } + clearUnusedBits(); + } + + INLINE ap_private read() volatile { return *this; } + + INLINE void write(const ap_private& op2) volatile { *this = (op2); } + + INLINE operator ValType() const { return get_VAL(); } + + INLINE int to_uchar() const { return (unsigned char)get_VAL(); } + + INLINE int to_char() const { return (signed char)get_VAL(); } + + INLINE int to_ushort() const { return (unsigned short)get_VAL(); } + + INLINE int to_short() const { return (short)get_VAL(); } + + INLINE int to_int() const { return (int)get_VAL(); } + + INLINE unsigned to_uint() const { return (unsigned)get_VAL(); } + + INLINE long to_long() const { return (long)get_VAL(); } + + INLINE unsigned long to_ulong() const { return (unsigned long)get_VAL(); } + + INLINE ap_slong to_int64() const { return (ap_slong)get_VAL(); } + + INLINE ap_ulong to_uint64() const { return (ap_ulong)get_VAL(); } + + INLINE double to_double() const { + if (isNegative()) + return roundToDouble(true); + else + return roundToDouble(false); + } + + INLINE unsigned length() const { return _AP_W; } + + /*Reverse the contents of ap_private instance. I.e. LSB becomes MSB and vise + * versa*/ + INLINE ap_private& reverse() { + for (int i = 0; i < _AP_W / 2; ++i) { + bool tmp = operator[](i); + if (operator[](_AP_W - 1 - i)) + set(i); + else + clear(i); + if (tmp) + set(_AP_W - 1 - i); + else + clear(_AP_W - 1 - i); + } + clearUnusedBits(); + return *this; + } + + /*Return true if the value of ap_private instance is zero*/ + INLINE bool iszero() const { return isMinValue(); } + + INLINE bool to_bool() const { return !iszero(); } + + /* x < 0 */ + INLINE bool sign() const { + if (isNegative()) return true; + return false; + } + + /* x[i] = !x[i] */ + INLINE void invert(int i) { + assert(i >= 0 && "Attempting to read bit with negative index"); + assert(i < _AP_W && "Attempting to read bit beyond MSB"); + flip(i); + } + + /* x[i] */ + INLINE bool test(int i) const { + assert(i >= 0 && "Attempting to read bit with negative index"); + assert(i < _AP_W && "Attempting to read bit beyond MSB"); + return operator[](i); + } + + // Set the ith bit into v + INLINE void set(int i, bool v) { + assert(i >= 0 && "Attempting to write bit with negative index"); + assert(i < _AP_W && "Attempting to write bit beyond MSB"); + v ? set(i) : clear(i); + } + + // Set the ith bit into v + INLINE void set_bit(int i, bool v) { + assert(i >= 0 && "Attempting to write bit with negative index"); + assert(i < _AP_W && "Attempting to write bit beyond MSB"); + v ? set(i) : clear(i); + } + + // FIXME different argument for different action? 
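+ // --- Editor's sketch (illustrative, not part of the original header): the
+ // word/bit arithmetic behind set()/clear()/get() below. Bit 70 of a
+ // multi-word value lives in word 70 >> 6 == 1, at bit 70 & 0x3f == 6:
+ //
+ // #include <cassert>
+ // #include <cstdint>
+ // int main() {
+ //   uint64_t pVal[2] = {0, 0};
+ //   uint32_t pos = 70;
+ //   pVal[pos >> 6] |= 1ULL << (pos & 0x3f);      // set(70)
+ //   assert(pVal[1] == (1ULL << 6));
+ //   pVal[pos >> 6] &= ~(1ULL << (pos & 0x3f));   // clear(70)
+ //   assert(pVal[1] == 0);
+ //   return 0;
+ // }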
+ INLINE ap_private& set(uint32_t bitPosition) { + pVal[whichWord(bitPosition)] |= maskBit(bitPosition); + clearUnusedBits(); + return *this; + } + + INLINE void set() { + for (int i = 0; i < _AP_N; ++i) pVal[i] = ~0ULL; + clearUnusedBits(); + } + + // Get the value of ith bit + INLINE bool get(int i) const { + assert(i >= 0 && "Attempting to read bit with negative index"); + assert(i < _AP_W && "Attempting to read bit beyond MSB"); + return ((maskBit(i) & (pVal[whichWord(i)])) != 0); + } + + // Get the value of ith bit + INLINE bool get_bit(int i) const { + assert(i >= 0 && "Attempting to read bit with negative index"); + assert(i < _AP_W && "Attempting to read bit beyond MSB"); + return ((maskBit(i) & (pVal[whichWord(i)])) != 0); + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_private object n places to the left + INLINE void lrotate(int n) { + assert(n >= 0 && "Attempting to shift negative index"); + assert(n < _AP_W && "Shift value larger than bit width"); + operator=(shl(n) | lshr(_AP_W - n)); + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_private object n places to the right + INLINE void rrotate(int n) { + assert(n >= 0 && "Attempting to shift negative index"); + assert(n < _AP_W && "Shift value larger than bit width"); + operator=(lshr(n) | shl(_AP_W - n)); + } + + /// Set the given bit to 0 whose position is given as "bitPosition". + /// @brief Set a given bit to 0. + INLINE ap_private& clear(uint32_t bitPosition) { + pVal[whichWord(bitPosition)] &= ~maskBit(bitPosition); + clearUnusedBits(); + return *this; + } + + /// @brief Set every bit to 0. + INLINE void clear() { memset(pVal, 0, _AP_N * APINT_WORD_SIZE); } + + /// @brief Toggle every bit to its opposite value. + ap_private& flip() { + for (int i = 0; i < _AP_N; ++i) pVal[i] ^= ~0ULL; + clearUnusedBits(); + return *this; + } + + /// @brief Toggles a given bit to its opposite value. + INLINE ap_private& flip(uint32_t bitPosition) { + assert(bitPosition < BitWidth && "Out of the bit-width range!"); + set_bit(bitPosition, !get_bit(bitPosition)); + return *this; + } + + // complements every bit + INLINE void b_not() { flip(); } + + INLINE ap_private getLoBits(uint32_t numBits) const { + return ap_private_ops::lshr(ap_private_ops::shl(*this, _AP_W - numBits), + _AP_W - numBits); + } + + INLINE ap_private getHiBits(uint32_t numBits) const { + return ap_private_ops::lshr(*this, _AP_W - numBits); + } + + // Binary Arithmetic + //----------------------------------------------------------- + +// template +// INLINE ap_private operator&( +// const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { +// return *this & a2.get(); +// } +// +// template +// INLINE ap_private operator|( +// const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { +// return *this | a2.get(); +// } +// +// template +// INLINE ap_private operator^( +// const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { +// return *this ^ a2.get(); +// } + +/// Arithmetic assign +//------------------------------------------------------------- + +#define OP_BIN_LOGIC_ASSIGN_AP(Sym) \ + template \ + INLINE ap_private& operator Sym(const ap_private<_AP_W1, _AP_S1>& RHS) { \ + const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; \ + uint32_t numWords = AESL_std::min((int)_AP_N, _AP_N1); \ + uint32_t i; \ + if (_AP_W != _AP_W1) \ + fprintf(stderr, \ + "Warning! 
Bitsize mismach for ap_[u]int " #Sym " ap_[u]int.\n"); \ + for (i = 0; i < numWords; ++i) pVal[i] Sym RHS.get_pVal(i); \ + if (_AP_N1 < _AP_N) { \ + uint64_t ext = RHS.isNegative() ? ~0ULL : 0; \ + for (; i < _AP_N; i++) pVal[i] Sym ext; \ + } \ + clearUnusedBits(); \ + return *this; \ + } + + OP_BIN_LOGIC_ASSIGN_AP(&=); + OP_BIN_LOGIC_ASSIGN_AP(|=); + OP_BIN_LOGIC_ASSIGN_AP(^=); +#undef OP_BIN_LOGIC_ASSIGN_AP + + /// Adds the RHS APint to this ap_private. + /// @returns this, after addition of RHS. + /// @brief Addition assignment operator. + template + INLINE ap_private& operator+=(const ap_private<_AP_W1, _AP_S1>& RHS) { + const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; + uint64_t RHSpVal[_AP_N1]; + for (int i = 0; i < _AP_N1; ++i) RHSpVal[i] = RHS.get_pVal(i); + ap_private_ops::add(pVal, pVal, RHSpVal, _AP_N, _AP_N, _AP_N1, _AP_S, + _AP_S1); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator-=(const ap_private<_AP_W1, _AP_S1>& RHS) { + const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; + uint64_t RHSpVal[_AP_N1]; + for (int i = 0; i < _AP_N1; ++i) RHSpVal[i] = RHS.get_pVal(i); + ap_private_ops::sub(pVal, pVal, RHSpVal, _AP_N, _AP_N, _AP_N1, _AP_S, + _AP_S1); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator*=(const ap_private<_AP_W1, _AP_S1>& RHS) { + // Get some bit facts about LHS and check for zero + uint32_t lhsBits = getActiveBits(); + uint32_t lhsWords = !lhsBits ? 0 : whichWord(lhsBits - 1) + 1; + if (!lhsWords) { + // 0 * X ===> 0 + return *this; + } + + ap_private dupRHS = RHS; + // Get some bit facts about RHS and check for zero + uint32_t rhsBits = dupRHS.getActiveBits(); + uint32_t rhsWords = !rhsBits ? 0 : whichWord(rhsBits - 1) + 1; + if (!rhsWords) { + // X * 0 ===> 0 + clear(); + return *this; + } + + // Allocate space for the result + uint32_t destWords = rhsWords + lhsWords; + uint64_t* dest = (uint64_t*)malloc(destWords * sizeof(uint64_t)); + + // Perform the long multiply + ap_private_ops::mul(dest, pVal, lhsWords, dupRHS.get_pVal(), rhsWords, + destWords); + + // Copy result back into *this + clear(); + uint32_t wordsToCopy = destWords >= _AP_N ? _AP_N : destWords; + + memcpy(pVal, dest, wordsToCopy * APINT_WORD_SIZE); + + uint64_t ext = (isNegative() ^ RHS.isNegative()) ? ~0ULL : 0ULL; + for (int i = wordsToCopy; i < _AP_N; i++) pVal[i] = ext; + clearUnusedBits(); + // delete dest array and return + free(dest); + return *this; + } + +#define OP_ASSIGN_AP(Sym) \ + template \ + INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \ + *this = operator Sym(op); \ + return *this; \ + } + + OP_ASSIGN_AP(/) + OP_ASSIGN_AP(%) +#undef OP_ASSIGN_AP + +#define OP_BIN_LOGIC_AP(Sym) \ + template \ + INLINE typename RType<_AP_W1, _AP_S1>::logic operator Sym( \ + const ap_private<_AP_W1, _AP_S1>& RHS) const { \ + enum { \ + numWords = (RType<_AP_W1, _AP_S1>::logic_w + APINT_BITS_PER_WORD - 1) / \ + APINT_BITS_PER_WORD \ + }; \ + typename RType<_AP_W1, _AP_S1>::logic Result; \ + uint32_t i; \ + const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; \ + uint32_t min_N = std::min((int)_AP_N, _AP_N1); \ + uint32_t max_N = std::max((int)_AP_N, _AP_N1); \ + for (i = 0; i < min_N; ++i) \ + Result.set_pVal(i, pVal[i] Sym RHS.get_pVal(i)); \ + if (numWords > i) { \ + uint64_t ext = ((_AP_N < _AP_N1 && isNegative()) || \ + (_AP_N1 < _AP_N && RHS.isNegative())) \ + ? 
~0ULL \ + : 0; \ + if (_AP_N > _AP_N1) \ + for (; i < max_N; i++) Result.set_pVal(i, pVal[i] Sym ext); \ + else \ + for (; i < max_N; i++) Result.set_pVal(i, RHS.get_pVal(i) Sym ext); \ + if (numWords > i) { \ + uint64_t ext2 = ((_AP_N > _AP_N1 && isNegative()) || \ + (_AP_N1 > _AP_N && RHS.isNegative())) \ + ? ~0ULL \ + : 0; \ + Result.set_pVal(i, ext Sym ext2); \ + } \ + } \ + Result.clearUnusedBits(); \ + return Result; \ + } + + OP_BIN_LOGIC_AP(|); + OP_BIN_LOGIC_AP(&); + OP_BIN_LOGIC_AP(^); + +#undef OP_BIN_LOGIC_AP + + template + INLINE typename RType<_AP_W1, _AP_S1>::plus operator+( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + typename RType<_AP_W1, _AP_S1>::plus Result, lhs(*this), rhs(RHS); + const int Result_AP_N = (RType<_AP_W1, _AP_S1>::plus_w + 63) / 64; + ap_private_ops::add(Result.get_pVal(), lhs.get_pVal(), rhs.get_pVal(), + Result_AP_N, Result_AP_N, Result_AP_N, _AP_S, _AP_S1); + Result.clearUnusedBits(); + return Result; + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::minus operator-( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + typename RType<_AP_W1, _AP_S1>::minus Result, lhs(*this), rhs(RHS); + const int Result_AP_N = (RType<_AP_W1, _AP_S1>::minus_w + 63) / 64; + ap_private_ops::sub(Result.get_pVal(), lhs.get_pVal(), rhs.get_pVal(), + Result_AP_N, Result_AP_N, Result_AP_N, _AP_S, _AP_S1); + Result.clearUnusedBits(); + return Result; + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::mult operator*( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + typename RType<_AP_W1, _AP_S1>::mult temp = *this; + temp *= RHS; + return temp; + } + + template + INLINE typename RType<_AP_W2, _AP_S2>::div operator/( + const ap_private<_AP_W2, _AP_S2>& op) const { + ap_private _AP_W2 ? _AP_S + : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> + lhs = *this; + ap_private _AP_W2 ? _AP_S + : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> + rhs = op; + return typename RType<_AP_W2, _AP_S2>::div( + (_AP_S || _AP_S2) ? lhs.sdiv(rhs) : lhs.udiv(rhs)); + } + + template + INLINE typename RType<_AP_W2, _AP_S2>::mod operator%( + const ap_private<_AP_W2, _AP_S2>& op) const { + ap_private _AP_W2 ? _AP_S + : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> + lhs = *this; + ap_private _AP_W2 ? _AP_S + : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> + rhs = op; + typename RType<_AP_W2, _AP_S2>::mod res = + typename RType<_AP_W2, _AP_S2>::mod(_AP_S ? 
lhs.srem(rhs) + : lhs.urem(rhs)); + return res; + } + +#define OP_LEFT_SHIFT_CTYPE(TYPE, SIGNED) \ + INLINE ap_private operator<<(const TYPE op) const { \ + if (op >= _AP_W) return ap_private(0); \ + if (SIGNED && op < 0) return *this >> (0 - op); \ + return shl(op); \ + } + + OP_LEFT_SHIFT_CTYPE(int, true) + // OP_LEFT_SHIFT_CTYPE(bool, false) + OP_LEFT_SHIFT_CTYPE(signed char, true) + OP_LEFT_SHIFT_CTYPE(unsigned char, false) + OP_LEFT_SHIFT_CTYPE(short, true) + OP_LEFT_SHIFT_CTYPE(unsigned short, false) + OP_LEFT_SHIFT_CTYPE(unsigned int, false) + OP_LEFT_SHIFT_CTYPE(long, true) + OP_LEFT_SHIFT_CTYPE(unsigned long, false) + OP_LEFT_SHIFT_CTYPE(unsigned long long, false) + OP_LEFT_SHIFT_CTYPE(long long, true) +#if 0 + OP_LEFT_SHIFT_CTYPE(half, false) + OP_LEFT_SHIFT_CTYPE(float, false) + OP_LEFT_SHIFT_CTYPE(double, false) +#endif +#undef OP_LEFT_SHIFT_CTYPE + + template + INLINE ap_private operator<<(const ap_private<_AP_W2, _AP_S2>& op2) const { + if (_AP_S2 == false) { + uint32_t sh = op2.to_uint(); + return *this << sh; + } else { + int sh = op2.to_int(); + return *this << sh; + } + } + +#define OP_RIGHT_SHIFT_CTYPE(TYPE, SIGNED) \ + INLINE ap_private operator>>(const TYPE op) const { \ + if (op >= _AP_W) { \ + if (isNegative()) \ + return ap_private(-1); \ + else \ + return ap_private(0); \ + } \ + if ((SIGNED) && op < 0) return *this << (0 - op); \ + if (_AP_S) \ + return ashr(op); \ + else \ + return lshr(op); \ + } + + // OP_RIGHT_SHIFT_CTYPE(bool, false) + OP_RIGHT_SHIFT_CTYPE(char, CHAR_IS_SIGNED) + OP_RIGHT_SHIFT_CTYPE(signed char, true) + OP_RIGHT_SHIFT_CTYPE(unsigned char, false) + OP_RIGHT_SHIFT_CTYPE(short, true) + OP_RIGHT_SHIFT_CTYPE(unsigned short, false) + OP_RIGHT_SHIFT_CTYPE(int, true) + OP_RIGHT_SHIFT_CTYPE(unsigned int, false) + OP_RIGHT_SHIFT_CTYPE(long, true) + OP_RIGHT_SHIFT_CTYPE(unsigned long, false) + OP_RIGHT_SHIFT_CTYPE(unsigned long long, false) + OP_RIGHT_SHIFT_CTYPE(long long, true) +#if 0 + OP_RIGHT_SHIFT_CTYPE(half, false) + OP_RIGHT_SHIFT_CTYPE(float, false) + OP_RIGHT_SHIFT_CTYPE(double, false) +#endif +#undef OP_RIGHT_SHIFT_CTYPE + + template + INLINE ap_private operator>>(const ap_private<_AP_W2, _AP_S2>& op2) const { + if (_AP_S2 == false) { + uint32_t sh = op2.to_uint(); + return *this >> sh; + } else { + int sh = op2.to_int(); + return *this >> sh; + } + } + + /// Shift assign + //------------------------------------------------------------------ + // TODO call clearUnusedBits ? +#define OP_ASSIGN_AP(Sym) \ + template \ + INLINE ap_private& operator Sym##=(int op) { \ + *this = operator Sym(op); \ + return *this; \ + } \ + INLINE ap_private& operator Sym##=(unsigned int op) { \ + *this = operator Sym(op); \ + return *this; \ + } \ + template \ + INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \ + *this = operator Sym(op); \ + return *this; \ + } + OP_ASSIGN_AP(>>) + OP_ASSIGN_AP(<<) +#undef OP_ASSIGN_AP + + /// Comparisons + //----------------------------------------------------------------- + INLINE bool operator==(const ap_private& RHS) const { + // Get some facts about the number of bits used in the two operands. + uint32_t n1 = getActiveBits(); + uint32_t n2 = RHS.getActiveBits(); + + // If the number of bits isn't the same, they aren't equal + if (n1 != n2) return false; + + // If the number of bits fits in a word, we only need to compare the low + // word. 
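+ // --- Editor's note (illustrative, not part of the original header): if
+ // getActiveBits() returns the number of significant bits (index of the
+ // highest set bit plus one, as its uses here suggest), equal values must
+ // report equal counts, so differing counts short-circuit to "not equal".
+ // A plain-array sketch, assuming the GCC/Clang __builtin_clzll intrinsic:
+ //
+ // #include <cassert>
+ // #include <cstdint>
+ // int main() {
+ //   uint64_t a[2] = {5, 0}, b[2] = {5, 1};
+ //   auto active = [](const uint64_t (&v)[2]) {
+ //     for (int i = 1; i >= 0; --i)
+ //       if (v[i]) return 64 * i + 64 - __builtin_clzll(v[i]);
+ //     return 0;
+ //   };
+ //   assert(active(a) == 3 && active(b) == 65);   // counts differ => a != b
+ //   return 0;
+ // }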
+ if (n1 <= APINT_BITS_PER_WORD) return pVal[0] == RHS.get_pVal(0); + + // Otherwise, compare everything + for (int i = whichWord(n1 - 1); i >= 0; --i) + if (pVal[i] != RHS.get_pVal(i)) return false; + return true; + } + + template + INLINE bool operator==(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W, _AP_W2), + }; + ap_private<_AP_MAX_W, false> lhs(*this); + ap_private<_AP_MAX_W, false> rhs(op); + return lhs == rhs; + } + + INLINE bool operator==(uint64_t Val) const { + uint32_t n = getActiveBits(); + if (n <= APINT_BITS_PER_WORD) + return pVal[0] == Val; + else + return false; + } + + template + INLINE bool operator!=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this == op); + } + + template + INLINE bool operator!=(const ap_private<_AP_W, _AP_S1>& RHS) const { + return !((*this) == RHS); + } + + INLINE bool operator!=(uint64_t Val) const { return !((*this) == Val); } + + template + INLINE bool operator<=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this > op); + } + + INLINE bool operator<(const ap_private& op) const { + return _AP_S ? slt(op) : ult(op); + } + + template + INLINE bool operator<(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) + }; + ap_private<_AP_MAX_W, _AP_S> lhs(*this); + ap_private<_AP_MAX_W, _AP_S2> rhs(op); + if (_AP_S == _AP_S2) + return _AP_S ? lhs.slt(rhs) : lhs.ult(rhs); + else if (_AP_S) + if (_AP_W2 >= _AP_W) + return lhs.ult(rhs); + else + return lhs.slt(rhs); + else if (_AP_W >= _AP_W2) + return lhs.ult(rhs); + else + return lhs.slt(rhs); + } + + template + INLINE bool operator>=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this < op); + } + + INLINE bool operator>(const ap_private& op) const { + return _AP_S ? sgt(op) : ugt(op); + } + + template + INLINE bool operator>(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) + }; + ap_private<_AP_MAX_W, _AP_S> lhs(*this); + ap_private<_AP_MAX_W, _AP_S2> rhs(op); + if (_AP_S == _AP_S2) + return _AP_S ? 
lhs.sgt(rhs) : lhs.ugt(rhs); + else if (_AP_S) + if (_AP_W2 >= _AP_W) + return lhs.ugt(rhs); + else + return lhs.sgt(rhs); + else if (_AP_W >= _AP_W2) + return lhs.ugt(rhs); + else + return lhs.sgt(rhs); + } + + /// Bit and Part Select + //-------------------------------------------------------------- + INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { + return _private_range_ref<_AP_W, _AP_S>( + const_cast*>(this), Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { + return _private_range_ref<_AP_W, _AP_S>( + (const_cast*>(this)), Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> range( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> operator()( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> range( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return _private_range_ref<_AP_W, _AP_S>(const_cast(this), Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> operator()( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE _private_bit_ref<_AP_W, _AP_S> operator[](int index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index); + } + + template + INLINE _private_bit_ref<_AP_W, _AP_S> operator[]( + const ap_private<_AP_W2, _AP_S2>& index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); + } + + template + INLINE const _private_bit_ref<_AP_W, _AP_S> operator[]( + const ap_private<_AP_W2, _AP_S2>& index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index.to_int()); + } + + INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](int index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index); + } + + INLINE _private_bit_ref<_AP_W, _AP_S> bit(int index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index); + } + + template + INLINE _private_bit_ref<_AP_W, _AP_S> bit(const ap_private<_AP_W2, _AP_S2>& index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); + } + + INLINE const _private_bit_ref<_AP_W, _AP_S> bit(int index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index); + } + + template + INLINE const _private_bit_ref<_AP_W, _AP_S> bit( + const ap_private<_AP_W2, _AP_S2>& index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index.to_int()); + } + +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// concat(ap_private<_AP_W2, _AP_S2>& a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE 
ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// concat(const ap_private<_AP_W2, _AP_S2>& a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(const ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(const ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(_private_range_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(_private_bit_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, +// a2); +// } +// +// template +// INLINE 
ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) const { +// return ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast&>(*this), +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, +// a2); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) const { +// return ap_concat_ref< +// _AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast&>(*this), +// const_cast&>( +// a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,( +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); +// } + + INLINE ap_private<_AP_W, false> get() const { + ap_private<_AP_W, false> ret(*this); + return ret; + } + + template + INLINE void set(const ap_private<_AP_W3, false>& val) { + operator=(ap_private<_AP_W3, _AP_S>(val)); + } + + /// + /// @name Value Tests + /// + /// This tests the high bit of this ap_private to determine if it is set. + /// @returns true if this ap_private is negative, false otherwise + /// @brief Determine sign of this ap_private. + INLINE bool isNegative() const { + // just for get rid of warnings + enum { shift = (_AP_W - APINT_BITS_PER_WORD * (_AP_N - 1) - 1) }; + static const uint64_t mask = 1ULL << (shift); + return _AP_S && (pVal[_AP_N - 1] & mask); + } + + /// This tests the high bit of the ap_private to determine if it is unset. + /// @brief Determine if this ap_private Value is positive (not negative). + INLINE bool isPositive() const { return !isNegative(); } + + /// This tests if the value of this ap_private is strictly positive (> 0). + /// @returns true if this ap_private is Positive and not zero. + /// @brief Determine if this ap_private Value is strictly positive. + INLINE bool isStrictlyPositive() const { + return isPositive() && (*this) != 0; + } + + /// This checks to see if the value has all bits of the ap_private are set or + /// not. + /// @brief Determine if all bits are set + INLINE bool isAllOnesValue() const { return countPopulation() == _AP_W; } + + /// This checks to see if the value of this ap_private is the maximum unsigned + /// value for the ap_private's bit width. + /// @brief Determine if this is the largest unsigned value. + INLINE bool isMaxValue() const { return countPopulation() == _AP_W; } + + /// This checks to see if the value of this ap_private is the maximum signed + /// value for the ap_private's bit width. + /// @brief Determine if this is the largest signed value. 
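+ // --- Editor's note (illustrative, not part of the original header): the
+ // popcount-based value tests below have a simple small-width analogue.
+ // For a hypothetical 4-bit signed type, the largest signed value 0b0111
+ // has W-1 bits set and a clear sign bit; the smallest, 0b1000, has one
+ // bit set and the sign bit set. Assumes GCC/Clang __builtin_popcount:
+ //
+ // #include <cassert>
+ // int main() {
+ //   unsigned w = 4, v_max = 0x7, v_min = 0x8;
+ //   assert(__builtin_popcount(v_max) == (int)(w - 1) && !(v_max >> (w - 1)));
+ //   assert(__builtin_popcount(v_min) == 1 && (v_min >> (w - 1)));
+ //   return 0;
+ // }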
+ INLINE bool isMaxSignedValue() const { + return !isNegative() && countPopulation() == _AP_W - 1; + } + + /// This checks to see if the value of this ap_private is the minimum unsigned + /// value for the ap_private's bit width. + /// @brief Determine if this is the smallest unsigned value. + INLINE bool isMinValue() const { return countPopulation() == 0; } + + /// This checks to see if the value of this ap_private is the minimum signed + /// value for the ap_private's bit width. + /// @brief Determine if this is the smallest signed value. + INLINE bool isMinSignedValue() const { + return isNegative() && countPopulation() == 1; + } + + /// This function returns a pointer to the internal storage of the ap_private. + /// This is useful for writing out the ap_private in binary form without any + /// conversions. + INLINE const uint64_t* getRawData() const { return &pVal[0]; } + + // Square Root - this method computes and returns the square root of "this". + // Three mechanisms are used for computation. For small values (<= 5 bits), + // a table lookup is done. This gets some performance for common cases. For + // values using less than 52 bits, the value is converted to double and then + // the libc sqrt function is called. The result is rounded and then converted + // back to a uint64_t which is then used to construct the result. Finally, + // the Babylonian method for computing square roots is used. + INLINE ap_private sqrt() const { + // Determine the magnitude of the value. + uint32_t magnitude = getActiveBits(); + + // Use a fast table for some small values. This also gets rid of some + // rounding errors in libc sqrt for small values. + if (magnitude <= 5) { + static const uint8_t results[32] = { + /* 0 */ 0, + /* 1- 2 */ 1, 1, + /* 3- 6 */ 2, 2, 2, 2, + /* 7-12 */ 3, 3, 3, 3, 3, 3, + /* 13-20 */ 4, 4, 4, 4, 4, 4, 4, 4, + /* 21-30 */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + /* 31 */ 6}; + return ap_private<_AP_W, _AP_S>(/*BitWidth,*/ results[get_VAL()]); + } + + // If the magnitude of the value fits in less than 52 bits (the precision of + // an IEEE double precision floating point value), then we can use the + // libc sqrt function which will probably use a hardware sqrt computation. + // This should be faster than the algorithm below. + if (magnitude < 52) { +#ifdef _MSC_VER + // Amazingly, VC++ doesn't have round(). + return ap_private<_AP_W, _AP_S>(/*BitWidth,*/ + uint64_t(::sqrt(double(get_VAL()))) + + 0.5); +#else + return ap_private<_AP_W, _AP_S>(/*BitWidth,*/ + uint64_t( + ::round(::sqrt(double(get_VAL()))))); +#endif + } + + // Okay, all the short cuts are exhausted. We must compute it. The following + // is a classical Babylonian method for computing the square root. This code + // was adapted to APINt from a wikipedia article on such computations. + // See http://www.wikipedia.org/ and go to the page named + // Calculate_an_integer_square_root. + uint32_t nbits = BitWidth, i = 4; + ap_private<_AP_W, _AP_S> testy(16); + ap_private<_AP_W, _AP_S> x_old(/*BitWidth,*/ 1); + ap_private<_AP_W, _AP_S> x_new(0); + ap_private<_AP_W, _AP_S> two(/*BitWidth,*/ 2); + + // Select a good starting value using binary logarithms. 
+ for (;; i += 2, testy = testy.shl(2)) + if (i >= nbits || this->ule(testy)) { + x_old = x_old.shl(i / 2); + break; + } + + // Use the Babylonian method to arrive at the integer square root: + for (;;) { + x_new = (this->udiv(x_old) + x_old).udiv(two); + if (x_old.ule(x_new)) break; + x_old = x_new; + } + + // Make sure we return the closest approximation + // NOTE: The rounding calculation below is correct. It will produce an + // off-by-one discrepancy with results from pari/gp. That discrepancy has + // been + // determined to be a rounding issue with pari/gp as it begins to use a + // floating point representation after 192 bits. There are no discrepancies + // between this algorithm and pari/gp for bit widths < 192 bits. + ap_private<_AP_W, _AP_S> square(x_old * x_old); + ap_private<_AP_W, _AP_S> nextSquare((x_old + 1) * (x_old + 1)); + if (this->ult(square)) + return x_old; + else if (this->ule(nextSquare)) { + ap_private<_AP_W, _AP_S> midpoint((nextSquare - square).udiv(two)); + ap_private<_AP_W, _AP_S> offset(*this - square); + if (offset.ult(midpoint)) + return x_old; + else + return x_old + 1; + } else + assert(0 && "Error in ap_private<_AP_W, _AP_S>::sqrt computation"); + return x_old + 1; + } + + /// + /// @Assignment Operators + /// + /// @returns *this after assignment of RHS. + /// @brief Copy assignment operator. + INLINE ap_private& operator=(const ap_private& RHS) { + if (this != &RHS) memcpy(pVal, RHS.get_pVal(), _AP_N * APINT_WORD_SIZE); + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator=(const volatile ap_private& RHS) { + if (this != &RHS) + for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); + clearUnusedBits(); + return *this; + } + INLINE void operator=(const ap_private& RHS) volatile { + if (this != &RHS) + for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); + clearUnusedBits(); + } + INLINE void operator=(const volatile ap_private& RHS) volatile { + if (this != &RHS) + for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); + clearUnusedBits(); + } + + template + INLINE ap_private& operator=(const ap_private<_AP_W1, _AP_S1>& RHS) { + if (_AP_S1) + cpSextOrTrunc(RHS); + else + cpZextOrTrunc(RHS); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const volatile ap_private<_AP_W1, _AP_S1>& RHS) { + if (_AP_S1) + cpSextOrTrunc(RHS); + else + cpZextOrTrunc(RHS); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + *this = ap_private<_AP_W2, false>(op2); + return *this; + } + +#if 0 + template + INLINE ap_private& operator=(const ap_private<_AP_W1, _AP_S1, true>& RHS) { + static const uint64_t that_sign_ext_mask = (_AP_W1==APINT_BITS_PER_WORD)?0:~0ULL>>(_AP_W1%APINT_BITS_PER_WORD)<<(_AP_W1%APINT_BITS_PER_WORD); + if (RHS.isNegative()) { + pVal[0] = RHS.get_VAL() | that_sign_ext_mask; + memset(pVal+1,~0, APINT_WORD_SIZE*(_AP_N-1)); + } else { + pVal[0] = RHS.get_VAL(); + memset(pVal+1, 0, APINT_WORD_SIZE*(_AP_N-1)); + } + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const volatile ap_private<_AP_W1, _AP_S1, true>& RHS) { + static const uint64_t that_sign_ext_mask = (_AP_W1==APINT_BITS_PER_WORD)?0:~0ULL>>(_AP_W1%APINT_BITS_PER_WORD)<<(_AP_W1%APINT_BITS_PER_WORD); + if (RHS.isNegative()) { + pVal[0] = RHS.get_VAL() | that_sign_ext_mask; + memset(pVal+1,~0, APINT_WORD_SIZE*(_AP_N-1)); + } else { + pVal[0] = RHS.get_VAL(); + memset(pVal+1, 0, APINT_WORD_SIZE*(_AP_N-1)); + } + 
clearUnusedBits(); + return *this; + } +#endif + +/// from all c types. +#define ASSIGN_OP_FROM_INT(C_TYPE, _AP_W2, _AP_S2) \ + INLINE ap_private& operator=(const C_TYPE rhs) { \ + ap_private<(_AP_W2), (_AP_S2)> tmp = rhs; \ + operator=(tmp); \ + return *this; \ + } + + ASSIGN_OP_FROM_INT(bool, 1, false) + ASSIGN_OP_FROM_INT(char, 8, CHAR_IS_SIGNED) + ASSIGN_OP_FROM_INT(signed char, 8, true) + ASSIGN_OP_FROM_INT(unsigned char, 8, false) + ASSIGN_OP_FROM_INT(short, sizeof(short) * 8, true) + ASSIGN_OP_FROM_INT(unsigned short, sizeof(unsigned short) * 8, false) + ASSIGN_OP_FROM_INT(int, sizeof(int) * 8, true) + ASSIGN_OP_FROM_INT(unsigned int, sizeof(unsigned int) * 8, false) + ASSIGN_OP_FROM_INT(long, sizeof(long) * 8, true) + ASSIGN_OP_FROM_INT(unsigned long, sizeof(unsigned long) * 8, false) + ASSIGN_OP_FROM_INT(ap_slong, sizeof(ap_slong) * 8, true) + ASSIGN_OP_FROM_INT(ap_ulong, sizeof(ap_ulong) * 8, false) +#undef ASSIGN_OP_FROM_INT + + /// from c string. + // XXX this is a must, to prevent pointer being converted to bool. + INLINE ap_private& operator=(const char* s) { + ap_private tmp(s); // XXX direct initialization, as ctor is explicit. + operator=(tmp); + return *this; + } + + /// + /// @name Unary Operators + /// + /// @returns a new ap_private value representing *this incremented by one + /// @brief Postfix increment operator. + INLINE const ap_private operator++(int) { + ap_private API(*this); + ++(*this); + return API; + } + + /// @returns *this incremented by one + /// @brief Prefix increment operator. + INLINE ap_private& operator++() { + ap_private_ops::add_1(pVal, pVal, _AP_N, 1); + clearUnusedBits(); + return *this; + } + + /// @returns a new ap_private representing *this decremented by one. + /// @brief Postfix decrement operator. + INLINE const ap_private operator--(int) { + ap_private API(*this); + --(*this); + return API; + } + + /// @returns *this decremented by one. + /// @brief Prefix decrement operator. + INLINE ap_private& operator--() { + ap_private_ops::sub_1(pVal, _AP_N, 1); + clearUnusedBits(); + return *this; + } + + /// Performs a bitwise complement operation on this ap_private. + /// @returns an ap_private that is the bitwise complement of *this + /// @brief Unary bitwise complement operator. + INLINE ap_private<_AP_W + !_AP_S, true> operator~() const { + ap_private<_AP_W + !_AP_S, true> Result(*this); + Result.flip(); + return Result; + } + + /// Negates *this using two's complement logic. + /// @returns An ap_private value representing the negation of *this. + /// @brief Unary negation operator + INLINE typename RType<1, false>::minus operator-() const { + return ap_private<1, false>(0) - (*this); + } + + /// Performs logical negation operation on this ap_private. + /// @returns true if *this is zero, false otherwise. + /// @brief Logical negation operator. 
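+ // --- Editor's note (illustrative, not part of the original header):
+ // operator! below reports logical zero by scanning every word; the value
+ // is zero only if all _AP_N words are zero. Plain-array analogue:
+ //
+ // #include <cassert>
+ // #include <cstdint>
+ // int main() {
+ //   uint64_t pVal[2] = {0, 1};          // low word clear, high word set
+ //   bool logical_not = true;
+ //   for (int i = 0; i < 2; ++i)
+ //     if (pVal[i]) { logical_not = false; break; }
+ //   assert(!logical_not);               // non-zero value => !x is false
+ //   return 0;
+ // }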
+ INLINE bool operator!() const { + for (int i = 0; i < _AP_N; ++i) + if (pVal[i]) return false; + return true; + } + + template + INLINE ap_private<_AP_W, _AP_S || _AP_S1> And( + const ap_private<_AP_W, _AP_S1>& RHS) const { + return this->operator&(RHS); + } + template + INLINE ap_private Or(const ap_private<_AP_W, _AP_S1>& RHS) const { + return this->operator|(RHS); + } + template + INLINE ap_private Xor(const ap_private<_AP_W, _AP_S1>& RHS) const { + return this->operator^(RHS); + } + + INLINE ap_private Mul(const ap_private& RHS) const { + ap_private Result(*this); + Result *= RHS; + return Result; + } + + INLINE ap_private Add(const ap_private& RHS) const { + ap_private Result(0); + ap_private_ops::add(Result.get_pVal(), pVal, RHS.get_pVal(), _AP_N, _AP_N, + _AP_N, _AP_S, _AP_S); + Result.clearUnusedBits(); + return Result; + } + + INLINE ap_private Sub(const ap_private& RHS) const { + ap_private Result(0); + ap_private_ops::sub(Result.get_pVal(), pVal, RHS.get_pVal(), _AP_N, _AP_N, + _AP_N, _AP_S, _AP_S); + Result.clearUnusedBits(); + return Result; + } + + /// Arithmetic right-shift this ap_private by shiftAmt. + /// @brief Arithmetic right-shift function. + INLINE ap_private ashr(uint32_t shiftAmt) const { + assert(shiftAmt <= BitWidth && "Invalid shift amount, too big"); + // Handle a degenerate case + if (shiftAmt == 0) return ap_private(*this); + + // If all the bits were shifted out, the result is, technically, undefined. + // We return -1 if it was negative, 0 otherwise. We check this early to + // avoid + // issues in the algorithm below. + if (shiftAmt == BitWidth) { + if (isNegative()) + return ap_private(-1); + else + return ap_private(0); + } + + // Create some space for the result. + ap_private Retval(0); + uint64_t* val = Retval.get_pVal(); + + // Compute some values needed by the following shift algorithms + uint32_t wordShift = + shiftAmt % APINT_BITS_PER_WORD; // bits to shift per word + uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; // word offset for shift + uint32_t breakWord = _AP_N - 1 - offset; // last word affected + uint32_t bitsInWord = whichBit(BitWidth); // how many bits in last word? + if (bitsInWord == 0) bitsInWord = APINT_BITS_PER_WORD; + + // If we are shifting whole words, just move whole words + if (wordShift == 0) { + // Move the words containing significant bits + for (uint32_t i = 0; i <= breakWord; ++i) + val[i] = pVal[i + offset]; // move whole word + + // Adjust the top significant word for sign bit fill, if negative + if (isNegative()) + if (bitsInWord < APINT_BITS_PER_WORD) + val[breakWord] |= ~0ULL << (bitsInWord); // set high bits + } else { + // Shift the low order words + for (uint32_t i = 0; i < breakWord; ++i) { + // This combines the shifted corresponding word with the low bits from + // the next word (shifted into this word's high bits). + val[i] = ((pVal[i + offset]) >> (wordShift)); + val[i] |= ((pVal[i + offset + 1]) << (APINT_BITS_PER_WORD - wordShift)); + } + + // Shift the break word. In this case there are no bits from the next word + // to include in this word. + val[breakWord] = (pVal[breakWord + offset]) >> (wordShift); + + // Deal with sign extenstion in the break word, and possibly the word + // before + // it. 
+ if (isNegative()) { + if (wordShift > bitsInWord) { + if (breakWord > 0) + val[breakWord - 1] |= + ~0ULL << (APINT_BITS_PER_WORD - (wordShift - bitsInWord)); + val[breakWord] |= ~0ULL; + } else + val[breakWord] |= (~0ULL << (bitsInWord - wordShift)); + } + } + + // Remaining words are 0 or -1, just assign them. + uint64_t fillValue = (isNegative() ? ~0ULL : 0); + for (int i = breakWord + 1; i < _AP_N; ++i) val[i] = fillValue; + Retval.clearUnusedBits(); + return Retval; + } + + /// Logical right-shift this ap_private by shiftAmt. + /// @brief Logical right-shift function. + INLINE ap_private lshr(uint32_t shiftAmt) const { + // If all the bits were shifted out, the result is 0. This avoids issues + // with shifting by the size of the integer type, which produces undefined + // results. We define these "undefined results" to always be 0. + if (shiftAmt == BitWidth) return ap_private(0); + + // If none of the bits are shifted out, the result is *this. This avoids + // issues with shifting byt he size of the integer type, which produces + // undefined results in the code below. This is also an optimization. + if (shiftAmt == 0) return ap_private(*this); + + // Create some space for the result. + ap_private Retval(0); + uint64_t* val = Retval.get_pVal(); + + // If we are shifting less than a word, compute the shift with a simple + // carry + if (shiftAmt < APINT_BITS_PER_WORD) { + uint64_t carry = 0; + for (int i = _AP_N - 1; i >= 0; --i) { + val[i] = ((pVal[i]) >> (shiftAmt)) | carry; + carry = (pVal[i]) << (APINT_BITS_PER_WORD - shiftAmt); + } + Retval.clearUnusedBits(); + return Retval; + } + + // Compute some values needed by the remaining shift algorithms + uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD; + uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; + + // If we are shifting whole words, just move whole words + if (wordShift == 0) { + for (uint32_t i = 0; i < _AP_N - offset; ++i) val[i] = pVal[i + offset]; + for (uint32_t i = _AP_N - offset; i < _AP_N; i++) val[i] = 0; + Retval.clearUnusedBits(); + return Retval; + } + + // Shift the low order words + uint32_t breakWord = _AP_N - offset - 1; + for (uint32_t i = 0; i < breakWord; ++i) + val[i] = ((pVal[i + offset]) >> (wordShift)) | + ((pVal[i + offset + 1]) << (APINT_BITS_PER_WORD - wordShift)); + // Shift the break word. + val[breakWord] = (pVal[breakWord + offset]) >> (wordShift); + + // Remaining words are 0 + for (int i = breakWord + 1; i < _AP_N; ++i) val[i] = 0; + Retval.clearUnusedBits(); + return Retval; + } + + /// Left-shift this ap_private by shiftAmt. + /// @brief Left-shift function. + INLINE ap_private shl(uint32_t shiftAmt) const { + assert(shiftAmt <= BitWidth && "Invalid shift amount, too big"); + // If all the bits were shifted out, the result is 0. This avoids issues + // with shifting by the size of the integer type, which produces undefined + // results. We define these "undefined results" to always be 0. + if (shiftAmt == BitWidth) return ap_private(0); + + // If none of the bits are shifted out, the result is *this. This avoids a + // lshr by the words size in the loop below which can produce incorrect + // results. It also avoids the expensive computation below for a common + // case. + if (shiftAmt == 0) return ap_private(*this); + + // Create some space for the result. 
+    ap_private Retval(0);
+    uint64_t* val = Retval.get_pVal();
+    // If we are shifting less than a word, do it the easy way
+    if (shiftAmt < APINT_BITS_PER_WORD) {
+      uint64_t carry = 0;
+      for (int i = 0; i < _AP_N; i++) {
+        val[i] = ((pVal[i]) << (shiftAmt)) | carry;
+        carry = (pVal[i]) >> (APINT_BITS_PER_WORD - shiftAmt);
+      }
+      Retval.clearUnusedBits();
+      return Retval;
+    }
+
+    // Compute some values needed by the remaining shift algorithms
+    uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD;
+    uint32_t offset = shiftAmt / APINT_BITS_PER_WORD;
+
+    // If we are shifting whole words, just move whole words
+    if (wordShift == 0) {
+      for (uint32_t i = 0; i < offset; i++) val[i] = 0;
+      for (int i = offset; i < _AP_N; i++) val[i] = pVal[i - offset];
+      Retval.clearUnusedBits();
+      return Retval;
+    }
+
+    // Copy whole words from this to Result.
+    uint32_t i = _AP_N - 1;
+    for (; i > offset; --i)
+      val[i] = (pVal[i - offset]) << (wordShift) |
+               (pVal[i - offset - 1]) >> (APINT_BITS_PER_WORD - wordShift);
+    val[offset] = (pVal[0]) << (wordShift);
+    for (i = 0; i < offset; ++i) val[i] = 0;
+    Retval.clearUnusedBits();
+    return Retval;
+  }
+
+  INLINE ap_private rotl(uint32_t rotateAmt) const {
+    if (rotateAmt == 0) return ap_private(*this);
+    // Don't get too fancy, just use existing shift/or facilities. Note that
+    // shl() and lshr() return new values, so their results must be captured.
+    ap_private hi(*this);
+    ap_private lo(*this);
+    hi = hi.shl(rotateAmt);
+    lo = lo.lshr(BitWidth - rotateAmt);
+    return hi | lo;
+  }
+
+  INLINE ap_private rotr(uint32_t rotateAmt) const {
+    if (rotateAmt == 0) return ap_private(*this);
+    // Don't get too fancy, just use existing shift/or facilities. Note that
+    // shl() and lshr() return new values, so their results must be captured.
+    ap_private hi(*this);
+    ap_private lo(*this);
+    lo = lo.lshr(rotateAmt);
+    hi = hi.shl(BitWidth - rotateAmt);
+    return hi | lo;
+  }
+
+  /// Perform an unsigned divide operation on this ap_private by RHS. Both
+  /// this and RHS are treated as unsigned quantities for purposes of this
+  /// division.
+  /// @returns a new ap_private value containing the division result
+  /// @brief Unsigned division operation.
+  INLINE ap_private udiv(const ap_private& RHS) const {
+    // Get some facts about the LHS and RHS number of bits and words
+    uint32_t rhsBits = RHS.getActiveBits();
+    uint32_t rhsWords = !rhsBits ? 0 : (whichWord(rhsBits - 1) + 1);
+    assert(rhsWords && "Divided by zero???");
+    uint32_t lhsBits = this->getActiveBits();
+    uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1);
+
+    // Deal with some degenerate cases
+    if (!lhsWords)
+      // 0 / X ===> 0
+      return ap_private(0);
+    else if (lhsWords < rhsWords || this->ult(RHS)) {
+      // X / Y ===> 0, iff X < Y
+      return ap_private(0);
+    } else if (*this == RHS) {
+      // X / X ===> 1
+      return ap_private(1);
+    } else if (lhsWords == 1 && rhsWords == 1) {
+      // All high words are zero, just use native divide
+      return ap_private(this->pVal[0] / RHS.get_pVal(0));
+    }
+
+    // We have to compute it the hard way. Invoke the Knuth divide algorithm.
+    ap_private Quotient(0); // to hold result.
+    ap_private_ops::divide(*this, lhsWords, RHS, rhsWords, &Quotient,
+                           (ap_private*)0);
+    return Quotient;
+  }
+
+  /// Signed divide this ap_private by ap_private RHS.
+  /// @brief Signed division function for ap_private.
+  INLINE ap_private sdiv(const ap_private& RHS) const {
+    if (isNegative())
+      if (RHS.isNegative())
+        return (-(*this)).udiv(-RHS);
+      else
+        return -((-(*this)).udiv(RHS));
+    else if (RHS.isNegative())
+      return -(this->udiv((ap_private)(-RHS)));
+    return this->udiv(RHS);
+  }
+
+  /// Perform an unsigned remainder operation on this ap_private with RHS
+  /// being the divisor.
Both this and RHS are treated as unsigned quantities for purposes + /// of this operation. Note that this is a true remainder operation and not + /// a modulo operation because the sign follows the sign of the dividend + /// which is *this. + /// @returns a new ap_private value containing the remainder result + /// @brief Unsigned remainder operation. + INLINE ap_private urem(const ap_private& RHS) const { + // Get some facts about the LHS + uint32_t lhsBits = getActiveBits(); + uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + + // Get some facts about the RHS + uint32_t rhsBits = RHS.getActiveBits(); + uint32_t rhsWords = !rhsBits ? 0 : (whichWord(rhsBits - 1) + 1); + assert(rhsWords && "Performing remainder operation by zero ???"); + + // Check the degenerate cases + if (lhsWords == 0) { + // 0 % Y ===> 0 + return ap_private(0); + } else if (lhsWords < rhsWords || this->ult(RHS)) { + // X % Y ===> X, iff X < Y + return *this; + } else if (*this == RHS) { + // X % X == 0; + return ap_private(0); + } else if (lhsWords == 1) { + // All high words are zero, just use native remainder + return ap_private(pVal[0] % RHS.get_pVal(0)); + } + + // We have to compute it the hard way. Invoke the Knuth divide algorithm. + ap_private Remainder(0); + ap_private_ops::divide(*this, lhsWords, RHS, rhsWords, (ap_private*)(0), + &Remainder); + return Remainder; + } + + INLINE ap_private urem(uint64_t RHS) const { + // Get some facts about the LHS + uint32_t lhsBits = getActiveBits(); + uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + // Get some facts about the RHS + uint32_t rhsWords = 1; //! rhsBits ? 0 : (ap_private<_AP_W, + //! _AP_S>::whichWord(rhsBits - 1) + 1); + assert(rhsWords && "Performing remainder operation by zero ???"); + // Check the degenerate cases + if (lhsWords == 0) { + // 0 % Y ===> 0 + return ap_private(0); + } else if (lhsWords < rhsWords || this->ult(RHS)) { + // X % Y ===> X, iff X < Y + return *this; + } else if (*this == RHS) { + // X % X == 0; + return ap_private(0); + } else if (lhsWords == 1) { + // All high words are zero, just use native remainder + return ap_private(pVal[0] % RHS); + } + + // We have to compute it the hard way. Invoke the Knuth divide algorithm. + ap_private Remainder(0); + divide(*this, lhsWords, RHS, (ap_private*)(0), &Remainder); + return Remainder; + } + + /// Signed remainder operation on ap_private. + /// @brief Function for signed remainder operation. + INLINE ap_private srem(const ap_private& RHS) const { + if (isNegative()) { + ap_private lhs = -(*this); + if (RHS.isNegative()) { + ap_private rhs = -RHS; + return -(lhs.urem(rhs)); + } else + return -(lhs.urem(RHS)); + } else if (RHS.isNegative()) { + ap_private rhs = -RHS; + return this->urem(rhs); + } + return this->urem(RHS); + } + + /// Signed remainder operation on ap_private. + /// @brief Function for signed remainder operation. + INLINE ap_private srem(int64_t RHS) const { + if (isNegative()) + if (RHS < 0) + return -((-(*this)).urem(-RHS)); + else + return -((-(*this)).urem(RHS)); + else if (RHS < 0) + return this->urem(-RHS); + return this->urem(RHS); + } + + /// Compares this ap_private with RHS for the validity of the equality + /// relationship. + /// @returns true if *this == Val + /// @brief Equality comparison. + template + INLINE bool eq(const ap_private<_AP_W, _AP_S1>& RHS) const { + return (*this) == RHS; + } + + /// Compares this ap_private with RHS for the validity of the inequality + /// relationship. 
+  /// @returns true if *this != Val
+  /// @brief Inequality comparison
+  template <bool _AP_S1>
+  INLINE bool ne(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return !((*this) == RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// the validity of the less-than relationship.
+  /// @returns true if *this < RHS when both are considered unsigned.
+  /// @brief Unsigned less than comparison
+  template <bool _AP_S1>
+  INLINE bool ult(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    // Get active bit length of both operands
+    uint32_t n1 = getActiveBits();
+    uint32_t n2 = RHS.getActiveBits();
+
+    // If magnitude of LHS is less than RHS, return true.
+    if (n1 < n2) return true;
+
+    // If magnitude of RHS is greater than LHS, return false.
+    if (n2 < n1) return false;
+
+    // If they both fit in a word, just compare the low order word
+    if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD)
+      return pVal[0] < RHS.get_pVal(0);
+
+    // Otherwise, compare all words
+    uint32_t topWord = whichWord(AESL_std::max(n1, n2) - 1);
+    for (int i = topWord; i >= 0; --i) {
+      if (pVal[i] > RHS.get_pVal(i)) return false;
+      if (pVal[i] < RHS.get_pVal(i)) return true;
+    }
+    return false;
+  }
+
+  INLINE bool ult(uint64_t RHS) const {
+    // Get active bit length of both operands
+    uint32_t n1 = getActiveBits();
+    uint32_t n2 =
+        64 - ap_private_ops::CountLeadingZeros_64(RHS); // RHS.getActiveBits();
+
+    // If magnitude of LHS is less than RHS, return true.
+    if (n1 < n2) return true;
+
+    // If magnitude of RHS is greater than LHS, return false.
+    if (n2 < n1) return false;
+
+    // If they both fit in a word, just compare the low order word
+    if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD)
+      return pVal[0] < RHS;
+    // One of the branches above must have been taken, so this point is
+    // unreachable; fail loudly in debug builds and return a defined value
+    // otherwise.
+    assert(0);
+    return false;
+  }
+
+  template <bool _AP_S1>
+  INLINE bool slt(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    ap_private lhs(*this);
+    ap_private<_AP_W, _AP_S1> rhs(RHS);
+    bool lhsNeg = isNegative();
+    bool rhsNeg = rhs.isNegative();
+    if (lhsNeg) {
+      // Sign bit is set so perform two's complement to make it positive
+      lhs.flip();
+      lhs++;
+    }
+    if (rhsNeg) {
+      // Sign bit is set so perform two's complement to make it positive
+      rhs.flip();
+      rhs++;
+    }
+
+    // Now we have unsigned values to compare so do the comparison if necessary
+    // based on the negativeness of the values.
+    if (lhsNeg)
+      if (rhsNeg)
+        return lhs.ugt(rhs);
+      else
+        return true;
+    else if (rhsNeg)
+      return false;
+    else
+      return lhs.ult(rhs);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// validity of the less-or-equal relationship.
+  /// @returns true if *this <= RHS when both are considered unsigned.
+  /// @brief Unsigned less or equal comparison
+  template <bool _AP_S1>
+  INLINE bool ule(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return ult(RHS) || eq(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the less-or-equal relationship.
+  /// @returns true if *this <= RHS when both are considered signed.
+  /// @brief Signed less or equal comparison
+  template <bool _AP_S1>
+  INLINE bool sle(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return slt(RHS) || eq(RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// the validity of the greater-than relationship.
+  /// @returns true if *this > RHS when both are considered unsigned.
+  /// @brief Unsigned greater than comparison
+  template <bool _AP_S1>
+  INLINE bool ugt(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return !ult(RHS) && !eq(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// the validity of the greater-than relationship.
+  /// @returns true if *this > RHS when both are considered signed.
+  /// @brief Signed greater than comparison
+  template <bool _AP_S1>
+  INLINE bool sgt(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return !slt(RHS) && !eq(RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// validity of the greater-or-equal relationship.
+  /// @returns true if *this >= RHS when both are considered unsigned.
+  /// @brief Unsigned greater or equal comparison
+  template <bool _AP_S1>
+  INLINE bool uge(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return !ult(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the greater-or-equal relationship.
+  /// @returns true if *this >= RHS when both are considered signed.
+  /// @brief Signed greater or equal comparison
+  template <bool _AP_S1>
+  INLINE bool sge(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return !slt(RHS);
+  }
+
+  // Sign extend to a new width.
+  template <int _AP_W1, bool _AP_S1>
+  INLINE void cpSext(const ap_private<_AP_W1, _AP_S1>& that) {
+    assert(_AP_W1 < BitWidth && "Invalid ap_private SignExtend request");
+    assert(_AP_W1 <= MAX_INT_BITS && "Too many bits");
+    // If the sign bit isn't set, this is the same as zext.
+    if (!that.isNegative()) {
+      cpZext(that);
+      return;
+    }
+
+    // The sign bit is set. First, get some facts
+    enum { wordBits = _AP_W1 % APINT_BITS_PER_WORD };
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
+    // Mask the high order word appropriately
+    if (_AP_N1 == _AP_N) {
+      enum { newWordBits = _AP_W % APINT_BITS_PER_WORD };
+      // The extension is contained to the wordsBefore-1th word.
+      static const uint64_t mask = wordBits ? (~0ULL << (wordBits)) : 0ULL;
+      for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i);
+      pVal[_AP_N - 1] |= mask;
+      return;
+    }
+
+    enum { newWordBits = _AP_W % APINT_BITS_PER_WORD };
+    // The extension is contained to the wordsBefore-1th word.
+    static const uint64_t mask = wordBits ? (~0ULL << (wordBits)) : 0ULL;
+    int i;
+    for (i = 0; i < _AP_N1; ++i) pVal[i] = that.get_pVal(i);
+    pVal[i - 1] |= mask;
+    for (; i < _AP_N - 1; i++) pVal[i] = ~0ULL;
+    pVal[i] = ~0ULL;
+    clearUnusedBits();
+    return;
+  }
+
+  // Zero extend to a new width.
+  template <int _AP_W1, bool _AP_S1>
+  INLINE void cpZext(const ap_private<_AP_W1, _AP_S1>& that) {
+    assert(_AP_W1 < BitWidth && "Invalid ap_private ZeroExtend request");
+    assert(_AP_W1 <= MAX_INT_BITS && "Too many bits");
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
+    int i = 0;
+    for (; i < _AP_N1; ++i) pVal[i] = that.get_pVal(i);
+    for (; i < _AP_N; ++i) pVal[i] = 0;
+    clearUnusedBits();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE void cpZextOrTrunc(const ap_private<_AP_W1, _AP_S1>& that) {
+    if (BitWidth > _AP_W1)
+      cpZext(that);
+    else {
+      for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i);
+      clearUnusedBits();
+    }
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE void cpSextOrTrunc(const ap_private<_AP_W1, _AP_S1>& that) {
+    if (BitWidth > _AP_W1)
+      cpSext(that);
+    else {
+      for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i);
+      clearUnusedBits();
+    }
+  }
+
+  /// @}
+  /// @name Value Characterization Functions
+  /// @{
+
+  /// @returns the total number of bits.
+ INLINE uint32_t getBitWidth() const { return BitWidth; } + + /// Here one word's bitwidth equals to that of uint64_t. + /// @returns the number of words to hold the integer value of this ap_private. + /// @brief Get the number of words. + INLINE uint32_t getNumWords() const { + return (BitWidth + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD; + } + + /// This function returns the number of active bits which is defined as the + /// bit width minus the number of leading zeros. This is used in several + /// computations to see how "wide" the value is. + /// @brief Compute the number of active bits in the value + INLINE uint32_t getActiveBits() const { + uint32_t bits = BitWidth - countLeadingZeros(); + return bits ? bits : 1; + } + + /// This method attempts to return the value of this ap_private as a zero + /// extended + /// uint64_t. The bitwidth must be <= 64 or the value must fit within a + /// uint64_t. Otherwise an assertion will result. + /// @brief Get zero extended value + INLINE uint64_t getZExtValue() const { + assert(getActiveBits() <= 64 && "Too many bits for uint64_t"); + return *pVal; + } + + /// This method attempts to return the value of this ap_private as a sign + /// extended + /// int64_t. The bit width must be <= 64 or the value must fit within an + /// int64_t. Otherwise an assertion will result. + /// @brief Get sign extended value + INLINE int64_t getSExtValue() const { + assert(getActiveBits() <= 64 && "Too many bits for int64_t"); + return int64_t(pVal[0]); + } + + /// This method determines how many bits are required to hold the ap_private + /// equivalent of the string given by \p str of length \p slen. + /// @brief Get bits required for string value. + INLINE static uint32_t getBitsNeeded(const char* str, uint32_t slen, + uint8_t radix) { + assert(str != 0 && "Invalid value string"); + assert(slen > 0 && "Invalid string length"); + + // Each computation below needs to know if its negative + uint32_t isNegative = str[0] == '-'; + if (isNegative) { + slen--; + str++; + } + // For radixes of power-of-two values, the bits required is accurately and + // easily computed + if (radix == 2) return slen + isNegative; + if (radix == 8) return slen * 3 + isNegative; + if (radix == 16) return slen * 4 + isNegative; + + // Otherwise it must be radix == 10, the hard case + assert(radix == 10 && "Invalid radix"); + + // Convert to the actual binary value. + // ap_private<_AP_W, _AP_S> tmp(sufficient, str, slen, radix); + + // Compute how many bits are required. + // return isNegative + tmp.logBase2() + 1; + return isNegative + slen * 4; + } + + /// countLeadingZeros - This function is an ap_private version of the + /// countLeadingZeros_{32,64} functions in MathExtras.h. It counts the number + /// of zeros from the most significant bit to the first one bit. + /// @returns BitWidth if the value is zero. + /// @returns the number of zeros from the most significant bit to the first + /// one bits. + INLINE uint32_t countLeadingZeros() const { + enum { + msw_bits = (BitWidth % APINT_BITS_PER_WORD) + ? 
(BitWidth % APINT_BITS_PER_WORD)
+                     : APINT_BITS_PER_WORD,
+      excessBits = APINT_BITS_PER_WORD - msw_bits
+    };
+    uint32_t Count = ap_private_ops::CountLeadingZeros_64(pVal[_AP_N - 1]);
+    if (Count >= excessBits) Count -= excessBits;
+    if (!pVal[_AP_N - 1]) {
+      for (int i = _AP_N - 1; i; --i) {
+        if (!pVal[i - 1])
+          Count += APINT_BITS_PER_WORD;
+        else {
+          Count += ap_private_ops::CountLeadingZeros_64(pVal[i - 1]);
+          break;
+        }
+      }
+    }
+    return Count;
+  }
+
+  /// countLeadingOnes - This function counts the number of contiguous 1 bits
+  /// in the high order bits. The count stops when the first 0 bit is reached.
+  /// @returns 0 if the high order bit is not set
+  /// @returns the number of 1 bits from the most significant to the least
+  /// @brief Count the number of leading one bits.
+  INLINE uint32_t countLeadingOnes() const {
+    if (isSingleWord())
+      return countLeadingOnes_64(get_VAL(), APINT_BITS_PER_WORD - BitWidth);
+
+    uint32_t highWordBits = BitWidth % APINT_BITS_PER_WORD;
+    uint32_t shift =
+        (highWordBits == 0 ? 0 : APINT_BITS_PER_WORD - highWordBits);
+    int i = _AP_N - 1;
+    uint32_t Count = countLeadingOnes_64(get_pVal(i), shift);
+    if (Count == highWordBits) {
+      for (i--; i >= 0; --i) {
+        if (get_pVal(i) == ~0ULL)
+          Count += APINT_BITS_PER_WORD;
+        else {
+          Count += countLeadingOnes_64(get_pVal(i), 0);
+          break;
+        }
+      }
+    }
+    return Count;
+  }
+
+  /// countTrailingZeros - This function is an ap_private version of the
+  /// countTrailingZeros_{32,64} functions in MathExtras.h. It counts
+  /// the number of zeros from the least significant bit to the first set bit.
+  /// @returns BitWidth if the value is zero.
+  /// @returns the number of zeros from the least significant bit to the first
+  /// one bit.
+  /// @brief Count the number of trailing zero bits.
+  INLINE uint32_t countTrailingZeros() const {
+    uint32_t Count = 0;
+    uint32_t i = 0;
+    for (; i < _AP_N && get_pVal(i) == 0; ++i) Count += APINT_BITS_PER_WORD;
+    if (i < _AP_N) Count += ap_private_ops::CountTrailingZeros_64(get_pVal(i));
+    return AESL_std::min(Count, BitWidth);
+  }
+
+  /// countPopulation - This function is an ap_private version of the
+  /// countPopulation_{32,64} functions in MathExtras.h. It counts the number
+  /// of 1 bits in the ap_private value.
+  /// @returns 0 if the value is zero.
+  /// @returns the number of set bits.
+  /// @brief Count the number of bits set.
+  INLINE uint32_t countPopulation() const {
+    uint32_t Count = 0;
+    for (int i = 0; i < _AP_N - 1; ++i)
+      Count += ap_private_ops::CountPopulation_64(pVal[i]);
+    Count += ap_private_ops::CountPopulation_64(pVal[_AP_N - 1] & mask);
+    return Count;
+  }
+
+  /// @}
+  /// @name Conversion Functions
+  /// @{
+
+  /// This is used internally to convert an ap_private to a string.
+  /// @brief Converts an ap_private to a std::string
+  INLINE std::string toString(uint8_t radix, bool wantSigned) const;
+
+  /// Considers the ap_private to be unsigned and converts it into a string
+  /// in the radix given. The radix can be 2, 8, 10 or 16.
+  /// @returns a character interpretation of the ap_private
+  /// @brief Convert unsigned ap_private to string representation.
+  INLINE std::string toStringUnsigned(uint8_t radix = 10) const {
+    return toString(radix, false);
+  }
+
+  /// Considers the ap_private to be signed and converts it into a string
+  /// in the radix given. The radix can be 2, 8, 10 or 16.
+  /// @returns a character interpretation of the ap_private
+  /// @brief Convert signed ap_private to string representation.
+ INLINE std::string toStringSigned(uint8_t radix = 10) const { + return toString(radix, true); + } + + /// @brief Converts this ap_private to a double value. + INLINE double roundToDouble(bool isSigned) const { + // Handle the simple case where the value is contained in one uint64_t. + if (isSingleWord() || getActiveBits() <= APINT_BITS_PER_WORD) { + uint64_t val = pVal[0]; + if (isSigned) { + int64_t sext = ((int64_t(val)) << (64 - BitWidth)) >> (64 - BitWidth); + return double(sext); + } else + return double(val); + } + + // Determine if the value is negative. + bool isNeg = isSigned ? (*this)[BitWidth - 1] : false; + + // Construct the absolute value if we're negative. + ap_private<_AP_W, _AP_S> Tmp(isNeg ? -(*this) : (*this)); + + // Figure out how many bits we're using. + uint32_t n = Tmp.getActiveBits(); + + // The exponent (without bias normalization) is just the number of bits + // we are using. Note that the sign bit is gone since we constructed the + // absolute value. + uint64_t exp = n; + + // Return infinity for exponent overflow + if (exp > 1023) { + if (!isSigned || !isNeg) + return std::numeric_limits::infinity(); + else + return -std::numeric_limits::infinity(); + } + exp += 1023; // Increment for 1023 bias + + // Number of bits in mantissa is 52. To obtain the mantissa value, we must + // extract the high 52 bits from the correct words in pVal. + uint64_t mantissa; + unsigned hiWord = whichWord(n - 1); + if (hiWord == 0) { + mantissa = Tmp.get_pVal(0); + if (n > 52) + (mantissa) >>= (n - 52); // shift down, we want the top 52 bits. + } else { + assert(hiWord > 0 && "High word is negative?"); + uint64_t hibits = (Tmp.get_pVal(hiWord)) + << (52 - n % APINT_BITS_PER_WORD); + uint64_t lobits = + (Tmp.get_pVal(hiWord - 1)) >> (11 + n % APINT_BITS_PER_WORD); + mantissa = hibits | lobits; + } + + // The leading bit of mantissa is implicit, so get rid of it. + uint64_t sign = isNeg ? (1ULL << (APINT_BITS_PER_WORD - 1)) : 0; + union { + double __D; + uint64_t __I; + } __T; + __T.__I = sign | ((exp) << 52) | mantissa; + return __T.__D; + } + + /// @brief Converts this unsigned ap_private to a double value. + INLINE double roundToDouble() const { return roundToDouble(false); } + + /// @brief Converts this signed ap_private to a double value. + INLINE double signedRoundToDouble() const { return roundToDouble(true); } + + /// The conversion does not do a translation from integer to double, it just + /// re-interprets the bits as a double. Note that it is valid to do this on + /// any bit width. Exactly 64 bits will be translated. + /// @brief Converts ap_private bits to a double + INLINE double bitsToDouble() const { + union { + uint64_t __I; + double __D; + } __T; + __T.__I = pVal[0]; + return __T.__D; + } + + /// The conversion does not do a translation from integer to float, it just + /// re-interprets the bits as a float. Note that it is valid to do this on + /// any bit width. Exactly 32 bits will be translated. + /// @brief Converts ap_private bits to a double + INLINE float bitsToFloat() const { + union { + uint32_t __I; + float __F; + } __T; + __T.__I = uint32_t(pVal[0]); + return __T.__F; + } + + /// The conversion does not do a translation from double to integer, it just + /// re-interprets the bits of the double. Note that it is valid to do this on + /// any bit width but bits from V may get truncated. + /// @brief Converts a double to ap_private bits. 
+  INLINE ap_private& doubleToBits(double __V) {
+    union {
+      uint64_t __I;
+      double __D;
+    } __T;
+    __T.__D = __V;
+    pVal[0] = __T.__I;
+    return *this;
+  }
+
+  /// The conversion does not do a translation from float to integer, it just
+  /// re-interprets the bits of the float. Note that it is valid to do this on
+  /// any bit width but bits from V may get truncated.
+  /// @brief Converts a float to ap_private bits.
+  INLINE ap_private& floatToBits(float __V) {
+    union {
+      uint32_t __I;
+      float __F;
+    } __T;
+    __T.__F = __V;
+    pVal[0] = __T.__I;
+    return *this;
+  }
+
+  // Reduce operation
+  //-----------------------------------------------------------
+  INLINE bool and_reduce() const { return isMaxValue(); }
+
+  INLINE bool nand_reduce() const { return isMinValue(); }
+
+  INLINE bool or_reduce() const { return (bool)countPopulation(); }
+
+  INLINE bool nor_reduce() const { return countPopulation() == 0; }
+
+  INLINE bool xor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? true : false;
+  }
+
+  INLINE bool xnor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? false : true;
+  }
+  INLINE std::string to_string(uint8_t radix = 16, bool sign = false) const {
+    return toString(radix, radix == 10 ? _AP_S : sign);
+  }
+}; // End of class ap_private <_AP_W, _AP_S, false>
+
+namespace ap_private_ops {
+
+enum { APINT_BITS_PER_WORD = 64 };
+template <int _AP_W, bool _AP_S>
+INLINE bool operator==(uint64_t V1, const ap_private<_AP_W, _AP_S>& V2) {
+  return V2 == V1;
+}
+
+template <int _AP_W, bool _AP_S>
+INLINE bool operator!=(uint64_t V1, const ap_private<_AP_W, _AP_S>& V2) {
+  return V2 != V1;
+}
+
+template <int index, int _AP_W, bool _AP_S>
+INLINE bool get(const ap_private<_AP_W, _AP_S>& a) {
+  static const uint64_t mask = 1ULL << (index & 0x3f);
+  return ((mask & a.get_pVal((index) >> 6)) != 0);
+}
+
+template <int msb_index, int lsb_index, int _AP_W, bool _AP_S>
+INLINE void set(ap_private<_AP_W, _AP_S>& a,
+                const ap_private& mark1 = 0,
+                const ap_private& mark2 = 0) {
+  enum {
+    APINT_BITS_PER_WORD = 64,
+    lsb_word = lsb_index / APINT_BITS_PER_WORD,
+    msb_word = msb_index / APINT_BITS_PER_WORD,
+    msb = msb_index % APINT_BITS_PER_WORD,
+    lsb = lsb_index % APINT_BITS_PER_WORD
+  };
+  if (msb_word == lsb_word) {
+    const uint64_t mask = ~0ULL >>
+                          (lsb) << (APINT_BITS_PER_WORD - msb + lsb - 1) >>
+                          (APINT_BITS_PER_WORD - msb - 1);
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) | mask);
+    a.get_pVal(msb_word) |= mask;
+  } else {
+    const uint64_t lsb_mask = ~0ULL >> (lsb) << (lsb);
+    const uint64_t msb_mask = ~0ULL << (APINT_BITS_PER_WORD - msb - 1) >>
+                              (APINT_BITS_PER_WORD - msb - 1);
+    // a.set_pVal(lsb_word, a.get_pVal(lsb_word) | lsb_mask);
+    a.get_pVal(lsb_word) |= lsb_mask;
+    for (int i = lsb_word + 1; i < msb_word; i++) {
+      a.set_pVal(i, ~0ULL);
+      // a.get_pVal(i)=0;
+    }
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) | msb_mask);
+
+    a.get_pVal(msb_word) |= msb_mask;
+  }
+  a.clearUnusedBits();
+}
+
+template <int msb_index, int lsb_index, int _AP_W, bool _AP_S>
+INLINE void clear(ap_private<_AP_W, _AP_S>& a,
+                  const ap_private& mark1 = 0,
+                  const ap_private& mark2 = 0) {
+  enum {
+    APINT_BITS_PER_WORD = 64,
+    lsb_word = lsb_index / APINT_BITS_PER_WORD,
+    msb_word = msb_index / APINT_BITS_PER_WORD,
+    msb = msb_index % APINT_BITS_PER_WORD,
+    lsb = lsb_index % APINT_BITS_PER_WORD
+  };
+  if (msb_word == lsb_word) {
+    const uint64_t mask =
+        ~(~0ULL >> (lsb) << (APINT_BITS_PER_WORD - msb + lsb - 1) >>
+          (APINT_BITS_PER_WORD - msb - 1));
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) & mask);
+    a.get_pVal(msb_word) &= mask;
+  } else {
+    const uint64_t lsb_mask = ~(~0ULL >> (lsb) << (lsb));
+    const uint64_t msb_mask = ~(~0ULL << (APINT_BITS_PER_WORD -
msb - 1) >> + (APINT_BITS_PER_WORD - msb - 1)); + // a.set_pVal(lsb_word, a.get_pVal(lsb_word) & lsb_mask); + a.get_pVal(lsb_word) &= lsb_mask; + for (int i = lsb_word + 1; i < msb_word; i++) { + // a.set_pVal(i, 0); + a.get_pVal(i) = 0; + } + // a.set_pVal(msb_word, a.get_pVal(msb_word) & msb_mask); + a.get_pVal(msb_word) &= msb_mask; + } + a.clearUnusedBits(); +} + +template +INLINE void set(ap_private<_AP_W, _AP_S>& a, + const ap_private& mark = 0) { + enum { APINT_BITS_PER_WORD = 64, word = index / APINT_BITS_PER_WORD }; + static const uint64_t mask = 1ULL << (index % APINT_BITS_PER_WORD); + // a.set_pVal(word, a.get_pVal(word) | mask); + a.get_pVal(word) |= mask; + a.clearUnusedBits(); +} + +template +INLINE void clear(ap_private<_AP_W, _AP_S>& a, + const ap_private& mark = 0) { + enum { APINT_BITS_PER_WORD = 64, word = index / APINT_BITS_PER_WORD }; + static const uint64_t mask = ~(1ULL << (index % APINT_BITS_PER_WORD)); + // a.set_pVal(word, a.get_pVal(word) & mask); + a.get_pVal(word) &= mask; + a.clearUnusedBits(); +} + +} // End of ap_private_ops namespace + +template +INLINE std::string ap_private<_AP_W, _AP_S, false>::toString( + uint8_t radix, bool wantSigned) const { + assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) && + "Radix should be 2, 8, 10, or 16!"); + static const char* digits[] = {"0", "1", "2", "3", "4", "5", "6", "7", + "8", "9", "A", "B", "C", "D", "E", "F"}; + std::string result; + + if (radix != 10) { + // For the 2, 8 and 16 bit cases, we can just shift instead of divide + // because the number of bits per digit (1,3 and 4 respectively) divides + // equaly. We just shift until there value is zero. + + // First, check for a zero value and just short circuit the logic below. + if (*this == (uint64_t)(0)) + result = "0"; + else { + ap_private<_AP_W, false> tmp(*this); + size_t insert_at = 0; + bool leading_zero = true; + if (wantSigned && isNegative()) { + // They want to print the signed version and it is a negative value + // Flip the bits and add one to turn it into the equivalent positive + // value and put a '-' in the result. + tmp.flip(); + tmp++; + tmp.clearUnusedBitsToZero(); + result = "-"; + insert_at = 1; + leading_zero = false; + } + switch (radix) { + case 2: + result += "0b"; + break; + case 8: + result += "0o"; + break; + case 16: + result += "0x"; + break; + default: + assert("invalid radix" && 0); + } + insert_at += 2; + // Just shift tmp right for each digit width until it becomes zero + uint32_t shift = (radix == 16 ? 4 : (radix == 8 ? 3 : 1)); + uint64_t mask = radix - 1; + ap_private<_AP_W, false> zero(0); + unsigned bits = 0; + while (tmp.ne(zero)) { + uint64_t digit = tmp.get_VAL() & mask; + result.insert(insert_at, digits[digit]); + tmp = tmp.lshr(shift); + ++bits; + } + bits *= shift; + if (bits < _AP_W && leading_zero) result.insert(insert_at, digits[0]); + } + return result; + } + + ap_private<_AP_W, false> tmp(*this); + ap_private<_AP_W, false> divisor(radix); + ap_private<_AP_W, false> zero(0); + size_t insert_at = 0; + if (wantSigned && isNegative()) { + // They want to print the signed version and it is a negative value + // Flip the bits and add one to turn it into the equivalent positive + // value and put a '-' in the result. 
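+    // Worked example (illustrative): for _AP_W = 8 and the value -5, tmp
+    // holds 0xFB; flipping gives 0x04 and incrementing gives 0x05, so the
+    // digits of 5 are emitted after the leading '-'.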
+ tmp.flip(); + tmp++; + tmp.clearUnusedBitsToZero(); + result = "-"; + insert_at = 1; + } + if (tmp == ap_private<_AP_W, false>(0)) + result = "0"; + else + while (tmp.ne(zero)) { + ap_private<_AP_W, false> APdigit(0); + ap_private<_AP_W, false> tmp2(0); + ap_private_ops::divide(tmp, tmp.getNumWords(), divisor, + divisor.getNumWords(), &tmp2, &APdigit); + uint64_t digit = APdigit.getZExtValue(); + assert(digit < radix && "divide failed"); + result.insert(insert_at, digits[digit]); + tmp = tmp2; + } + + return result; +} // End of ap_private<_AP_W, _AP_S, false>::toString() + +template +std::ostream &operator<<(std::ostream &os, const ap_private<_AP_W, _AP_S> &x) { + std::ios_base::fmtflags ff = std::cout.flags(); + if (ff & std::cout.hex) { + os << x.toString(16, false); // don't print sign + } else if (ff & std::cout.oct) { + os << x.toString(8, false); // don't print sign + } else { + os << x.toString(10, _AP_S); + } + return os; +} + +// ------------------------------------------------------------ // +// XXX moved here from ap_int_sim.h XXX // +// ------------------------------------------------------------ // + +/// Concatination reference. +/// Proxy class which allows concatination to be used as rvalue(for reading) and +/// lvalue(for writing) +// ---------------------------------------------------------------- +// template +// struct ap_concat_ref { +//#ifdef _MSC_VER +//#pragma warning(disable : 4521 4522) +//#endif +// enum { +// _AP_WR = _AP_W1 + _AP_W2, +// }; +// _AP_T1& mbv1; +// _AP_T2& mbv2; +// +// INLINE ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& +// ref) +// : mbv1(ref.mbv1), mbv2(ref.mbv2) {} +// +// INLINE ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {} +// +// template +// INLINE ap_concat_ref& operator=(const ap_private<_AP_W3, _AP_S3>& val) { +// ap_private<_AP_W1 + _AP_W2, false> vval(val); +// int W_ref1 = mbv1.length(); +// int W_ref2 = mbv2.length(); +// ap_private<_AP_W1, false> mask1(-1); +// mask1 >>= _AP_W1 - W_ref1; +// ap_private<_AP_W2, false> mask2(-1); +// mask2 >>= _AP_W2 - W_ref2; +// mbv1.set(ap_private<_AP_W1, false>((vval >> W_ref2) & mask1)); +// mbv2.set(ap_private<_AP_W2, false>(vval & mask2)); +// return *this; +// } +// +// INLINE ap_concat_ref& operator=(unsigned long long val) { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); +// return operator=(tmpVal); +// } +// +// template +// INLINE ap_concat_ref& operator=( +// const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); +// return operator=(tmpVal); +// } +// +// INLINE ap_concat_ref& operator=( +// const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); +// return operator=(tmpVal); +// } +// +// template +// INLINE ap_concat_ref& operator=(const _private_bit_ref<_AP_W3, _AP_S3>& +// val) { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); +// return operator=(tmpVal); +// } +// +// template +// INLINE ap_concat_ref& operator=(const _private_range_ref<_AP_W3, _AP_S3>& +// val) { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); +// return operator=(tmpVal); +// } +// +// template +// INLINE ap_concat_ref& operator=( +// const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) +// { +// return operator=((const ap_private<_AP_W3, false>)(val)); +// } +// +// template +// INLINE ap_concat_ref& operator=( +// const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& +// val) { +// return 
operator=(val.to_ap_private()); +// } +// +// template +// INLINE ap_concat_ref& operator=( +// const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { +// return operator=((unsigned long long)(bool)(val)); +// } +// +// INLINE operator ap_private<_AP_WR, false>() const { return get(); } +// +// INLINE operator unsigned long long() const { return get().to_uint64(); } +// +// template +// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// _private_range_ref<_AP_W3, _AP_S3> > +// operator,(const _private_range_ref<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// _private_range_ref<_AP_W3, _AP_S3> >( +// *this, const_cast<_private_range_ref<_AP_W3, _AP_S3>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_private<_AP_W3, _AP_S3> +// > +// operator,(ap_private<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// ap_private<_AP_W3, _AP_S3> >(*this, a2); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_private<_AP_W3, _AP_S3> +// > +// operator,(const ap_private<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// ap_private<_AP_W3, _AP_S3> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, 1, _private_bit_ref<_AP_W3, +// _AP_S3> > +// operator,(const _private_bit_ref<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, 1, _private_bit_ref<_AP_W3, +// _AP_S3> >( +// *this, const_cast<_private_bit_ref<_AP_W3, _AP_S3>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, +// ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > +// operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, +// ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_WR, ap_concat_ref, _AP_W3, +// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > +// operator,( +// const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) +// { +// return ap_concat_ref< +// _AP_WR, ap_concat_ref, _AP_W3, +// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( +// *this, +// const_cast< +// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, +// _AP_N3>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_WR, ap_concat_ref, 1, +// af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> +// > +// operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, +// _AP_N3> +// &a2) { +// return ap_concat_ref< +// _AP_WR, ap_concat_ref, 1, +// af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( +// *this, +// const_cast&>( +// a2)); +// } +// +// template +// INLINE ap_private operator&( +// const ap_private<_AP_W3, _AP_S3>& a2) { +// return get() & a2; +// } +// +// template +// INLINE ap_private operator|( +// const ap_private<_AP_W3, _AP_S3>& a2) { +// return get() | a2; +// } +// +// template +// INLINE ap_private operator^( +// const ap_private<_AP_W3, _AP_S3>& a2) { +// return ap_private(get() ^ a2); +// } +// +// INLINE const ap_private<_AP_WR, false> get() const { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal = +// ap_private<_AP_W1 + _AP_W2, false>(mbv1.get()); +// ap_private<_AP_W1 + _AP_W2, false> tmpVal2 = +// ap_private<_AP_W1 + _AP_W2, false>(mbv2.get()); +// int W_ref2 = mbv2.length(); +// tmpVal <<= 
W_ref2; +// tmpVal |= tmpVal2; +// return tmpVal; +// } +// +// INLINE const ap_private<_AP_WR, false> get() { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal = +// ap_private<_AP_W1 + _AP_W2, false>(mbv1.get()); +// ap_private<_AP_W1 + _AP_W2, false> tmpVal2 = +// ap_private<_AP_W1 + _AP_W2, false>(mbv2.get()); +// int W_ref2 = mbv2.length(); +// tmpVal <<= W_ref2; +// tmpVal |= tmpVal2; +// return tmpVal; +// } +// +// template +// INLINE void set(const ap_private<_AP_W3, false>& val) { +// ap_private<_AP_W1 + _AP_W2, false> vval(val); +// int W_ref1 = mbv1.length(); +// int W_ref2 = mbv2.length(); +// ap_private<_AP_W1, false> mask1(-1); +// mask1 >>= _AP_W1 - W_ref1; +// ap_private<_AP_W2, false> mask2(-1); +// mask2 >>= _AP_W2 - W_ref2; +// mbv1.set(ap_private<_AP_W1, false>((vval >> W_ref2) & mask1)); +// mbv2.set(ap_private<_AP_W2, false>(vval & mask2)); +// } +// +// INLINE int length() const { return mbv1.length() + mbv2.length(); } +// +// INLINE std::string to_string(uint8_t radix = 2) const { +// return get().to_string(radix); +// } +//}; // struct ap_concat_ref. + +/// Range(slice) reference +/// Proxy class, which allows part selection to be used as rvalue(for reading) +/// and lvalue(for writing) +//------------------------------------------------------------ +template +struct _private_range_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + ap_private<_AP_W, _AP_S>& d_bv; + int l_index; + int h_index; + + public: + /// copy ctor. + INLINE _private_range_ref(const _private_range_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + /// direct ctor. + INLINE _private_range_ref(ap_private<_AP_W, _AP_S>* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) { + _AP_WARNING(h < 0 || l < 0, + "Higher bound (%d) and lower bound (%d) cannot be " + "negative.", + h, l); + _AP_WARNING(h >= _AP_W || l >= _AP_W, + "Higher bound (%d) or lower bound (%d) out of range (%d).", h, l, + _AP_W); + } + + /// compound or assignment. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator|=( + const _private_range_ref<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), + "Bitsize mismach for ap_private<>.range() &= " + "ap_private<>.range()."); + this->d_bv |= ref.d_bv; + return *this; + } + + /// compound or assignment with root type. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator|=( + const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index + 1) != _AP_W2, + "Bitsize mismach for ap_private<>.range() |= _AP_ROOT_TYPE<>."); + this->d_bv |= ref.V; + return *this; + } + + /// compound and assignment. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator&=( + const _private_range_ref<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), + "Bitsize mismach for ap_private<>.range() &= " + "ap_private<>.range()."); + this->d_bv &= ref.d_bv; + return *this; + }; + + /// compound and assignment with root type. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator&=( + const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index + 1) != _AP_W2, + "Bitsize mismach for ap_private<>.range() &= _AP_ROOT_TYPE<>."); + this->d_bv &= ref.V; + return *this; + } + + /// compound xor assignment. 
+ template + INLINE _private_range_ref<_AP_W, _AP_S>& operator^=( + const _private_range_ref<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), + "Bitsize mismach for ap_private<>.range() ^= " + "ap_private<>.range()."); + this->d_bv ^= ref.d_bv; + return *this; + }; + + /// compound xor assignment with root type. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator^=( + const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index + 1) != _AP_W2, + "Bitsize mismach for ap_private<>.range() ^= _AP_ROOT_TYPE<>."); + this->d_bv ^= ref.V; + return *this; + } + + /// @name convertors. + // @{ + INLINE operator ap_private<_AP_W, false>() const { + ap_private<_AP_W, false> val(0); + if (h_index >= l_index) { + if (_AP_W > 64) { + val = d_bv; + ap_private<_AP_W, false> mask(-1); + mask >>= _AP_W - (h_index - l_index + 1); + val >>= l_index; + val &= mask; + } else { + const static uint64_t mask = (~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0)); + val = (d_bv >> l_index) & (mask >> (_AP_W - (h_index - l_index + 1))); + } + } else { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + if ((d_bv)[j]) val.set(i); + } + return val; + } + + INLINE operator unsigned long long() const { return to_uint64(); } + // @} + + template + INLINE _private_range_ref& operator=(const ap_private<_AP_W2, _AP_S2>& val) { + ap_private<_AP_W, false> vval = ap_private<_AP_W, false>(val); + if (l_index > h_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + (vval)[i] ? d_bv.set(j) : d_bv.clear(j); + } else { + if (_AP_W > 64) { + ap_private<_AP_W, false> mask(-1); + if (l_index > 0) { + mask <<= l_index; + vval <<= l_index; + } + if (h_index < _AP_W - 1) { + ap_private<_AP_W, false> mask2(-1); + mask2 >>= _AP_W - h_index - 1; + mask &= mask2; + vval &= mask2; + } + mask.flip(); + d_bv &= mask; + d_bv |= vval; + } else { + unsigned shift = 64 - _AP_W; + uint64_t mask = ~0ULL >> (shift); + if (l_index > 0) { + vval = mask & vval << l_index; + mask = mask & mask << l_index; + } + if (h_index < _AP_W - 1) { + uint64_t mask2 = mask; + mask2 >>= (_AP_W - h_index - 1); + mask &= mask2; + vval &= mask2; + } + mask = ~mask; + d_bv &= mask; + d_bv |= vval; + } + } + return *this; + } // operator=(const ap_private<>&) + + INLINE _private_range_ref& operator=(unsigned long long val) { + const ap_private<_AP_W, _AP_S> vval = val; + return operator=(vval); + } + + template + INLINE _private_range_ref& operator=( + const _private_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((unsigned long long)(bool)val); + } + + template + INLINE _private_range_ref& operator=( + const _private_range_ref<_AP_W2, _AP_S2>& val) { + const ap_private<_AP_W, false> tmpVal(val); + return operator=(tmpVal); + } + +// template +// INLINE _private_range_ref& operator=( +// const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { +// const ap_private<_AP_W, false> tmpVal(val); +// return operator=(tmpVal); +// } + + // TODO from ap_int_base, ap_bit_ref and ap_range_ref. 
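+  // Usage sketch (illustrative; the names and widths are hypothetical, and
+  // it assumes the owning object hands out this proxy through a range(h, l)
+  // accessor as ap_int_base does):
+  //   ap_private<80, false> x(0);
+  //   x.range(7, 4) = 0xAULL;                  // write via operator=(unsigned long long)
+  //   unsigned long long lo = x.range(3, 0);   // read via the conversion operator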
+ + template + INLINE _private_range_ref& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(val.to_ap_int_base().V); + } + + template + INLINE _private_range_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(val.operator ap_int_base<_AP_W2, false>().V); + } + + template + INLINE _private_range_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((unsigned long long)(bool)val); + } + +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >( +// *this, const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// INLINE +// ap_concat_ref<_AP_W, _private_range_ref, _AP_W, ap_private<_AP_W, _AP_S> > +// operator,(ap_private<_AP_W, _AP_S>& a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W, +// ap_private<_AP_W, _AP_S> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// *this, const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_W, _private_range_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,( +// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, _private_range_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// *this, +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, _private_range_ref, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) { +// return ap_concat_ref< +// _AP_W, _private_range_ref, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// *this, +// const_cast&>( +// a2)); +// } + + template + INLINE bool operator==(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs == rhs; + } + + template + INLINE bool operator!=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs != rhs; + } + + template + INLINE bool operator>(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + 
ap_private<_AP_W2, false> rhs = op2.get(); + return lhs > rhs; + } + + template + INLINE bool operator>=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs >= rhs; + } + + template + INLINE bool operator<(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs < rhs; + } + + template + INLINE bool operator<=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs <= rhs; + } + + template + INLINE void set(const ap_private<_AP_W2, false>& val) { + ap_private<_AP_W, _AP_S> vval = val; + if (l_index > h_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + (vval)[i] ? d_bv.set(j) : d_bv.clear(j); + } else { + if (_AP_W > 64) { + ap_private<_AP_W, _AP_S> mask(-1); + if (l_index > 0) { + ap_private<_AP_W, false> mask1(-1); + mask1 >>= _AP_W - l_index; + mask1.flip(); + mask = mask1; + // vval&=mask1; + vval <<= l_index; + } + if (h_index < _AP_W - 1) { + ap_private<_AP_W, false> mask2(-1); + mask2 <<= h_index + 1; + mask2.flip(); + mask &= mask2; + vval &= mask2; + } + mask.flip(); + d_bv &= mask; + d_bv |= vval; + } else { + uint64_t mask = ~0ULL >> (64 - _AP_W); + if (l_index > 0) { + uint64_t mask1 = mask; + mask1 = mask & (mask1 >> (_AP_W - l_index)); + vval = mask & (vval << l_index); + mask = ~mask1 & mask; + // vval&=mask1; + } + if (h_index < _AP_W - 1) { + uint64_t mask2 = ~0ULL >> (64 - _AP_W); + mask2 = mask & (mask2 << (h_index + 1)); + mask &= ~mask2; + vval &= ~mask2; + } + d_bv &= (~mask & (~0ULL >> (64 - _AP_W))); + d_bv |= vval; + } + } + } + + INLINE ap_private<_AP_W, false> get() const { + ap_private<_AP_W, false> val(0); + if (h_index < l_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + if ((d_bv)[j]) val.set(i); + } else { + val = d_bv; + val >>= l_index; + if (h_index < _AP_W - 1) { + if (_AP_W <= 64) { + const static uint64_t mask = + (~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0)); + val &= (mask >> (_AP_W - (h_index - l_index + 1))); + } else { + ap_private<_AP_W, false> mask(-1); + mask >>= _AP_W - (h_index - l_index + 1); + val &= mask; + } + } + } + return val; + } + + INLINE ap_private<_AP_W, false> get() { + ap_private<_AP_W, false> val(0); + if (h_index < l_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + if ((d_bv)[j]) val.set(i); + } else { + val = d_bv; + val >>= l_index; + if (h_index < _AP_W - 1) { + if (_AP_W <= 64) { + static const uint64_t mask = ~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0); + return val &= ((mask) >> (_AP_W - (h_index - l_index + 1))); + } else { + ap_private<_AP_W, false> mask(-1); + mask >>= _AP_W - (h_index - l_index + 1); + val &= mask; + } + } + } + return val; + } + + INLINE int length() const { + return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1; + } + + INLINE int to_int() const { + ap_private<_AP_W, false> val = get(); + return val.to_int(); + } + + INLINE unsigned int to_uint() const { + ap_private<_AP_W, false> val = get(); + return val.to_uint(); + } + + INLINE long to_long() const { + ap_private<_AP_W, false> val = get(); + return val.to_long(); + } + + INLINE unsigned long to_ulong() const { + ap_private<_AP_W, false> val = get(); + return val.to_ulong(); + } + + INLINE ap_slong to_int64() const { + ap_private<_AP_W, false> val = get(); + return val.to_int64(); + } + + INLINE ap_ulong to_uint64() const { + ap_private<_AP_W, false> val = get(); + return val.to_uint64(); + } + + INLINE std::string to_string(uint8_t radix = 2) const { + return get().to_string(radix); + } + + INLINE bool and_reduce() { + bool ret = true; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) ret &= d_bv[i]; + return ret; + } + + INLINE bool or_reduce() { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) ret |= d_bv[i]; + return ret; + } + + INLINE bool xor_reduce() { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) ret ^= d_bv[i]; + return ret; + } +}; // struct _private_range_ref. + +/// Bit reference +/// Proxy class, which allows bit selection to be used as rvalue(for reading) +/// and lvalue(for writing) +//-------------------------------------------------------------- +template +struct _private_bit_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + ap_private<_AP_W, _AP_S>& d_bv; + int d_index; + + public: + // copy ctor. + INLINE _private_bit_ref(const _private_bit_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) {} + + // director ctor. 
+ INLINE _private_bit_ref(ap_private<_AP_W, _AP_S>& bv, int index = 0) + : d_bv(bv), d_index(index) { + _AP_WARNING(d_index < 0, "Index of bit vector (%d) cannot be negative.\n", + d_index); + _AP_WARNING(d_index >= _AP_W, + "Index of bit vector (%d) out of range (%d).\n", d_index, _AP_W); + } + + INLINE operator bool() const { return d_bv.get_bit(d_index); } + + INLINE bool to_bool() const { return operator bool(); } + + template + INLINE _private_bit_ref& operator=(const T& val) { + if (!!val) + d_bv.set(d_index); + else + d_bv.clear(d_index); + return *this; + } + +// template +// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2, ap_private<_AP_W2, +// _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, _AP_W2, ap_private<_AP_W2, +// _AP_S2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), a2); +// } +// +// template +// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2, +// _private_range_ref<_AP_W2, +// _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, _AP_W2, +// _private_range_ref<_AP_W2, +// _AP_S2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref<_AP_W2, +// _AP_S2> > operator,( +// const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// INLINE ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref> +// operator,( +// const _private_bit_ref &a2) const { +// return ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref>( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast<_private_bit_ref&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// 1, _private_bit_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2> +// &a2) const { +// return ap_concat_ref< +// 1, _private_bit_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<1, _private_bit_ref, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2> +// &a2) const { +// return ap_concat_ref<1, _private_bit_ref, 1, af_bit_ref<_AP_W2, +// _AP_I2, _AP_S2, +// _AP_Q2, _AP_O2, +// _AP_N2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast&>( +// a2)); +// } + + template + INLINE bool operator==(const _private_bit_ref<_AP_W2, _AP_S2>& op) const { + return get() == op.get(); + } + + template + INLINE bool operator!=(const _private_bit_ref<_AP_W2, _AP_S2>& 
op) const { + return get() != op.get(); + } + + INLINE bool get() const { return operator bool(); } + + // template + // INLINE void set(const ap_private<_AP_W3, false>& val) { + // operator=(val); + // } + + // INLINE bool operator~() const { + // bool bit = (d_bv)[d_index]; + // return bit ? false : true; + // } + + INLINE int length() const { return 1; } + + // INLINE std::string to_string() const { + // bool val = get(); + // return val ? "1" : "0"; + // } + +}; // struct _private_bit_ref. + +// char a[100]; +// char* ptr = a; +// ap_int<2> n = 3; +// char* ptr2 = ptr + n*2; +// avoid ambiguous errors +#define OP_BIN_MIX_PTR(BIN_OP) \ + template \ + INLINE PTR_TYPE* operator BIN_OP(PTR_TYPE* i_op, \ + const ap_private<_AP_W, _AP_S>& op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE PTR_TYPE* operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, \ + PTR_TYPE* i_op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return op2 BIN_OP i_op; \ + } + +OP_BIN_MIX_PTR(+) +OP_BIN_MIX_PTR(-) +#undef OP_BIN_MIX_PTR + +// float OP ap_int +// when ap_int's width > 64, then trunc ap_int to ap_int<64> +#define OP_BIN_MIX_FLOAT(BIN_OP, C_TYPE) \ + template \ + INLINE C_TYPE operator BIN_OP(C_TYPE i_op, \ + const ap_private<_AP_W, _AP_S>& op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE C_TYPE operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, \ + C_TYPE i_op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return op2 BIN_OP i_op; \ + } + +#define OPS_MIX_FLOAT(C_TYPE) \ + OP_BIN_MIX_FLOAT(*, C_TYPE) \ + OP_BIN_MIX_FLOAT(/, C_TYPE) \ + OP_BIN_MIX_FLOAT(+, C_TYPE) \ + OP_BIN_MIX_FLOAT(-, C_TYPE) + +OPS_MIX_FLOAT(float) +OPS_MIX_FLOAT(double) +#undef OP_BIN_MIX_FLOAT +#undef OPS_MIX_FLOAT + +/// Operators mixing Integers with AP_Int +// ---------------------------------------------------------------- + +// partially specialize template argument _AP_C in order that: +// for _AP_W > 64, we will explicitly convert operand with native data type +// into corresponding ap_private +// for _AP_W <= 64, we will implicitly convert operand with ap_private into +// (unsigned) long long +#define OP_BIN_MIX_INT(BIN_OP, C_TYPE, _AP_WI, _AP_SI, RTYPE) \ + template \ + INLINE \ + typename ap_private<_AP_WI, _AP_SI>::template RType<_AP_W, _AP_S>::RTYPE \ + operator BIN_OP(C_TYPE i_op, const ap_private<_AP_W, _AP_S>& op) { \ + return ap_private<_AP_WI, _AP_SI>(i_op).operator BIN_OP(op); \ + } \ + template \ + INLINE \ + typename ap_private<_AP_W, _AP_S>::template RType<_AP_WI, _AP_SI>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, C_TYPE i_op) { \ + return op.operator BIN_OP(ap_private<_AP_WI, _AP_SI>(i_op)); \ + } + +#define OP_REL_MIX_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(const ap_private<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return op.operator REL_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const ap_private<_AP_W, _AP_S, false>& op) { \ + return ap_private<_AP_W2, _AP_S2>(op2).operator REL_OP(op); \ + } + +#define OP_ASSIGN_MIX_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ + return op.operator ASSIGN_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } + +#define OP_BIN_SHIFT_INT(BIN_OP, C_TYPE, _AP_WI, _AP_SI, RTYPE) \ + template \ 
+ C_TYPE operator BIN_OP(C_TYPE i_op, \ + const ap_private<_AP_W, _AP_S, false>& op) { \ + return i_op BIN_OP(op.get_VAL()); \ + } \ + template \ + INLINE \ + typename ap_private<_AP_W, _AP_S>::template RType<_AP_WI, _AP_SI>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, C_TYPE i_op) { \ + return op.operator BIN_OP(i_op); \ + } + +#define OP_ASSIGN_RSHIFT_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ + op = op.operator>>(op2); \ + return op; \ + } + +#define OP_ASSIGN_LSHIFT_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ + op = op.operator<<(op2); \ + return op; \ + } + +#define OPS_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ + OP_BIN_MIX_INT(*, C_TYPE, (_AP_W2), (_AP_S2), mult) \ + OP_BIN_MIX_INT(+, C_TYPE, (_AP_W2), (_AP_S2), plus) \ + OP_BIN_MIX_INT(-, C_TYPE, (_AP_W2), (_AP_S2), minus) \ + OP_BIN_MIX_INT(/, C_TYPE, (_AP_W2), (_AP_S2), div) \ + OP_BIN_MIX_INT(%, C_TYPE, (_AP_W2), (_AP_S2), mod) \ + OP_BIN_MIX_INT(&, C_TYPE, (_AP_W2), (_AP_S2), logic) \ + OP_BIN_MIX_INT(|, C_TYPE, (_AP_W2), (_AP_S2), logic) \ + OP_BIN_MIX_INT (^, C_TYPE, (_AP_W2), (_AP_S2), logic) \ + OP_BIN_SHIFT_INT(>>, C_TYPE, (_AP_W2), (_AP_S2), arg1) \ + OP_BIN_SHIFT_INT(<<, C_TYPE, (_AP_W2), (_AP_S2), arg1) \ + \ + OP_ASSIGN_MIX_INT(+=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(-=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(*=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(/=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(%=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(&=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(|=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(^=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_RSHIFT_INT(>>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_LSHIFT_INT(<<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + \ + OP_REL_MIX_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) + +OPS_MIX_INT(bool, 1, false) +OPS_MIX_INT(char, 8, CHAR_IS_SIGNED) +OPS_MIX_INT(signed char, 8, true) +OPS_MIX_INT(unsigned char, 8, false) +OPS_MIX_INT(short, sizeof(short) * 8, true) +OPS_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) +OPS_MIX_INT(int, sizeof(int) * 8, true) +OPS_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) +OPS_MIX_INT(long, sizeof(long) * 8, true) +OPS_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) +OPS_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) +OPS_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) + +#undef OP_BIN_MIX_INT +#undef OP_BIN_SHIFT_INT +#undef OP_ASSIGN_MIX_INT +#undef OP_ASSIGN_RSHIFT_INT +#undef OP_ASSIGN_LSHIFT_INT +#undef OP_REL_MIX_INT +#undef OPS_MIX_INT + +#define OP_BIN_MIX_RANGE(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const _private_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<_AP_W1, false>(op1).operator BIN_OP(op2); \ + } \ + template \ + INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const 
_private_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator BIN_OP(ap_private<_AP_W2, false>(op2)); \ + } + +#define OP_ASSIGN_MIX_RANGE(ASSIGN_OP) \ + template \ + INLINE ap_private<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator ASSIGN_OP(ap_private<_AP_W2, false>(op2)); \ + } \ + template \ + INLINE _private_range_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + _private_range_ref<_AP_W1, _AP_S1>& op1, \ + ap_private<_AP_W2, _AP_S2>& op2) { \ + ap_private<_AP_W1, false> tmp(op1); \ + tmp.operator ASSIGN_OP(op2); \ + op1 = tmp; \ + return op1; \ + } + +#define OP_REL_MIX_RANGE(REL_OP) \ + template \ + INLINE bool operator REL_OP(const _private_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<_AP_W1, false>(op1).operator REL_OP(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator REL_OP(op2.operator ap_private<_AP_W2, false>()); \ + } + +OP_BIN_MIX_RANGE(+, plus) +OP_BIN_MIX_RANGE(-, minus) +OP_BIN_MIX_RANGE(*, mult) +OP_BIN_MIX_RANGE(/, div) +OP_BIN_MIX_RANGE(%, mod) +OP_BIN_MIX_RANGE(&, logic) +OP_BIN_MIX_RANGE(|, logic) +OP_BIN_MIX_RANGE(^, logic) +OP_BIN_MIX_RANGE(>>, arg1) +OP_BIN_MIX_RANGE(<<, arg1) +#undef OP_BIN_MIX_RANGE + +OP_ASSIGN_MIX_RANGE(+=) +OP_ASSIGN_MIX_RANGE(-=) +OP_ASSIGN_MIX_RANGE(*=) +OP_ASSIGN_MIX_RANGE(/=) +OP_ASSIGN_MIX_RANGE(%=) +OP_ASSIGN_MIX_RANGE(&=) +OP_ASSIGN_MIX_RANGE(|=) +OP_ASSIGN_MIX_RANGE(^=) +OP_ASSIGN_MIX_RANGE(>>=) +OP_ASSIGN_MIX_RANGE(<<=) +#undef OP_ASSIGN_MIX_RANGE + +OP_REL_MIX_RANGE(>) +OP_REL_MIX_RANGE(<) +OP_REL_MIX_RANGE(>=) +OP_REL_MIX_RANGE(<=) +OP_REL_MIX_RANGE(==) +OP_REL_MIX_RANGE(!=) +#undef OP_REL_MIX_RANGE + +#define OP_BIN_MIX_BIT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_private<1, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP(const _private_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<1, false>(op1).operator BIN_OP(op2); \ + } \ + template \ + INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<1, false>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator BIN_OP(ap_private<1, false>(op2)); \ + } + +#define OP_ASSIGN_MIX_BIT(ASSIGN_OP) \ + template \ + INLINE ap_private<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_private<_AP_W1, _AP_S1>& op1, \ + _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator ASSIGN_OP(ap_private<1, false>(op2)); \ + } \ + template \ + INLINE _private_bit_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + _private_bit_ref<_AP_W1, _AP_S1>& op1, \ + ap_private<_AP_W2, _AP_S2>& op2) { \ + ap_private<1, false> tmp(op1); \ + tmp.operator ASSIGN_OP(op2); \ + op1 = tmp; \ + return op1; \ + } + +#define OP_REL_MIX_BIT(REL_OP) \ + template \ + INLINE bool operator REL_OP(const _private_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<_AP_W1, false>(op1).operator REL_OP(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator REL_OP(ap_private<1, false>(op2)); \ + } + +OP_ASSIGN_MIX_BIT(+=) +OP_ASSIGN_MIX_BIT(-=) +OP_ASSIGN_MIX_BIT(*=) +OP_ASSIGN_MIX_BIT(/=) +OP_ASSIGN_MIX_BIT(%=) +OP_ASSIGN_MIX_BIT(&=) 
+OP_ASSIGN_MIX_BIT(|=) +OP_ASSIGN_MIX_BIT(^=) +OP_ASSIGN_MIX_BIT(>>=) +OP_ASSIGN_MIX_BIT(<<=) +#undef OP_ASSIGN_MIX_BIT + +OP_BIN_MIX_BIT(+, plus) +OP_BIN_MIX_BIT(-, minus) +OP_BIN_MIX_BIT(*, mult) +OP_BIN_MIX_BIT(/, div) +OP_BIN_MIX_BIT(%, mod) +OP_BIN_MIX_BIT(&, logic) +OP_BIN_MIX_BIT(|, logic) +OP_BIN_MIX_BIT(^, logic) +OP_BIN_MIX_BIT(>>, arg1) +OP_BIN_MIX_BIT(<<, arg1) +#undef OP_BIN_MIX_BIT + +OP_REL_MIX_BIT(>) +OP_REL_MIX_BIT(<) +OP_REL_MIX_BIT(<=) +OP_REL_MIX_BIT(>=) +OP_REL_MIX_BIT(==) +OP_REL_MIX_BIT(!=) +#undef OP_REL_MIX_BIT + +#define REF_REL_OP_MIX_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(const _private_range_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return (ap_private<_AP_W, false>(op)) \ + . \ + operator REL_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const _private_range_ref<_AP_W, _AP_S>& op) { \ + return ap_private<_AP_W2, _AP_S2>(op2).operator REL_OP( \ + ap_private<_AP_W, false>(op)); \ + } \ + template \ + INLINE bool operator REL_OP(const _private_bit_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return (bool(op))REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const _private_bit_ref<_AP_W, _AP_S>& op) { \ + return op2 REL_OP(bool(op)); \ + } + +#define REF_REL_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ + REF_REL_OP_MIX_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_REL_MIX_INT(bool, 1, false) +REF_REL_MIX_INT(char, 8, CHAR_IS_SIGNED) +REF_REL_MIX_INT(signed char, 8, true) +REF_REL_MIX_INT(unsigned char, 8, false) +REF_REL_MIX_INT(short, sizeof(short) * 8, true) +REF_REL_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) +REF_REL_MIX_INT(int, sizeof(int) * 8, true) +REF_REL_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) +REF_REL_MIX_INT(long, sizeof(long) * 8, true) +REF_REL_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) +REF_REL_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) +REF_REL_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) +#undef REF_REL_OP_MIX_INT +#undef REF_REL_MIX_INT + +#define REF_BIN_OP_MIX_INT(BIN_OP, RTYPE, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE \ + typename ap_private<_AP_W, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP(const _private_range_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return (ap_private<_AP_W, false>(op)) \ + . 
\ + operator BIN_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } \ + template \ + INLINE \ + typename ap_private<_AP_W2, _AP_S2>::template RType<_AP_W, false>::RTYPE \ + operator BIN_OP(C_TYPE op2, \ + const _private_range_ref<_AP_W, _AP_S>& op) { \ + return ap_private<_AP_W2, _AP_S2>(op2).operator BIN_OP( \ + ap_private<_AP_W, false>(op)); \ + } + +#define REF_BIN_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ + REF_BIN_OP_MIX_INT(+, plus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(-, minus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(*, mult, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(/, div, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(%, mod, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(&, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(|, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(^, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(>>, arg1, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(<<, arg1, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_BIN_MIX_INT(bool, 1, false) +REF_BIN_MIX_INT(char, 8, CHAR_IS_SIGNED) +REF_BIN_MIX_INT(signed char, 8, true) +REF_BIN_MIX_INT(unsigned char, 8, false) +REF_BIN_MIX_INT(short, sizeof(short) * 8, true) +REF_BIN_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) +REF_BIN_MIX_INT(int, sizeof(int) * 8, true) +REF_BIN_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) +REF_BIN_MIX_INT(long, sizeof(long) * 8, true) +REF_BIN_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) +REF_BIN_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) +REF_BIN_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) +#undef REF_BIN_OP_MIX_INT +#undef REF_BIN_MIX_INT + +#define REF_BIN_OP(BIN_OP, RTYPE) \ + template \ + INLINE \ + typename ap_private<_AP_W, false>::template RType<_AP_W2, false>::RTYPE \ + operator BIN_OP(const _private_range_ref<_AP_W, _AP_S>& lhs, \ + const _private_range_ref<_AP_W2, _AP_S2>& rhs) { \ + return ap_private<_AP_W, false>(lhs).operator BIN_OP( \ + ap_private<_AP_W2, false>(rhs)); \ + } + +REF_BIN_OP(+, plus) +REF_BIN_OP(-, minus) +REF_BIN_OP(*, mult) +REF_BIN_OP(/, div) +REF_BIN_OP(%, mod) +REF_BIN_OP(&, logic) +REF_BIN_OP(|, logic) +REF_BIN_OP(^, logic) +REF_BIN_OP(>>, arg1) +REF_BIN_OP(<<, arg1) +#undef REF_BIN_OP + +//************************************************************************ +// Implement +// ap_private = ap_concat_ref OP ap_concat_ref +// for operators +, -, *, /, %, >>, <<, &, |, ^ +// Without these operators the operands are converted to int64 and +// larger results lose informations (higher order bits). +// +// operand OP +// / | +// left-concat right-concat +// / | / | +// +// +// _AP_LW1, _AP_LT1 (width and type of left-concat's left side) +// _AP_LW2, _AP_LT2 (width and type of left-concat's right side) +// Similarly for RHS of operand OP: _AP_RW1, AP_RW2, _AP_RT1, _AP_RT2 +// +// In Verilog 2001 result of concatenation is always unsigned even +// when both sides are signed. +//************************************************************************ + +#endif // ifndef __AP_PRIVATE_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/hls_stream.h b/hls4ml/templates/vivado/ap_types/hls_stream.h index f516c39e08..317125d351 100644 --- a/hls4ml/templates/vivado/ap_types/hls_stream.h +++ b/hls4ml/templates/vivado/ap_types/hls_stream.h @@ -1,263 +1,263 @@ -/* -#- (c) Copyright 2011-2018 Xilinx, Inc. All rights reserved. -#- -#- This file contains confidential and proprietary information -#- of Xilinx, Inc. and is protected under U.S. 
and
-#- international copyright and other intellectual property
-#- laws.
[whitespace-only diff: the remainder of the Xilinx proprietary notice and the
Apache-2.0 license header, together with the entire C-simulation model of
hls::stream (blocking read()/write(), nonblocking read_nb()/write_nb(),
empty()/full()/size(), and the optional HLS_STREAM_THREAD_SAFE mutex and
condition-variable guards), is deleted and re-added with identical visible
content -- a line-ending/formatting normalization with no functional change]
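Since the collapsed diff above is the only place the csim hls::stream API appears in this part of the patch series, a usage sketch may help. This is an illustrative stand-alone program, not part of the patch, and it assumes the compiler include path points at hls4ml/templates/vivado/ap_types so that hls_stream.h resolves.

#include <iostream>
#include "hls_stream.h"

int main() {
    hls::stream<int> fifo("example_fifo"); // named constructor from the model above

    fifo.write(1); // blocking write: push_back on the internal std::deque
    fifo << 2;     // operator<< forwards to write()

    int a = fifo.read(); // blocking read: front() + pop_front()
    int b;
    fifo >> b;           // operator>> forwards to read()

    int c;
    bool got = fifo.read_nb(c); // nonblocking read on an empty stream:
                                // returns false, c is value-initialized
    std::cout << a << " " << b << " " << got << std::endl; // prints: 1 2 0

    // full() is hard-wired to false in csim, so write_nb() always succeeds
    bool ok = fifo.write_nb(42);
    std::cout << fifo.size() << " " << ok << std::endl; // prints: 1 1
    return 0;
}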
diff --git a/hls4ml/templates/vivado/ap_types/utils/x_hls_utils.h b/hls4ml/templates/vivado/ap_types/utils/x_hls_utils.h
index 3e751c36bf..759000ed78 100644
--- a/hls4ml/templates/vivado/ap_types/utils/x_hls_utils.h
+++ b/hls4ml/templates/vivado/ap_types/utils/x_hls_utils.h
@@ -1,80 +1,80 @@
[whitespace-only diff: the hls::numeric_limits<T> specializations for ap_fixed,
ap_ufixed, ap_int and ap_uint (max/min/epsilon) are deleted and re-added with
identical visible content; no functional change]
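To make the collapsed specializations concrete, here is a small check of what they yield for an 8-bit fixed-point type. This is an illustrative program, not from the patch, with the expected values worked out from the bit patterns; it assumes the same ap_types include path as above.

#include <iostream>
#include "ap_fixed.h"
#include "utils/x_hls_utils.h"

int main() {
    // ap_fixed<8, 4>: 8 bits total, 4 integer bits (incl. sign), 4 fractional
    typedef ap_fixed<8, 4> T;
    // max():     0111.1111b =  7.9375  (all bits set except the sign bit)
    // min():     1000.0000b = -8.0     (only the sign bit set)
    // epsilon(): 0000.0001b =  0.0625  (one LSB, i.e. 2^-4)
    std::cout << hls::numeric_limits<T>::max().to_double() << "\n";     // 7.9375
    std::cout << hls::numeric_limits<T>::min().to_double() << "\n";     // -8
    std::cout << hls::numeric_limits<T>::epsilon().to_double() << "\n"; // 0.0625
    return 0;
}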
diff --git a/hls4ml/templates/vivado/build_lib.sh b/hls4ml/templates/vivado/build_lib.sh
index 19f2d0a1c8..68e3538bcb 100755
--- a/hls4ml/templates/vivado/build_lib.sh
+++ b/hls4ml/templates/vivado/build_lib.sh
@@ -1,17 +1,17 @@
[whitespace-only diff: the csim build script, which compiles
firmware/myproject.cpp and myproject_bridge.cpp with g++ -O3 -fPIC -std=c++11
(plus -fno-gnu-unique on Linux) into firmware/myproject-mystamp.so, is
re-committed unchanged]

diff --git a/hls4ml/templates/vivado/firmware/myproject.cpp b/hls4ml/templates/vivado/firmware/myproject.cpp
index 5ba7f118ba..74b58c5cb1 100644
--- a/hls4ml/templates/vivado/firmware/myproject.cpp
+++ b/hls4ml/templates/vivado/firmware/myproject.cpp
@@ -1,23 +1,23 @@
[whitespace-only diff: the top-level firmware template with its
hls-fpga-machine-learning insertion markers (header, IO, load weights, layers)
is re-committed unchanged]
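The firmware template collapsed above is a skeleton: the hls4ml writer replaces each hls-fpga-machine-learning marker with generated code. A hypothetical, self-contained illustration of the filled-in result for a one-layer model follows; every name, the float stand-in types, and the hand-written dense loop are invented for this sketch and do not appear in the patch (real projects use ap_fixed precisions and nnet:: layer functions from the generated parameters.h).

#include <iostream>

// Stand-ins for the generated defines.h / parameters.h (invented for the sketch)
typedef float input_t;
typedef float result_t;
const int N_IN = 4;
const int N_OUT = 2;

void myproject(input_t input[N_IN], result_t output[N_OUT]) {
    // "insert IO" marker: interface/partition pragmas are emitted here.
    // "insert layers" marker: one nnet:: call per layer; a dense layer is
    // spelled out in plain C++ purely for illustration.
    static const result_t w[N_OUT][N_IN] = {{1, 0, 0, 0}, {0, 1, 0, 0}};
    static const result_t b[N_OUT] = {0.5f, -0.5f};
    for (int i = 0; i < N_OUT; i++) {
        result_t acc = b[i];
        for (int j = 0; j < N_IN; j++)
            acc += w[i][j] * input[j];
        output[i] = acc;
    }
}

int main() {
    input_t in[N_IN] = {1, 2, 3, 4};
    result_t out[N_OUT];
    myproject(in, out);
    std::cout << out[0] << " " << out[1] << std::endl; // prints: 1.5 1.5
    return 0;
}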
diff --git a/hls4ml/templates/vivado/firmware/myproject.h b/hls4ml/templates/vivado/firmware/myproject.h
index 5b34ae4c02..a56778976b 100644
--- a/hls4ml/templates/vivado/firmware/myproject.h
+++ b/hls4ml/templates/vivado/firmware/myproject.h
@@ -1,19 +1,19 @@
[whitespace-only diff: the header declaring the myproject() top-level prototype
for C-synthesis is re-committed unchanged]

diff --git a/hls4ml/templates/vivado/myproject_test.cpp b/hls4ml/templates/vivado/myproject_test.cpp
index 814bb1f3e6..29a4c816e5 100644
--- a/hls4ml/templates/vivado/myproject_test.cpp
+++ b/hls4ml/templates/vivado/myproject_test.cpp
@@ -1,94 +1,94 @@
[whitespace-only diff: the C++ test bench -- which reads
tb_data/tb_input_features.dat and tb_data/tb_output_predictions.dat, tokenizes
each line, runs the top-level function, and logs results to csim_results.log or
rtl_cosim_results.log -- is re-committed unchanged]
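The collapsed test bench parses each line of the .dat files with a strtok/atof loop that is easy to misread in diff form. Below is a minimal, runnable extract of just that parsing step, written to mirror the test bench's approach (including its const_cast over the line buffer); the function name is invented for the sketch.

#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Same approach as the test bench: strtok over the line buffer, atof per token.
std::vector<float> parse_line(std::string line) {
    std::vector<float> values;
    char *cstr = const_cast<char *>(line.c_str());
    char *current = strtok(cstr, " ");
    while (current != NULL) {
        values.push_back(atof(current));
        current = strtok(NULL, " ");
    }
    return values;
}

int main() {
    std::vector<float> in = parse_line("0.25 -1.0 3.5");
    std::cout << in.size() << " " << in[2] << std::endl; // prints: 3 3.5
    return 0;
}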
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h
index cfe169f123..6b4e0d4b91 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h
@@ -1,795 +1,795 @@
[whitespace-only diff: the old copy of nnet_activation.h -- activ_config plus
the linear, relu/relu_max, sigmoid, softmax (latency, stable, legacy and argmax
variants with their LUT initializers and dispatch switch), tanh, unary_lut,
hard_sigmoid, hard_tanh, leaky_relu, thresholded_relu, softplus, softsign,
elu, selu, prelu, binary_tanh and ternary_tanh functions -- is deleted
verbatim; identical content is re-added below with normalized line endings]
+#ifndef NNET_ACTIVATION_H_
+#define NNET_ACTIVATION_H_
+
+#include "ap_fixed.h"
+#include "nnet_common.h"
+#include <cmath>
+
+namespace nnet {
+
+struct activ_config {
+    // IO size
+    static const unsigned n_in = 10;
+
+    // Internal info
+    static const unsigned table_size = 1024;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+
+    // Internal data type definitions
+    typedef ap_fixed<18, 8> table_t;
+};
__HLS_SYN__ - bool initialized = false; - typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; -#else - static bool initialized = false; - static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; -#endif - if (!initialized) { - init_selu_table(selu_table); - initialized = true; - } - - #pragma HLS PIPELINE - - data_T datareg; - // Index into the lookup table based on data - int index; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - if (datareg >= 0) { - res[ii] = res_T(1.0507009873554804934193349852946) * datareg; - } else { - index = datareg * CONFIG_T::table_size / -8; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - res[ii] = selu_table[index]; - } - } -} - -// ************************************************* -// PReLU Activation -// ************************************************* -template -void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - #pragma HLS PIPELINE - - data_T datareg; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - if (datareg > 0) - res[ii] = datareg; - else - res[ii] = alpha[ii] * datareg; - } -} - -// ************************************************* -// Binary TanH Activation -// ************************************************* -template -void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - #pragma HLS PIPELINE - - data_T datareg; - res_T cache; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - if (datareg > 0) - cache = 1; - else - cache = -1; - - res[ii] = (res_T)cache; - } -} - -// ************************************************* -// Ternary TanH Activation -// ************************************************* -template -void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - #pragma HLS PIPELINE - - data_T datareg; - res_T cache; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = 2 * data[ii]; - if (datareg > 1) - cache = 1; - else if (datareg > -1 && datareg <= 1) - cache = 0; - else - cache = -1; - - res[ii] = (res_T)cache; - } -} - -} // namespace nnet - -#endif +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +#include "ap_fixed.h" +#include "nnet_common.h" +#include + +namespace nnet { + +struct activ_config { + // IO size + static const unsigned n_in = 10; + + // Internal info + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef ap_fixed<18, 8> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + res[ii] = data[ii]; + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +template +void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg < 0) 
+ res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = MAX_INT; + else + res[ii] = datareg; + } +} + +template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +inline float sigmoid_fcn_float(float input) { return 1.0 / (1 + std::exp(-input)); } + +template void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default logistic sigmoid function: + // result = 1/(1+e^(-x)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = sigmoid_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_sigmoid_table(sigmoid_table); + initialized = true; + } + + #pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)sigmoid_table[index]; + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; + +inline float exp_fcn_float(float input) { return std::exp(input); } + +template inline float softmax_real_val_from_idx(unsigned i) { + // Treat the index as the top N bits + static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + data_T x(0); + x(x.width - 1, x.width - N) = i; + return (float)x; +} + +template inline unsigned softmax_idx_from_real_val(data_T x) { + // Slice the top N bits to get an index into the table + static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + ap_uint y = x(x.width - 1, x.width - N); // slice the top N bits of input + return (unsigned)y(N - 1, 0); +} + +template +void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_size]) { + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + // Slicing bits for address is going to round towards 0, so take the central value + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); + table_out[i] = exp_x; + } +} + +template +void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]) { + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + float x = 
softmax_real_val_from_idx(i); + typename CONFIG_T::inv_table_t inv_x = 1.0 / x; + table_out[i] = inv_x; + } +} + +template +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + unsigned x = softmax_idx_from_real_val(data[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. + // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + data_T x_max = reduce>(data, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + ap_fixed d_xi_xmax[CONFIG_T::n_in]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + #pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + unsigned x = softmax_idx_from_real_val(d_xi_xmax[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
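+    // Editor's illustration (hedged; not part of the original kernel): for
+    // CONFIG_T::n_in = 4 the reduce<> helper from nnet_common.h expands to
+    //     op_add(op_add(exp_res[0], exp_res[1]), op_add(exp_res[2], exp_res[3]))
+    // i.e. an adder depth of ceil(log2(n_in)) rather than a chain of n_in - 1 adders.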
+ // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS unroll + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template void init_exp_table_legacy(typename CONFIG_T::exp_table_t table_out[N_TABLE]) { + float exp_range = (float)CONFIG_T::exp_range; + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::exp_table_t real_val = exp_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template void init_invert_table_legacy(typename CONFIG_T::inv_table_t table_out[N_TABLE]) { + float inv_range = (float)CONFIG_T::inv_range; + // Inversion function: + // result = 1/x + for (int ii = 0; ii < N_TABLE; ii++) { + float in_val = inv_range * ii / float(N_TABLE); + if (in_val > 0.0) + table_out[ii] = 1.0 / in_val; + else + table_out[ii] = 0.0; + } +} + +template +void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS pipeline + int exp_range = CONFIG_T::exp_range; + int inv_range = CONFIG_T::inv_range; + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_exp_table_legacy(exp_table); + init_invert_table_legacy(invert_table); + initialized = true; + } + + // Index into the lookup table based on data for exponentials + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision + typename CONFIG_T::exp_table_t exp_diff_res; // different, independent, fixed point precision + typename CONFIG_T::exp_table_t data_cache[CONFIG_T::n_in]; + int data_round; + int index; + + // std::cout << "input to SM: " << std::endl; ///// + // nnet::print_result(data, std::cout); ///// + // std::cout << " " << std::endl; ///// + + #pragma HLS array_partition variable=data_cache complete + + typename CONFIG_T::accum_t denominator; + typename CONFIG_T::inv_table_t deno_inver; + + denominator = 0; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * (CONFIG_T::table_size / (exp_range * 2)); + // std::cout << " data, round: " << data[ii] << " " << data_round << std::endl; ///// + index = data_round + exp_range * (CONFIG_T::table_size / (exp_range * 2)); + // std::cout << " index: " << index; ///// + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + denominator += exp_table[index]; + // std::cout << " denominator " << index << std::endl; ///// + // std::cout << " denominator " << denominator << std::endl; ///// + data_cache[ii] = exp_table[index]; + } + // std::cout << "end " << std::endl; ///// + + // using lookup table for inverse + int exp_res_index = denominator * (CONFIG_T::table_size / inv_range); + + // std::cout << " denominator: " << 
denominator << std::endl; /////
+    // std::cout << "  table_size: " << CONFIG_T::table_size << std::endl; /////
+    // std::cout << "  inv_range: " << inv_range << std::endl; /////
+    // std::cout << "  exp_res_index: " << exp_res_index << std::endl; /////
+    if (exp_res_index < 0)
+        exp_res_index = 0;
+    if (exp_res_index > CONFIG_T::table_size - 1)
+        exp_res_index = CONFIG_T::table_size - 1;
+    deno_inver = invert_table[exp_res_index];
+    // std::cout << "  deno_inver: " << deno_inver << std::endl; /////
+
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        res[ii] = (res_T)(data_cache[ii] * deno_inver);
+    }
+
+    // std::cout << "out SM: " << std::endl;
+    // nnet::print_result(res, std::cout);
+    // std::cout << " " << std::endl;
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    #pragma HLS inline
+    switch (CONFIG_T::implementation) {
+    case softmax_implementation::latency:
+        softmax_latency<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    case softmax_implementation::stable:
+        softmax_stable<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    case softmax_implementation::legacy:
+        softmax_legacy<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    case softmax_implementation::argmax:
+        softmax_argmax<data_T, res_T, CONFIG_T>(data, res);
+        break;
+    }
+}
+
+// *************************************************
+//       TanH Activation
+// *************************************************
+template <typename CONFIG_T, int N_TABLE> void init_tanh_table(typename CONFIG_T::table_t table_out[N_TABLE]) {
+    // Implement tanh lookup
+    for (int ii = 0; ii < N_TABLE; ii++) {
+        // First, convert from table index to X-value (signed 8-bit, range -4 to +4)
+        float in_val = 2 * 4.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE);
+        // Next, compute lookup table function
+        typename CONFIG_T::table_t real_val = tanh(in_val);
+        // std::cout << "Tanh:  Lookup table Index: " << ii << " In Value: " << in_val << " Result: " << real_val <<
+        // std::endl;
+        table_out[ii] = real_val;
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T> void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    // Initialize the lookup table
+#ifdef __HLS_SYN__
+    bool initialized = false;
+    typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size];
+#else
+    static bool initialized = false;
+    static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size];
+#endif
+    if (!initialized) {
+        init_tanh_table<CONFIG_T, CONFIG_T::table_size>(tanh_table);
+        initialized = true;
+    }
+
+    #pragma HLS PIPELINE
+
+    // Index into the lookup table based on data
+    int data_round;
+    int index;
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        data_round = data[ii] * CONFIG_T::table_size / 8;
+        index = data_round + 4 * CONFIG_T::table_size / 8;
+        // std::cout << "Input: " << data[ii] << " Round: " << data_round << " Index: " << index << std::endl;
+        if (index < 0)
+            index = 0;
+        if (index > CONFIG_T::table_size - 1)
+            index = CONFIG_T::table_size - 1;
+        res[ii] = (res_T)tanh_table[index];
+    }
+}
+
+// *************************************************
+//       UnaryLUT Activation
+// *************************************************
+template <int table_size, class data_T> inline unsigned get_index_unary_lut(data_T x) {
+    // Slice the top N bits to get an index into the table
+    static constexpr int N = ceillog2(table_size);
+    return (unsigned)(x(x.width - 1, 0));
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void unary_lut(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
+               typename CONFIG_T::table_t table[CONFIG_T::table_size]) {
+    #pragma HLS function_instantiate variable=table
+    #pragma HLS ARRAY_PARTITION variable=table
+
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        #pragma HLS UNROLL
+        unsigned index = get_index_unary_lut<CONFIG_T::table_size>(data[ii]);
+        res[ii] = (res_T)table[index];
+    }
+}
+
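+// ---------------------------------------------------------------------------
+// Editor's usage sketch (illustrative only; `my_softmax_config`, `input_t` and
+// `result_t` are made-up names, not part of this header). A caller selects an
+// implementation through the config struct and the softmax<> dispatcher above:
+//
+//   struct my_softmax_config : nnet::activ_config {
+//       static const unsigned n_in = 10;
+//       typedef ap_fixed<18, 8> exp_table_t;
+//       typedef ap_fixed<18, 8> inv_table_t;
+//       static const nnet::softmax_implementation implementation =
+//           nnet::softmax_implementation::stable;
+//   };
+//   nnet::softmax<input_t, result_t, my_softmax_config>(data, res);
+// ---------------------------------------------------------------------------
+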
+// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + if (CONFIG_T::io_type == io_parallel) { + #pragma HLS PIPELINE + } + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +inline float softplus_fcn_float(float input) { return std::log(std::exp(input) + 1.); } + +template void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default softplus function: + // result = log(exp(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softplus_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + + #pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +inline float softsign_fcn_float(float input) { return 
input / (std::abs(input) + 1.); } + +template void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default softsign function: + // result = x / (abs(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softsign_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + + #pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softsign_table[index]; + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +inline float elu_fcn_float(float input) { return std::exp(input) - 1.; } + +template void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default ELU function: + // result = alpha * (e^(x) - 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = elu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_elu_table(elu_table); + initialized = true; + } + + #pragma HLS PIPELINE + + data_T datareg; + // Index into the lookup table based on data + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + elu(data, 1.0, res); +} + +// ************************************************* +// SELU Activation +// ************************************************* +inline float selu_fcn_float(float input) { + return 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (std::exp(input) - 1.)); +} + +template void init_selu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default SELU function: 
+ // result = 1.05 * (1.673 * (e^(x) - 1)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = selu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + + #pragma HLS PIPELINE + + data_T datareg; + // Index into the lookup table based on data + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = res_T(1.0507009873554804934193349852946) * datareg; + } else { + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + #pragma HLS PIPELINE + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = 2 * data[ii]; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_array.h b/hls4ml/templates/vivado/nnet_utils/nnet_array.h index d179102a99..de1b46c858 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_array.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_array.h @@ -1,52 +1,52 @@ -#ifndef NNET_ARRAY_H_ -#define NNET_ARRAY_H_ - -#include - -namespace nnet { - -struct transpose_config { - static const unsigned height = 10; - static const unsigned width = 10; - static const unsigned depth = 10; - static constexpr unsigned perm[3] = {2, 0, 1}; -}; - -template -void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[CONFIG_T::height * CONFIG_T::width]) { - #pragma HLS PIPELINE - - for (int i = 0; i < CONFIG_T::height; i++) { - for (int j = 0; j < CONFIG_T::width; j++) { - data_t[j * CONFIG_T::height + i] = data[i * 
CONFIG_T::width + j]; - } - } -} - -template -void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], - res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) { - unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; - unsigned dims_t[3]; - dims_t[0] = dims[CONFIG_T::perm[0]]; - dims_t[1] = dims[CONFIG_T::perm[1]]; - dims_t[2] = dims[CONFIG_T::perm[2]]; - - int idx[3] = {0}, idx_t[3] = {0}; - for (idx[0] = 0; idx[0] < dims[0]; idx[0]++) { - for (idx[1] = 0; idx[1] < dims[1]; idx[1]++) { - for (idx[2] = 0; idx[2] < dims[2]; idx[2]++) { - idx_t[0] = idx[CONFIG_T::perm[0]]; - idx_t[1] = idx[CONFIG_T::perm[1]]; - idx_t[2] = idx[CONFIG_T::perm[2]]; - - data_t[idx_t[0] * dims_t[1] * dims_t[2] + idx_t[1] * dims_t[2] + idx_t[2]] = - data[idx[0] * dims[1] * dims[2] + idx[1] * dims[2] + idx[2]]; - } - } - } -} - -} // namespace nnet - -#endif +#ifndef NNET_ARRAY_H_ +#define NNET_ARRAY_H_ + +#include + +namespace nnet { + +struct transpose_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned depth = 10; + static constexpr unsigned perm[3] = {2, 0, 1}; +}; + +template +void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[CONFIG_T::height * CONFIG_T::width]) { + #pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::height; i++) { + for (int j = 0; j < CONFIG_T::width; j++) { + data_t[j * CONFIG_T::height + i] = data[i * CONFIG_T::width + j]; + } + } +} + +template +void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], + res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) { + unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; + unsigned dims_t[3]; + dims_t[0] = dims[CONFIG_T::perm[0]]; + dims_t[1] = dims[CONFIG_T::perm[1]]; + dims_t[2] = dims[CONFIG_T::perm[2]]; + + int idx[3] = {0}, idx_t[3] = {0}; + for (idx[0] = 0; idx[0] < dims[0]; idx[0]++) { + for (idx[1] = 0; idx[1] < dims[1]; idx[1]++) { + for (idx[2] = 0; idx[2] < dims[2]; idx[2]++) { + idx_t[0] = idx[CONFIG_T::perm[0]]; + idx_t[1] = idx[CONFIG_T::perm[1]]; + idx_t[2] = idx[CONFIG_T::perm[2]]; + + data_t[idx_t[0] * dims_t[1] * dims_t[2] + idx_t[1] * dims_t[2] + idx_t[2]] = + data[idx[0] * dims[1] * dims[2] + idx[1] * dims[2] + idx[2]]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h index d8be45b73e..3a029fe860 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h @@ -1,124 +1,124 @@ -#ifndef NNET_BATCHNORM_H_ -#define NNET_BATCHNORM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_dense.h" -#include - -namespace nnet { - -struct batchnorm_config { - // Internal data type definitions - typedef float bias_t; - typedef float scale_t; - - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_filt = -1; - static const unsigned n_scale_bias = 10; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; - // partitioning arrays cyclically to go with roll factors? 
- template using product = nnet::product::mult; -}; - -template -void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], - typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], - typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { - data_T cache; - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=scale,bias - - // For parallel inputs: - // - completely partition arrays -- target fabric - // - if we have an unroll factor, limit number of multipliers - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes - #pragma HLS ARRAY_PARTITION variable=scale complete - #pragma HLS ARRAY_PARTITION variable=bias complete - - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - -// Calcuate result -Result: - for (int ires = 0; ires < CONFIG_T::n_in; ires++) { - if (CONFIG_T::n_filt == -1) { - res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + - bias[ires]; - } else { - int norm_index = ires % CONFIG_T::n_filt; - res[ires] = - CONFIG_T::template product::product(data[ires], scale[norm_index]) + - bias[norm_index]; - } - } -} - -// **************************************************** -// Merged Batch Normalization and Quantized Tanh -// **************************************************** -struct batchnorm_quantized_tanh_config { - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_filt = -1; - static const unsigned n_scale_bias = 10; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const unsigned n_zeros = 0; -}; - -template -void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ap_uint<1> res[CONFIG_T::n_in], - data_T threshold[CONFIG_T::n_scale_bias]) { - #pragma HLS PIPELINE - #pragma HLS ARRAY_PARTITION variable=res complete - - data_T datareg; - ap_uint<1> cache; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; - if (datareg >= threshold[norm_index]) - cache = 1; - else - cache = 0; - - res[ii] = cache; - } -} - -template -void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ap_int<2> res[CONFIG_T::n_in], - data_T threshold_hi[CONFIG_T::n_scale_bias], data_T threshold_lo[CONFIG_T::n_scale_bias]) { - #pragma HLS PIPELINE - #pragma HLS ARRAY_PARTITION variable=res complete - - data_T datareg; - ap_int<2> cache; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - int norm_index = CONFIG_T::n_filt == -1 ? 
ii : ii % CONFIG_T::n_filt;
-        if (datareg > threshold_hi[norm_index])
-            cache = 1;
-        else if (datareg <= threshold_lo[norm_index])
-            cache = -1;
-        else
-            cache = 0;
-
-        res[ii] = cache;
-    }
-}
-
-} // namespace nnet
-
-#endif
+#ifndef NNET_BATCHNORM_H_
+#define NNET_BATCHNORM_H_
+
+#include "hls_stream.h"
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include <math.h>
+
+namespace nnet {
+
+struct batchnorm_config {
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float scale_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0;
+    // partitioning arrays cyclically to go with roll factors?
+    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
+               typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
+               typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) {
+    data_T cache;
+
+    // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
+    #pragma HLS function_instantiate variable=scale,bias
+
+    // For parallel inputs:
+    //   - completely partition arrays -- target fabric
+    //   - if we have an unroll factor, limit number of multipliers
+    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+
+    // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes
+    #pragma HLS ARRAY_PARTITION variable=scale complete
+    #pragma HLS ARRAY_PARTITION variable=bias complete
+
+    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit
+
+// Calculate result
+Result:
+    for (int ires = 0; ires < CONFIG_T::n_in; ires++) {
+        if (CONFIG_T::n_filt == -1) {
+            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[ires]) +
+                        bias[ires];
+        } else {
+            int norm_index = ires % CONFIG_T::n_filt;
+            res[ires] =
+                CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[norm_index]) +
+                bias[norm_index];
+        }
+    }
+}
+
+// ****************************************************
+//       Merged Batch Normalization and Quantized Tanh
+// ****************************************************
+struct batchnorm_quantized_tanh_config {
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const unsigned n_zeros = 0;
+};
+
+template <class data_T, typename CONFIG_T>
+void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ap_uint<1> res[CONFIG_T::n_in],
+                           data_T threshold[CONFIG_T::n_scale_bias]) {
+    #pragma HLS PIPELINE
+    #pragma HLS ARRAY_PARTITION variable=res complete
+
+    data_T datareg;
+    ap_uint<1> cache;
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        datareg = data[ii];
+        int norm_index = CONFIG_T::n_filt == -1 ?
ii : ii % CONFIG_T::n_filt; + if (datareg >= threshold[norm_index]) + cache = 1; + else + cache = 0; + + res[ii] = cache; + } +} + +template +void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ap_int<2> res[CONFIG_T::n_in], + data_T threshold_hi[CONFIG_T::n_scale_bias], data_T threshold_lo[CONFIG_T::n_scale_bias]) { + #pragma HLS PIPELINE + #pragma HLS ARRAY_PARTITION variable=res complete + + data_T datareg; + ap_int<2> cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h index a064677d06..0cd9565fb5 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h @@ -1,123 +1,123 @@ -#ifndef NNET_BATCHNORM_STREAM_H_ -#define NNET_BATCHNORM_STREAM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_mult.h" -#include "nnet_types.h" - -namespace nnet { - -// **************************************************** -// Streaming Batch Normalization -// **************************************************** - -template -void normalize(hls::stream &data, hls::stream &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], - typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { - #pragma HLS ARRAY_PARTITION variable=scale complete - #pragma HLS ARRAY_PARTITION variable=bias complete - - constexpr unsigned ii = CONFIG_T::n_in / CONFIG_T::multiplier_limit; - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - -BatchNormLoop: - for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { - #pragma HLS PIPELINE II=ii - - data_T in_data = data.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - BatchNormpack: - for (int j = 0; j < data_T::size; j++) { - #pragma HLS UNROLL - int norm_index; - if (CONFIG_T::n_filt == -1) { - norm_index = i * data_T::size + j; - } else { - norm_index = j % CONFIG_T::n_filt; - } - out_data[j] = CONFIG_T::template product::product( - in_data[j], scale[norm_index]) + - bias[norm_index]; - } - - res.write(out_data); - } -} - -// **************************************************** -// Merged Batch Normalization and Quantized Tanh -// **************************************************** -template -void normalize_binary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, - typename data_T::value_type threshold[CONFIG_T::n_scale_bias]) { - #pragma HLS ARRAY_PARTITION variable=threshold complete - -BinaryNormLoop: - for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { - #pragma HLS PIPELINE - - data_T in_data = data.read(); - nnet::array, CONFIG_T::n_scale_bias> out_data; - PRAGMA_DATA_PACK(out_data) - - BatchNormPack: - for (int j = 0; j < data_T::size; j++) { - #pragma HLS UNROLL - int norm_index; - if (CONFIG_T::n_filt == -1) { - norm_index = i * data_T::size + j; - } else { - norm_index = j % CONFIG_T::n_filt; - } - out_data[j] = (in_data[j] >= threshold[norm_index]) ? 
1 : 0; - } - - res.write(out_data); - } -} - -template -void normalize_ternary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, - typename data_T::value_type threshold_hi[CONFIG_T::n_scale_bias], - typename data_T::value_type threshold_lo[CONFIG_T::n_scale_bias]) { - #pragma HLS ARRAY_PARTITION variable=threshold_hi complete - #pragma HLS ARRAY_PARTITION variable=threshold_lo complete - -TernaryNormLoop: - for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { - #pragma HLS PIPELINE - - data_T in_data = data.read(); - nnet::array, CONFIG_T::n_scale_bias> out_data; - PRAGMA_DATA_PACK(out_data) - - BatchNormPack: - for (int j = 0; j < data_T::size; j++) { - #pragma HLS UNROLL - - int norm_index; - if (CONFIG_T::n_filt == -1) { - norm_index = i * data_T::size + j; - } else { - norm_index = j % CONFIG_T::n_filt; - } - - if (in_data[j] > threshold_hi[norm_index]) { - out_data[j] = 1; - } else if (in_data[j] <= threshold_lo[norm_index]) { - out_data[j] = -1; - } else { - out_data[j] = 0; - } - } - - res.write(out_data); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_BATCHNORM_STREAM_H_ +#define NNET_BATCHNORM_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include "nnet_types.h" + +namespace nnet { + +// **************************************************** +// Streaming Batch Normalization +// **************************************************** + +template +void normalize(hls::stream &data, hls::stream &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + #pragma HLS ARRAY_PARTITION variable=scale complete + #pragma HLS ARRAY_PARTITION variable=bias complete + + constexpr unsigned ii = CONFIG_T::n_in / CONFIG_T::multiplier_limit; + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +BatchNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE II=ii + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormpack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + out_data[j] = CONFIG_T::template product::product( + in_data[j], scale[norm_index]) + + bias[norm_index]; + } + + res.write(out_data); + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +template +void normalize_binary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, + typename data_T::value_type threshold[CONFIG_T::n_scale_bias]) { + #pragma HLS ARRAY_PARTITION variable=threshold complete + +BinaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + out_data[j] = (in_data[j] >= threshold[norm_index]) ? 
1 : 0; + } + + res.write(out_data); + } +} + +template +void normalize_ternary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, + typename data_T::value_type threshold_hi[CONFIG_T::n_scale_bias], + typename data_T::value_type threshold_lo[CONFIG_T::n_scale_bias]) { + #pragma HLS ARRAY_PARTITION variable=threshold_hi complete + #pragma HLS ARRAY_PARTITION variable=threshold_lo complete + +TernaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + + if (in_data[j] > threshold_hi[norm_index]) { + out_data[j] = 1; + } else if (in_data[j] <= threshold_lo[norm_index]) { + out_data[j] = -1; + } else { + out_data[j] = 0; + } + } + + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index fed0395a1a..7a65548bed 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -1,75 +1,75 @@ -#ifndef NNET_COMMON_H_ -#define NNET_COMMON_H_ - -#include "ap_fixed.h" - -// This is a substitute for "ceil(n/(float)d)". -#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) -#define MIN(n, d) (n > d ? d : n) -#define MAX(n, d) (n > d ? n : d) - -#define STRINGIFY(x) #x -#define EXPAND_STRING(x) STRINGIFY(x) - -#ifndef __VITIS_HLS__ -#define DATA_PACK_TXT HLS DATA_PACK variable = -#define DATA_PACK_PRAGMA(variable) DATA_PACK_TXT variable -#define PRAGMA_DATA_PACK(variable) _Pragma(EXPAND_STRING(DATA_PACK_PRAGMA(variable))) -#else -#define PRAGMA_DATA_PACK(variable) -#endif - -namespace nnet { - -// Common type definitions -enum io_type { io_parallel = 0, io_stream }; -enum strategy { latency, resource }; - -/* --- - * Balanced tree reduce implementation. - * For use in scenarios where Vivado cannot expression balance - * Reduces an array of inputs to a single value using the template binary operator 'Op', - * for example summing all elements with Op_add, or finding the maximum with Op_max - * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section - * before applying and accumulate the result over the rolled dimension. - * --- */ -template T reduce(const T *x, Op op) { - static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; - static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; - if (N == 1) { - return x[0]; - } - if (N == 2) { - return op(x[0], x[1]); - } - return op(reduce(x, op), reduce(x + leftN, op)); -} - -template class Op_add { - public: - T operator()(T a, T b) { return a + b; } -}; - -template class Op_and { - public: - T operator()(T a, T b) { return a && b; } -}; - -template class Op_or { - public: - T operator()(T a, T b) { return a || b; } -}; - -template class Op_max { - public: - T operator()(T a, T b) { return a >= b ? a : b; } -}; - -template class Op_min { - public: - T operator()(T a, T b) { return a <= b ? a : b; } -}; - -} // namespace nnet - -#endif +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + +#include "ap_fixed.h" + +// This is a substitute for "ceil(n/(float)d)". +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? 
d : n) +#define MAX(n, d) (n > d ? n : d) + +#define STRINGIFY(x) #x +#define EXPAND_STRING(x) STRINGIFY(x) + +#ifndef __VITIS_HLS__ +#define DATA_PACK_TXT HLS DATA_PACK variable = +#define DATA_PACK_PRAGMA(variable) DATA_PACK_TXT variable +#define PRAGMA_DATA_PACK(variable) _Pragma(EXPAND_STRING(DATA_PACK_PRAGMA(variable))) +#else +#define PRAGMA_DATA_PACK(variable) +#endif + +namespace nnet { + +// Common type definitions +enum io_type { io_parallel = 0, io_stream }; +enum strategy { latency, resource }; + +/* --- + * Balanced tree reduce implementation. + * For use in scenarios where Vivado cannot expression balance + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ +template T reduce(const T *x, Op op) { + static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + if (N == 1) { + return x[0]; + } + if (N == 2) { + return op(x[0], x[1]); + } + return op(reduce(x, op), reduce(x + leftN, op)); +} + +template class Op_add { + public: + T operator()(T a, T b) { return a + b; } +}; + +template class Op_and { + public: + T operator()(T a, T b) { return a && b; } +}; + +template class Op_or { + public: + T operator()(T a, T b) { return a || b; } +}; + +template class Op_max { + public: + T operator()(T a, T b) { return a >= b ? a : b; } +}; + +template class Op_min { + public: + T operator()(T a, T b) { return a <= b ? a : b; } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h index e2e0211b49..56617a4159 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h @@ -1,66 +1,66 @@ -#ifndef NNET_CONV1D_H_ -#define NNET_CONV1D_H_ - -#include "nnet_common.h" -#include "nnet_conv1d_latency.h" -#include "nnet_conv1d_resource.h" -#include - -namespace nnet { - -struct conv1d_config { - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; - - // Convolutional parameters - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const unsigned in_width = 10; - static const unsigned n_chan = 0; - static const unsigned filt_width = 1; - static const unsigned kernel_size = filt_width; - static const unsigned n_filt = 1; - static const unsigned stride_width = 1; - static const unsigned dilation = 1; - static const unsigned out_width = 10; //(N_IN + PAD_LEFT * PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1 - - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; // not used yet -}; - -template -void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS INLINE region - - if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); - } else { - conv_1d_resource_cl(data, res, weights, biases); - } -} - -template -void pointwise_conv_1d_cl(data_T 
data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - - #pragma HLS INLINE region - - // Nothing special to be done for io_parallel implementation - if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); - } else { - conv_1d_resource_cl(data, res, weights, biases); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_CONV1D_H_ +#define NNET_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d_latency.h" +#include "nnet_conv1d_resource.h" +#include + +namespace nnet { + +struct conv1d_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Convolutional parameters + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_width = 10; + static const unsigned n_chan = 0; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + static const unsigned out_width = 10; //(N_IN + PAD_LEFT * PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1 + + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; // not used yet +}; + +template +void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE region + + if (CONFIG_T::strategy == nnet::latency) { + conv_1d_latency_cl(data, res, weights, biases); + } else { + conv_1d_resource_cl(data, res, weights, biases); + } +} + +template +void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + #pragma HLS INLINE region + + // Nothing special to be done for io_parallel implementation + if (CONFIG_T::strategy == nnet::latency) { + conv_1d_latency_cl(data, res, weights, biases); + } else { + conv_1d_resource_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h index b23c330c78..431d3aa28d 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h @@ -1,89 +1,89 @@ -#ifndef NNET_CONV1D_STREAM_H_ -#define NNET_CONV1D_STREAM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_conv_stream.h" - -namespace nnet { - -template -void compute_scaled_indices_1d(const unsigned w_idx, ap_uint *pixel_idx) { - unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); - -ComputeIndex: - for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { - #pragma HLS UNROLL - unsigned sw_idx = - CONFIG_T::template scale_index::scale_index( - wp_idx + p); - pixel_idx[p] = CONFIG_T::pixels[sw_idx]; - } -} - -template -void conv_1d_encoded_cl(hls::stream &data, hls::stream &res, - 
typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); - - hls::stream data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; - const int win_depth = CONFIG_T::out_width; - for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { - #pragma HLS STREAM variable=data_window[i_out] depth=win_depth - } - - #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete - - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - unsigned outputs_ready = 0; - - ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; - #pragma HLS ARRAY_PARTITION variable=pixel_idx complete - -ReadInputWidth: - for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { - #pragma HLS LOOP_FLATTEN - if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - } - compute_scaled_indices_1d(i_iw, pixel_idx); - compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, - biases, pixel_idx); - } -} - -template -void conv_1d_buffer_cl(hls::stream &data, hls::stream &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); - -ReadInputWidth: - for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { - #pragma HLS LOOP_FLATTEN - if (CONFIG_T::strategy == nnet::latency) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - } - compute_output_buffer_1d(data.read(), res, weights, biases); - } -} - -template -void conv_1d_cl(hls::stream &data, hls::stream &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS inline recursive - switch (CONFIG_T::implementation) { - case conv_implementation::linebuffer: - conv_1d_buffer_cl(data, res, weights, biases); - break; - case conv_implementation::encoded: - conv_1d_encoded_cl(data, res, weights, biases); - break; - } -} - -} // namespace nnet -#endif +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" + +namespace nnet { + +template +void compute_scaled_indices_1d(const unsigned w_idx, ap_uint *pixel_idx) { + unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); + +ComputeIndex: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + #pragma HLS UNROLL + unsigned sw_idx = + CONFIG_T::template scale_index::scale_index( + wp_idx + p); + pixel_idx[p] = CONFIG_T::pixels[sw_idx]; + } +} + +template +void conv_1d_encoded_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + hls::stream data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; + const int win_depth = CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + 
ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=pixel_idx complete + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_1d(i_iw, pixel_idx); + compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, + biases, pixel_idx); + } +} + +template +void conv_1d_buffer_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_output_buffer_1d(data.read(), res, weights, biases); + } +} + +template +void conv_1d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + conv_1d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + conv_1d_encoded_cl(data, res, weights, biases); + break; + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h index 71a88f4483..c7c4158c6a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h @@ -1,75 +1,75 @@ -#ifndef NNET_CONV2D_H_ -#define NNET_CONV2D_H_ - -#include "nnet_common.h" -#include "nnet_conv2d_latency.h" -#include "nnet_conv2d_resource.h" -#include - -namespace nnet { - -struct conv2d_config { - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; - - // Convolutional parameters - static const unsigned pad_top = 0; - static const unsigned pad_bottom = 0; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const unsigned in_height = 10; - static const unsigned in_width = 10; - static const unsigned n_chan = 1; - static const unsigned filt_height = 1; - static const unsigned filt_width = 1; - static const unsigned kernel_size = filt_height * filt_width; - static const unsigned n_filt = 1; - static const unsigned stride_height = 1; - static const unsigned stride_width = 1; - static const unsigned out_height = 10; - static const unsigned out_width = 10; - static const unsigned dilation_height = 1; - static const unsigned dilation_width = 1; - - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; // not used yet -}; - -template -void conv_2d_cl( - data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS INLINE region - - if (CONFIG_T::strategy == 
nnet::latency) { - conv_2d_latency_cl(data, res, weights, biases); - } else { - conv_2d_resource_cl(data, res, weights, biases); - } -} - -template -void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - - #pragma HLS INLINE region - - // Nothing special to be done for io_parallel implementation - if (CONFIG_T::strategy == nnet::latency) { - conv_2d_latency_cl(data, res, weights, biases); - } else { - conv_2d_resource_cl(data, res, weights, biases); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_CONV2D_H_ +#define NNET_CONV2D_H_ + +#include "nnet_common.h" +#include "nnet_conv2d_latency.h" +#include "nnet_conv2d_resource.h" +#include + +namespace nnet { + +struct conv2d_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Convolutional parameters + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_chan = 1; + static const unsigned filt_height = 1; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_height * filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_height = 1; + static const unsigned stride_width = 1; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + static const unsigned dilation_height = 1; + static const unsigned dilation_width = 1; + + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; // not used yet +}; + +template +void conv_2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE region + + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_latency_cl(data, res, weights, biases); + } else { + conv_2d_resource_cl(data, res, weights, biases); + } +} + +template +void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + #pragma HLS INLINE region + + // Nothing special to be done for io_parallel implementation + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_latency_cl(data, res, weights, biases); + } else { + conv_2d_resource_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h index 5114af7825..3a481711db 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h @@ -1,89 +1,89 @@ -#ifndef NNET_CONV2D_LATENCY_H_ -#define 
NNET_CONV2D_LATENCY_H_ - -#include "nnet_common.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -template -void conv_2d_latency_cl( - data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; - constexpr unsigned mult_n_out = CONFIG_T::n_filt; - - data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; - #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 - - typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out]; - #pragma HLS ARRAY_PARTITION variable=mult complete - - typename CONFIG_T::accum_t acc[mult_n_out]; - #pragma HLS ARRAY_PARTITION variable=acc complete - - #pragma HLS ARRAY_PARTITION variable=weights complete - #pragma HLS ARRAY_PARTITION variable=biases complete - - // Limit multipliers to control parallelization - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit - -PartitionLoop: - for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind - - CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); - - PixelLoop: - for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { - #pragma HLS UNROLL - - data_T cache; - - // Do the matrix-multiply - Product1: - for (int i_in = 0; i_in < mult_n_in; i_in++) { - #pragma HLS UNROLL - cache = data_buf[i_pxl][i_in]; - Product2: - for (int i_out = 0; i_out < mult_n_out; i_out++) { - #pragma HLS UNROLL - mult[i_in * mult_n_out + i_out] = - CONFIG_T::mult_config::template product::product( - cache, weights[i_in * mult_n_out + i_out]); - } - } - - // Initialize accumulator with input biases - ResetAccum: - for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { - #pragma HLS UNROLL - acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; - } - - // Accumulate multiplication result - Accum1: - for (int i_in = 0; i_in < mult_n_in; i_in++) { - #pragma HLS UNROLL - Accum2: - for (int i_out = 0; i_out < mult_n_out; i_out++) { - #pragma HLS UNROLL - acc[i_out] += mult[i_in * mult_n_out + i_out]; - } - } - - // Cast to "res_t" type - Result: - for (int i_res = 0; i_res < mult_n_out; i_res++) { - #pragma HLS UNROLL - *(res++) = cast(acc[i_res]); - } - } - } -} - -} // namespace nnet -#endif +#ifndef NNET_CONV2D_LATENCY_H_ +#define NNET_CONV2D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void conv_2d_latency_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + typename CONFIG_T::accum_t acc[mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc 
complete + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +PartitionLoop: + for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + data_T cache; + + // Do the matrix-multiply + Product1: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + #pragma HLS UNROLL + cache = data_buf[i_pxl][i_in]; + Product2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + #pragma HLS UNROLL + mult[i_in * mult_n_out + i_out] = + CONFIG_T::mult_config::template product::product( + cache, weights[i_in * mult_n_out + i_out]); + } + } + + // Initialize accumulator with input biases + ResetAccum: + for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + + // Accumulate multiplication result + Accum1: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + #pragma HLS UNROLL + Accum2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + #pragma HLS UNROLL + acc[i_out] += mult[i_in * mult_n_out + i_out]; + } + } + + // Cast to "res_t" type + Result: + for (int i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_res]); + } + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h index 5796e123e7..19cd5a63bb 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h @@ -1,57 +1,60 @@ -#ifndef NNET_DENSE_H_ -#define NNET_DENSE_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_dense_latency.h" -#include "nnet_dense_resource.h" -#include "nnet_dense_seq.h" -#include "nnet_helpers.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -struct dense_config { - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; - - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_out = 10; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned strategy = latency; - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; - // partitioning arrays cyclically to go with roll factors? 
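
For reference: dense_config above only supplies template defaults. The generated HLS project overrides them with one concrete struct per layer and passes that struct as CONFIG_T. A minimal sketch of such a specialization follows; the widths, sizes, and the name dense_config_2 are illustrative, and seq_len is assumed here only because the patched nnet::dense() below reads CONFIG_T::seq_len.

    #include "ap_fixed.h"
    #include "nnet_dense.h"

    // Hypothetical generated config; real ones are emitted by the hls4ml backend.
    struct dense_config_2 : nnet::dense_config {
        // override the float defaults with fixed-point types
        typedef ap_fixed<16, 6> bias_t;
        typedef ap_fixed<16, 6> weight_t;
        typedef ap_fixed<32, 12> accum_t;
        // layer geometry and reuse
        static const unsigned n_in = 64;
        static const unsigned n_out = 32;
        static const unsigned seq_len = 1; // assumed member; > 1 would route dense() to dense_seq()
        static const unsigned reuse_factor = 4;
    };
    // usage: nnet::dense<input_t, result_t, dense_config_2>(data, res, w2, b2);
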
-    // Product function to use
-    template<class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
-};
-
-template<class data_T, class res_T, typename CONFIG_T>
-void dense(
-    data_T data[CONFIG_T::n_in*CONFIG_T::seq_len],
-    res_T res[CONFIG_T::n_out*CONFIG_T::seq_len],
-    typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out],
-    typename CONFIG_T::bias_t biases[CONFIG_T::n_out])
-{
-    #pragma HLS inline
-    if (CONFIG_T::seq_len > 1) {
-        dense_seq<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    } else{
-        if (CONFIG_T::strategy == nnet::latency) {
-            dense_latency<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-        } else {
-            dense_resource<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-        }
-    }
-}
-
-}
-
-#endif
+#ifndef NNET_DENSE_H_
+#define NNET_DENSE_H_
+
+#include "hls_stream.h"
+#include "nnet_common.h"
+#include "nnet_dense_latency.h"
+#include "nnet_dense_resource.h"
+#include "nnet_dense_seq.h"
+#include "nnet_helpers.h"
+#include "nnet_mult.h"
+#include <math.h>
+
+namespace nnet {
+
+struct dense_config {
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float weight_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_out = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned strategy = latency;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0;
+    // partitioning arrays cyclically to go with roll factors?
+    // Product function to use
+    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void dense(data_T data[CONFIG_T::n_in * CONFIG_T::seq_len], res_T res[CONFIG_T::n_out * CONFIG_T::seq_len],
+           typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
+           typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
+    #pragma HLS inline
+    if (CONFIG_T::seq_len > 1) {
+        dense_seq<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else {
+        if (CONFIG_T::strategy == nnet::latency) {
+            dense_latency<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+        } else {
+            dense_resource<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+        }
+    }
+
+    // std::cout << "out Dense: " << std::endl;
+    // for(int i=0; i < CONFIG_T::n_out*CONFIG_T::seq_len; ++i) {
+    //     std::cout << res[i] << " ";
+    // }
+    // std::cout << std::endl;
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h
index 029b74803b..5494ab4e36 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h
@@ -1,90 +1,90 @@
-#ifndef NNET_COMPRESSED_LAYER_H_
-#define NNET_COMPRESSED_LAYER_H_
-
-#include "hls_stream.h"
-#include "nnet_common.h"
-#include "nnet_dense.h"
-#include <math.h>
-
-namespace nnet {
-
-template <typename CONFIG_T>
-void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out],
-               typename CONFIG_T::accum_t weight) {
-    for (unsigned k = 0; k < CONFIG_T::n_out; k++) {
-        #pragma HLS UNROLL
-        if (k == index)
-            mult[k] += weight;
-    }
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
-                      typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros],
-                      typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
-
-    const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor);
-
-    typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
-    #pragma HLS ARRAY_PARTITION variable=acc complete
-    #pragma HLS ARRAY_PARTITION variable=biases complete
-    #pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit
-
-#ifdef __VITIS_HLS__
-    #pragma 
HLS AGGREGATE variable=weights -#else - #pragma HLS data_pack variable=weights struct_level -#endif - -InitAccum: - for (unsigned i = 0; i < CONFIG_T::n_out; i++) { - #pragma HLS UNROLL - acc[i] = (typename CONFIG_T::accum_t)(biases[i]); - } - - // Do the compressed matrix-multiply - const int rufactor = CONFIG_T::reuse_factor; -ReuseLoop: - for (unsigned ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - - typename CONFIG_T::accum_t mult[CONFIG_T::n_out]; - #pragma HLS ARRAY_PARTITION variable=mult complete - - ResetMult: - for (int imult = 0; imult < CONFIG_T::n_out; imult++) { - #pragma HLS UNROLL - mult[imult] = 0; - } - - CompressedMultLoop: - for (unsigned im = 0; im < multiplier_limit; im++) { - #pragma HLS UNROLL - unsigned w = im * rufactor + ir; - auto row = weights[w].row_index; - auto col = weights[w].col_index; - auto weight_cache = weights[w].weight; - data_T data_cache = data[row]; - // mult[col] += weight_cache * data_cache; - typename CONFIG_T::accum_t prod = - CONFIG_T::template product::product(data_cache, weight_cache); - fill_mult(col, mult, prod); - } - - for (int im = 0; im < CONFIG_T::n_out; im++) { - acc[im] += mult[im]; - } - } - -// Cast to "res_t" type -ResultLoop: - for (unsigned i = 0; i < CONFIG_T::n_out; i++) { - #pragma HLS UNROLL - // res[i] = (res_T) (acc[i]); - res[i] = cast(acc[i]); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +template +void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], + typename CONFIG_T::accum_t weight) { + for (unsigned k = 0; k < CONFIG_T::n_out; k++) { + #pragma HLS UNROLL + if (k == index) + mult[k] += weight; + } +} + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor); + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + #pragma HLS ARRAY_PARTITION variable=biases complete + #pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit + +#ifdef __VITIS_HLS__ + #pragma HLS AGGREGATE variable=weights +#else + #pragma HLS data_pack variable=weights struct_level +#endif + +InitAccum: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + #pragma HLS UNROLL + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + // Do the compressed matrix-multiply + const int rufactor = CONFIG_T::reuse_factor; +ReuseLoop: + for (unsigned ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + typename CONFIG_T::accum_t mult[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < CONFIG_T::n_out; imult++) { + #pragma HLS UNROLL + mult[imult] = 0; + } + + CompressedMultLoop: + for (unsigned im = 0; im < multiplier_limit; im++) { + #pragma HLS UNROLL + unsigned w = im * rufactor + ir; + auto row = weights[w].row_index; + auto col = weights[w].col_index; + auto weight_cache = weights[w].weight; + data_T data_cache = data[row]; + // mult[col] += weight_cache * data_cache; + typename CONFIG_T::accum_t prod = + CONFIG_T::template product::product(data_cache, weight_cache); + fill_mult(col, mult, prod); + } + + for (int im = 0; im < 
CONFIG_T::n_out; im++) { + acc[im] += mult[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + #pragma HLS UNROLL + // res[i] = (res_T) (acc[i]); + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h index 3215eeb4c5..c31958d3e5 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h @@ -1,72 +1,72 @@ -#ifndef NNET_DENSE_LATENCY_H_ -#define NNET_DENSE_LATENCY_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_helpers.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -template -void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - data_T cache; - typename CONFIG_T::accum_t mult[CONFIG_T::n_in * CONFIG_T::n_out]; - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=weights,biases - - // For parallel inputs: - // - completely partition arrays -- target fabric - // - if we have an unroll factor, limit number of multipliers - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes - #pragma HLS ARRAY_PARTITION variable=biases complete - #pragma HLS ARRAY_PARTITION variable=mult complete - #pragma HLS ARRAY_PARTITION variable=acc complete - - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - -// Do the matrix-multiply -Product1: - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - cache = data[ii]; - Product2: - for (int jj = 0; jj < CONFIG_T::n_out; jj++) { - int index = ii * CONFIG_T::n_out + jj; - mult[index] = CONFIG_T::template product::product(cache, weights[index]); - } - } - -// Initialize accumulator with input biases -ResetAccum: - for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -// Accumulate multiplication result -Accum1: - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - Accum2: - for (int jj = 0; jj < CONFIG_T::n_out; jj++) { - int index = ii * CONFIG_T::n_out + jj; - acc[jj] += mult[index]; - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - // res[ires] = (res_T) (acc[ires]); - res[ires] = cast(acc[ires]); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_DENSE_LATENCY_H_ +#define NNET_DENSE_LATENCY_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + data_T cache; + typename CONFIG_T::accum_t mult[CONFIG_T::n_in * CONFIG_T::n_out]; + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll 
factor, limit number of multipliers + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + #pragma HLS ARRAY_PARTITION variable=biases complete + #pragma HLS ARRAY_PARTITION variable=mult complete + #pragma HLS ARRAY_PARTITION variable=acc complete + + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +// Do the matrix-multiply +Product1: + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + cache = data[ii]; + Product2: + for (int jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + mult[index] = CONFIG_T::template product::product(cache, weights[index]); + } + } + +// Initialize accumulator with input biases +ResetAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +// Accumulate multiplication result +Accum1: + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + Accum2: + for (int jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + acc[jj] += mult[index]; + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + // res[ires] = (res_T) (acc[ires]); + res[ires] = cast(acc[ires]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h index 88de94729b..1ff33a34fb 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h @@ -1,263 +1,263 @@ -#ifndef NNET_DENSE_RESOURCE_H_ -#define NNET_DENSE_RESOURCE_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_mult.h" -#include -#include - -namespace nnet { - -template -void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - const int rufactor = CONFIG_T::reuse_factor; - const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); - const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); - const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); - const int multscale = multiplier_limit / CONFIG_T::n_out; - const int nin = CONFIG_T::n_in; - const int nout = CONFIG_T::n_out; - - assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); - - #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - #pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - #pragma HLS ARRAY_PARTITION variable=acc complete - -InitAccum: - for (int iacc = 0; iacc < nout; iacc++) { - #pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - - int w_index = ir; - int in_index = ir; - int out_index = 0; - int acc_step = 0; - - MultLoop: - for (int im = 0; im < block_factor; im++) { - #pragma HLS UNROLL - - acc[out_index] += static_cast( - 
CONFIG_T::template product::product(data[in_index], weights[w_index])); - - // Increment w_index - w_index += rufactor; - // Increment in_index - in_index += rufactor; - if (in_index >= nin) { - in_index = ir; - } - // Increment out_index - if (acc_step + 1 >= multscale) { - acc_step = 0; - out_index++; - } else { - acc_step++; - } - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - #pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - -template -void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); - const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); - const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); - const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); - const int multscale = multiplier_limit / CONFIG_T::n_out; - const int nin = CONFIG_T::n_in; - const int nout = CONFIG_T::n_out; - - assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); - - #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - #pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - #pragma HLS ARRAY_PARTITION variable=acc complete - -InitAccum: - for (int iacc = 0; iacc < nout; iacc++) { - #pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - - int w_index; - int in_index = 0; - int out_index; - int outstep = 0; - const int outscale = rufactor / nin; - - int outidx[rufactor]; -IndexLoop: - for (int ir = 0; ir < rufactor; ir++) { - outidx[ir] = outstep; - if ((ir + 1) % nin == 0) { - outstep++; - } - } - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - - w_index = ir; - out_index = outidx[ir] /*outstep*/; - - MultLoop: - for (int im = 0; im < block_factor; im++) { - #pragma HLS UNROLL - acc[out_index] += static_cast( - CONFIG_T::template product::product(data[in_index], weights[w_index])); - - w_index += rufactor; - if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) - break; // check out of bounds - out_index += outscale; - } - - in_index++; - if (in_index >= nin) { - in_index = 0; - // outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. 
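
A small self-checking sketch of the outidx workaround above, with made-up sizes (n_in = 2, n_out = 4, reuse_factor = 4). Precomputing outidx in IndexLoop keeps both the divide and the conditional outstep increment out of the pipelined ReuseLoop: the weight w = ir + rufactor*im feeds output w / n_in, which equals outidx[ir] + im*outscale.

    #include <cassert>

    int main() {
        const int nin = 2, nout = 4, rufactor = 4;        // RF > n_in and RF % n_in == 0
        const int block_factor = (nin * nout) / rufactor; // = 2
        const int outscale = rufactor / nin;              // = 2
        int outidx[4], outstep = 0;
        for (int ir = 0; ir < rufactor; ir++) { // mirrors IndexLoop: outidx = {0, 0, 1, 1}
            outidx[ir] = outstep;
            if ((ir + 1) % nin == 0)
                outstep++;
        }
        for (int ir = 0; ir < rufactor; ir++)
            for (int im = 0; im < block_factor; im++) {
                int w = ir + rufactor * im;                    // weight index visited in MultLoop
                assert(w / nin == outidx[ir] + im * outscale); // out_index recovered without a divide
            }
        return 0;
    }
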
- } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - #pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - -template -void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - const int rufactor = CONFIG_T::reuse_factor; - const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); - const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); - const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); - const int multscale = multiplier_limit / CONFIG_T::n_out; - const int nin = CONFIG_T::n_in; - const int nout = CONFIG_T::n_out; - - assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - assert((rufactor > nin) && "This function is correct only for RF > N_IN"); - - #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - #pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - #pragma HLS ARRAY_PARTITION variable=acc complete - -InitAccum: - for (int iacc = 0; iacc < nout; iacc++) { - #pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - typename CONFIG_T::accum_t tmpmult[block_factor]; - #pragma HLS ARRAY_PARTITION variable=tmpmult complete - - MultLoop: - for (int im = 0; im < block_factor; im++) { - #pragma HLS UNROLL - int w_index = ir + rufactor * im; - int in_index = w_index % nin; - if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) - continue; // check out of bounds - tmpmult[im] = - CONFIG_T::template product::product(data[in_index], weights[w_index]); - } - - typename CONFIG_T::accum_t mult[multiplier_limit]; - #pragma HLS ARRAY_PARTITION variable=mult complete - - ResetMult: - for (int imult = 0; imult < multiplier_limit; imult++) { - #pragma HLS UNROLL - mult[imult] = 0; - } - - AccumLoop1: - for (int im = 0; im < block_factor; im++) { - #pragma HLS UNROLL - int w_index = ir + rufactor * im; - int out_index = w_index / multfactor; - if (out_index >= multiplier_limit) - continue; // check out of bounds - mult[out_index] += tmpmult[im]; - } - - AccumLoop2: - for (int im = 0; im < multiplier_limit; im++) { - #pragma HLS UNROLL - // int out_index = im/multscale; // This is the general case - // acc[out_index] += mult[im]; - acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - #pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - -template -void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - #pragma HLS INLINE recursive - - if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { - dense_resource_rf_leq_nin(data, res, weights, biases); - } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { - dense_resource_rf_gt_nin_rem0(data, res, weights, biases); - } else { - dense_resource_rf_gt_nin(data, res, weights, biases); - } -} - -} 
// namespace nnet - -#endif +#ifndef NNET_DENSE_RESOURCE_H_ +#define NNET_DENSE_RESOURCE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include +#include + +namespace nnet { + +template +void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int w_index = ir; + int in_index = ir; + int out_index = 0; + int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + // Increment w_index + w_index += rufactor; + // Increment in_index + in_index += rufactor; + if (in_index >= nin) { + in_index = ir; + } + // Increment out_index + if (acc_step + 1 >= multscale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS 
ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + + int w_index; + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = rufactor / nin; + + int outidx[rufactor]; +IndexLoop: + for (int ir = 0; ir < rufactor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % nin == 0) { + outstep++; + } + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + w_index = ir; + out_index = outidx[ir] /*outstep*/; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + w_index += rufactor; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + break; // check out of bounds + out_index += outscale; + } + + in_index++; + if (in_index >= nin) { + in_index = 0; + // outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin) && "This function is correct only for RF > N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + typename CONFIG_T::accum_t tmpmult[block_factor]; + #pragma HLS ARRAY_PARTITION variable=tmpmult complete + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + rufactor * im; + int in_index = w_index % nin; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; // check out of bounds + tmpmult[im] = + CONFIG_T::template product::product(data[in_index], weights[w_index]); + } + + typename CONFIG_T::accum_t mult[multiplier_limit]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < multiplier_limit; imult++) { + #pragma HLS UNROLL 
+            mult[imult] = 0;
+        }
+
+    AccumLoop1:
+        for (int im = 0; im < block_factor; im++) {
+            #pragma HLS UNROLL
+            int w_index = ir + rufactor * im;
+            int out_index = w_index / multfactor;
+            if (out_index >= multiplier_limit)
+                continue; // check out of bounds
+            mult[out_index] += tmpmult[im];
+        }
+
+    AccumLoop2:
+        for (int im = 0; im < multiplier_limit; im++) {
+            #pragma HLS UNROLL
+            // int out_index = im/multscale; // This is the general case
+            // acc[out_index] += mult[im];
+            acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out
+        }
+    }
+
+// Cast to "res_t" type
+Result:
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        #pragma HLS UNROLL
+        res[ires] = cast<data_T, res_T, CONFIG_T>(acc[ires]);
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
+                    typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
+                    typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
+
+    #pragma HLS INLINE recursive
+
+    if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
+        dense_resource_rf_leq_nin<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) {
+        dense_resource_rf_gt_nin_rem0<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else {
+        dense_resource_rf_gt_nin<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h
index e791276326..4b6e0d08e7 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h
@@ -1,47 +1,44 @@
-#ifndef NNET_DENSE_SEQ_H_
-#define NNET_DENSE_SEQ_H_
-
-#include "nnet_common.h"
-#include "nnet_mult.h"
-#include "nnet_helpers.h"
-#include "hls_stream.h"
-#include <math.h>
-
-namespace nnet {
-
-template<class data_T, class res_T, typename CONFIG_T>
-void dense_seq(
-    data_T data[CONFIG_T::n_in*CONFIG_T::seq_len],
-    res_T res[CONFIG_T::n_out*CONFIG_T::seq_len],
-    typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out],
-    typename CONFIG_T::bias_t biases[CONFIG_T::n_out])
-{
-    #pragma HLS inline
-
-    data_T in_val[CONFIG_T::n_in];
-    #pragma HLS ARRAY_PARTITION variable=in_val complete
-
-    if (CONFIG_T::strategy == nnet::latency) {
-        for (int j=0; j<CONFIG_T::seq_len; ++j){
-            #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-            for (int i=0; i<CONFIG_T::n_in; ++i){
-                #pragma HLS UNROLL
-                in_val[i] = data[j*CONFIG_T::n_in+i];
-            }
-            dense_latency<data_T, res_T, CONFIG_T>(in_val, res+(CONFIG_T::n_out*j), weights, biases);
-        }
-    } else {
-        for (int j=0; j<CONFIG_T::seq_len; ++j){
-            #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-            for (int i=0; i<CONFIG_T::n_in; ++i){
-                #pragma HLS UNROLL
-                in_val[i] = data[j*CONFIG_T::n_in+i];
-            }
-            dense_resource<data_T, res_T, CONFIG_T>(in_val, res+(CONFIG_T::n_out*j), weights, biases);
-        }
-    }
-}
-
-}
-
-#endif
+#ifndef NNET_DENSE_SEQ_H_
+#define NNET_DENSE_SEQ_H_
+
+#include "hls_stream.h"
+#include "nnet_common.h"
+#include "nnet_helpers.h"
+#include "nnet_mult.h"
+#include <math.h>
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void dense_seq(data_T data[CONFIG_T::n_in * CONFIG_T::seq_len], res_T res[CONFIG_T::n_out * CONFIG_T::seq_len],
+               typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
+               typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
+    #pragma HLS inline
+
+    data_T in_val[CONFIG_T::n_in];
+    #pragma HLS ARRAY_PARTITION variable=in_val complete
+
+    if (CONFIG_T::strategy == nnet::latency) {
+        for (int j = 0; j < CONFIG_T::seq_len; ++j) {
+            #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+            for (int i = 0; i < CONFIG_T::n_in; ++i) {
+                #pragma HLS UNROLL
+                in_val[i] = data[j * CONFIG_T::n_in + i];
+            }
+            dense_latency<data_T, res_T, CONFIG_T>(in_val, res + (CONFIG_T::n_out * j), weights, biases);
+        }
+    } else {
+        for (int j = 0; j < CONFIG_T::seq_len; ++j) {
+            #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+            for (int i = 0; i < CONFIG_T::n_in; ++i) {
+                #pragma HLS UNROLL
+                in_val[i] = data[j * CONFIG_T::n_in + i];
+            }
+            dense_resource<data_T, res_T, CONFIG_T>(in_val, res + (CONFIG_T::n_out * j), weights, biases);
+        }
+    }
+}
+
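
With nnet_dense_seq.h in place, a config whose seq_len is greater than 1 makes nnet::dense() reuse one dense kernel per time step instead of instantiating seq_len copies. A hypothetical call site, where seq_cfg, input_t, result_t, and the buffers are all illustrative names rather than anything generated by this patch:

    // x holds 20 frames of 16 features, flattened; y receives 20 frames of 8.
    struct seq_cfg : nnet::dense_config {
        static const unsigned n_in = 16;
        static const unsigned n_out = 8;
        static const unsigned seq_len = 20; // assumed member; selects the dense_seq() path
    };
    // nnet::dense<input_t, result_t, seq_cfg>(x, y, w, b);
    // Iteration j copies x[j*16 .. j*16+15] into in_val and writes y + 8*j,
    // with the per-step loop pipelined at II = reuse_factor.
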
+} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h b/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h index 1fcd554598..fb5fe72fcf 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h @@ -1,816 +1,816 @@ -#ifndef NNET_GARNET_H_ -#define NNET_GARNET_H_ - -#include "hls_math.h" -#include "hls_stream.h" -#include "nnet_common.h" - -namespace nnet { -namespace garnet_utils { - -template -inline typename std::enable_if::value>::type -initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { - typedef ap_uint index_t; - - unsigned const table_size = (1 << CONFIG_T::distance_width); - - index_t index; - typename CONFIG_T::distance_t distance; - - // edge_weight_t is ap_ufixed with 0 iwidth -> let index 0 be a saturated version of 1 - edge_weights_table[0] = ap_ufixed(1.); - - for (unsigned iw = 1; iw < table_size; ++iw) { - index = iw; - distance.range(CONFIG_T::distance_width - 1, 0) = index.range(CONFIG_T::distance_width - 1, 0); - edge_weights_table[iw] = hls::exp(-distance * distance); - } -} - -template -inline typename std::enable_if::value>::type -initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { - unsigned const table_size = (1 << CONFIG_T::distance_width); - double const step = 64. / table_size; - - typename CONFIG_T::distance_t v = -32.; - for (unsigned iw = 0; iw < table_size; ++iw) { - edge_weights_table[iw] = std::exp(-v * v); - v += step; - } -} - -template -inline typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type -get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { - typedef ap_uint index_t; - - index_t index(distance.range(CONFIG_T::distance_width - 1, 0)); - - return edge_weights_table[index]; -} - -template -inline - typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type - get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { - unsigned const table_size = (1 << CONFIG_T::distance_width); - double const step = 64. / table_size; - - int index = (distance + 32.) 
/ step; - if (index < 0) - index = 0; - else if (index >= table_size) - index = table_size - 1; - - return edge_weights_table[index]; -} - -template typename CONFIG_T::edge_weight_t compute_edge_weight(typename CONFIG_T::distance_t distance) { - if (CONFIG_T::is_stack) { - #pragma HLS INLINE OFF - } -#ifdef __SYNTHESIS__ - typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; - // unsigned const reshape_factor = CONFIG_T::n_aggregators * CONFIG_T::n_in_features * (CONFIG_T::n_vertices / - // CONFIG_T::reuse_factor); - // #pragma HLS ARRAY_RESHAPE variable=edge_weights_table cyclic factor=reshape_factor dim=1 - bool initialized = false; -#else - static typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; - static bool initialized = false; -#endif - if (not initialized) { - initialize_edge_weights_table(edge_weights_table); - initialized = true; - } - - return get_edge_weight(distance, edge_weights_table); -} - -template -inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, - exponent_T exponent) { - #pragma HLS INLINE - return dividend >> exponent; -} - -template -inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, - exponent_T exponent) { - #pragma HLS INLINE - return dividend / std::pow(2., exponent); -} - -template struct Means { - typedef E edge_weight_t; - - edge_weight_t edge_weight_mean[CONFIG_T::n_aggregators]; - typename CONFIG_T::aggr_t weighted_feature_mean[CONFIG_T::n_aggregators * CONFIG_T::n_in_features]; - - Means() { - #pragma HLS INLINE - #pragma HLS ARRAY_PARTITION variable=edge_weight_mean complete - #pragma HLS ARRAY_PARTITION variable=weighted_feature_mean complete - #pragma HLS UNROLL region - - Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - edge_weight_mean[ia] = 0.; - - InFeatures: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - weighted_feature_mean[iax] = 0.; - } - } - } - - void set_weight(unsigned, edge_weight_t const &) { - #pragma HLS INLINE - } - - void add_means_normalized(Means const &local) { - #pragma HLS INLINE - // Always called within a pipelined region - no UNROLL needed - - unsigned const log2_unroll_factor = CONFIG_T::n_vertices_width - CONFIG_T::log2_reuse_factor; - - Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - edge_weight_mean[ia] += normalize_log2(local.edge_weight_mean[ia], log2_unroll_factor); - - InFeatures: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - weighted_feature_mean[iax] += normalize_log2(local.weighted_feature_mean[iax], log2_unroll_factor); - } - } - } - - template - typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { - #pragma HLS INLINE - #pragma HLS UNROLL region - - // accum comes divided by unroll factor - typename T::norm_t nvtx_norm = (T::n_vertices / T::reuse_factor) / nvtx; - - Aggregators: - for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { - edge_weight_mean[ia] = accum.edge_weight_mean[ia] * nvtx_norm; - - InFeatures: - for (unsigned ix = 0; ix < T::n_in_features; ++ix) { - unsigned const iax = ia * T::n_in_features + ix; - - weighted_feature_mean[iax] = accum.weighted_feature_mean[iax] * nvtx_norm; - } - } - } - - template - typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { - #pragma 
HLS INLINE - #pragma HLS UNROLL region - - Aggregators: - for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { - - edge_weight_mean[ia] = normalize_log2(accum.edge_weight_mean[ia], T::log2_reuse_factor); - - InFeatures: - for (unsigned ix = 0; ix < T::n_in_features; ++ix) { - unsigned const iax = ia * T::n_in_features + ix; - - weighted_feature_mean[iax] = normalize_log2(accum.weighted_feature_mean[iax], T::log2_reuse_factor); - } - } - } -}; - -template struct WeightsAndMeans : public Means { - typedef E edge_weight_t; - - edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; - - WeightsAndMeans() : Means() { - #pragma HLS INLINE - unsigned const reshape_factor = CONFIG_T::n_aggregators * (CONFIG_T::n_vertices / CONFIG_T::reuse_factor); - #pragma HLS ARRAY_PARTITION variable=edge_weights cyclic factor=reshape_factor - } - - void set_weight(unsigned iva, edge_weight_t const &weight) { - #pragma HLS INLINE - edge_weights[iva] = weight; - } -}; - -template struct OutputBiasNormalizer; - -template -struct OutputBiasNormalizer::type> { - typedef typename CONFIG_T::output_transform_biases_t biases_t; - - biases_t const (&output_biases)[CONFIG_T::n_out_features]; - - OutputBiasNormalizer(nvtx_T const) : output_biases{CONFIG_T::output_transform_biases} { - #pragma HLS INLINE - } -}; - -template -struct OutputBiasNormalizer::type> { - typedef typename CONFIG_T::output_transform_biases_t biases_t; - - biases_t output_biases[CONFIG_T::n_out_features]; - - OutputBiasNormalizer(nvtx_T const nvtx) { - #pragma HLS ARRAY_PARTITION variable=output_biases complete - #pragma HLS UNROLL region - - // Cannot add a loop label here due to a Vivado HLS bug, apparently - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - typename CONFIG_T::aggr_t bias = CONFIG_T::output_transform_biases[io]; - bias *= nvtx; - output_biases[io] = normalize_log2(bias, CONFIG_T::n_vertices_width); - } - } -}; - -template struct InputDataGetter { - typedef data_T data_t; - - data_T const *dataref; - - InputDataGetter(data_T const *d) : dataref{d} { - #pragma HLS INLINE - } - data_T const &get(unsigned iv, unsigned ix) const { - #pragma HLS INLINE - unsigned const ivx = iv * CONFIG_T::n_in_features + ix; - return dataref[ivx]; - } -}; - -template struct SingleVertexDataGetter { - typedef data_T data_t; - - data_T const (&dataref)[CONFIG_T::n_in_features]; - - SingleVertexDataGetter(data_T const (&d)[CONFIG_T::n_in_features]) : dataref{d} { - #pragma HLS INLINE - } - data_T const &get(unsigned, unsigned ix) const { - #pragma HLS INLINE - return dataref[ix]; - } -}; - -template struct OutputResSetter { - typedef res_T res_t; - - res_T *resref; - - OutputResSetter(res_T *r) : resref{r} { - #pragma HLS INLINE - } - void set(unsigned iv, unsigned io, res_T const &acc) { - #pragma HLS INLINE - unsigned const ivo = iv * CONFIG_T::n_out_features + io; - resref[ivo] = acc; - } -}; - -template struct SingleVertexResSetter { - typedef res_T res_t; - - res_T (&resref)[CONFIG_T::n_out_features]; - - SingleVertexResSetter(res_T (&r)[CONFIG_T::n_out_features]) : resref{r} { - #pragma HLS INLINE - } - void set(unsigned, unsigned io, res_T const &acc) { - #pragma HLS INLINE - resref[io] = acc; - } -}; - -template -inline void compute_weights_aggregates(data_getter_T const &data_getter, unsigned iv, arrays_local_T &arrays_local, - arrays_T &arrays) { - #pragma HLS INLINE - -Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - typename CONFIG_T::distance_t distance = 
CONFIG_T::aggregator_distance_biases[ia]; - - InFeatures1: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - - typename CONFIG_T::distance_t incr = data_getter.get(iv, ix) * CONFIG_T::aggregator_distance_weights[iax]; - - distance += incr; - } - - typename CONFIG_T::edge_weight_t edge_weight = - garnet_utils::compute_edge_weight(distance); - - arrays_local.edge_weight_mean[ia] += edge_weight; - - InFeatures2: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - - typename data_getter_T::data_t incr = data_getter.get(iv, ix) * edge_weight; - - arrays_local.weighted_feature_mean[iax] += incr; - } - - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - arrays.set_weight(iva, edge_weight); - } -} - -template -inline typename CONFIG_T::aggr_t compute_output_base_core(arrays_T const &arrays, unsigned io, unsigned ia) { - #pragma HLS INLINE - #pragma HLS UNROLL region - - unsigned const ioa = io * CONFIG_T::n_aggregators + ia; - typename CONFIG_T::aggr_t aggr = arrays.edge_weight_mean[ia] * CONFIG_T::input_transform_biases[ioa]; - -InFeatures: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const ioax = ioa * CONFIG_T::n_in_features + ix; - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - - aggr += arrays.weighted_feature_mean[iax] * CONFIG_T::input_transform_weights[ioax]; - } - - return aggr; -} - -template -inline void compute_output_base(arrays_T const &arrays, - typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]) { - #pragma HLS INLINE - #pragma HLS UNROLL region - -OutFeatures: - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const ioa = io * CONFIG_T::n_aggregators + ia; - - output_base[ioa] = compute_output_base_core(arrays, io, ia); - } - } -} - -template -inline void -compute_vertex_output(arrays_T const &arrays, unsigned iv, - typename CONFIG_T::aggr_t const output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators], - res_setter_T &res_setter) { - #pragma HLS INLINE - - typename arrays_T::edge_weight_t edge_weights[CONFIG_T::n_aggregators]; - #pragma HLS ARRAY_PARTITION variable=edge_weights complete - -Aggregators1: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - - edge_weights[ia] = arrays.edge_weights[iva]; - } - -OutFeatures: - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - typename res_setter_T::res_t acc = CONFIG_T::output_transform_biases[io]; - - Aggregators2: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const ioa = io * CONFIG_T::n_aggregators + ia; - - typename res_setter_T::res_t incr = edge_weights[ia] * output_base[ioa]; - acc += incr; - } - - res_setter.set(iv, io, acc); - } -} - -template -void aggregate(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx, arrays_T &arrays) { - InputDataGetter data_getter(data); - - unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; - - Means means_accum; - -VerticesOuter: - for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { - #pragma HLS PIPELINE - - if (ivv * unroll_factor >= nvtx) - break; - - Means means_local; - - VerticesInner: - for (unsigned ir = 0; ir < unroll_factor; ++ir) { - unsigned iv = ivv * unroll_factor + 
ir; - - if (iv == nvtx) - break; - - compute_weights_aggregates(data_getter, iv, means_local, arrays); - } - - means_accum.add_means_normalized(means_local); - } - - arrays.set_means_normalized(nvtx, means_accum); -} - -template -void distribute(nvtx_T const nvtx, arrays_T const &arrays, res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { - OutputResSetter res_setter(res); - - typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]; - #pragma HLS ARRAY_PARTITION variable=output_base complete - - compute_output_base(arrays, output_base); - - unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; - -VerticesOuter: - for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { - #pragma HLS PIPELINE - - if (ivv * unroll_factor >= nvtx) - break; - - VerticesInner: - for (unsigned ir = 0; ir < unroll_factor; ++ir) { - unsigned iv = ivv * unroll_factor + ir; - - if (iv == nvtx) - break; - - compute_vertex_output(arrays, iv, output_base, res_setter); - } - } -} - -template -void set_output(output_biases_T const &output_transform_biases, arrays_T const &arrays, - res_T res[CONFIG_T::n_out_features]) { - #pragma HLS PIPELINE - -OutFeatures: - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - res_T acc = output_transform_biases.output_biases[io]; - - Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - typename CONFIG_T::aggr_t aggr = compute_output_base_core(arrays, io, ia); - - acc += arrays.edge_weight_mean[ia] * aggr; - } - - res[io] = acc; - } -} - -template -void distribute_aggregate(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, current_arrays_T ¤t_arrays) { - typedef typename prev_layer_t::output_t data_T; - - typename prev_layer_t::aggr_t prev_output_base[prev_layer_t::n_out_features * prev_layer_t::n_aggregators]; - #pragma HLS ARRAY_PARTITION variable=prev_output_base complete - - compute_output_base(prev_arrays, prev_output_base); - - unsigned const unroll_factor = current_layer_t::n_vertices >> current_layer_t::log2_reuse_factor; - - Means means_accum; - -VerticesOuter: - for (unsigned ivv = 0; ivv < current_layer_t::reuse_factor; ++ivv) { - #pragma HLS PIPELINE - - if (ivv * unroll_factor >= nvtx) - break; - - Means means_local; - - VerticesInner: - for (unsigned ir = 0; ir < unroll_factor; ++ir) { - unsigned iv = ivv * unroll_factor + ir; - - if (iv == nvtx) - break; - - data_T data[prev_layer_t::n_out_features]; - #pragma HLS ARRAY_PARTITION variable=data complete - - SingleVertexResSetter res_setter(data); - - compute_vertex_output(prev_arrays, iv, prev_output_base, res_setter); - - SingleVertexDataGetter data_getter(data); - - compute_weights_aggregates(data_getter, iv, means_local, current_arrays); - } - - means_accum.add_means_normalized(means_local); - } - - current_arrays.set_means_normalized(nvtx, means_accum); -} - -template -inline typename std::enable_if::value>::type -sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { - #pragma HLS INLINE - - distribute_aggregate(nvtx, prev_arrays, last_arrays); -} - -template -inline typename std::enable_if::value>::type -sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { - #pragma HLS INLINE - - WeightsAndMeans current_arrays; - - distribute_aggregate(nvtx, prev_arrays, current_arrays); - - sublayer(nvtx, current_arrays, last_arrays); -} -} // namespace garnet_utils - -struct garnet_config { - // Layer specs - static const unsigned 
n_vertices_width = 8; - static const unsigned n_vertices = (1 << n_vertices_width); - static const unsigned n_in_features = 4; - static const unsigned n_propagate = 4; - static const unsigned n_aggregators = 4; - static const unsigned n_out_features = 4; - static const unsigned distance_width = 12; - - // Internal data type definitions - typedef float input_transform_weights_t; - typedef float input_transform_biases_t; - typedef float output_transform_weights_t; - typedef float output_transform_biases_t; - typedef float aggregator_distance_weights_t; - typedef float aggregator_distance_biases_t; - - typedef float norm_t; - typedef float distance_t; - typedef float edge_weight_t; - typedef float edge_weight_aggr_t; - typedef float aggr_t; - typedef float output_t; - - /* static const input_transform_weights_t (&input_transform_weights)[n_out_features * n_aggregators * n_in_features]; */ - /* static const input_transform_biases_t (&input_transform_biases)[n_out_features * n_aggregators]; */ - /* static const aggregator_distance_weights_t (&aggregator_distance_weights)[n_aggregators * n_in_features]; */ - /* static const aggregator_distance_biases_t (&aggregator_distance_biases)[n_aggregators]; */ - /* static const output_transform_biases_t (&output_transform_biases)[n_out_features]; */ - - enum OutputCollapse { no_collapse, collapse_mean, collapse_max }; - - static const unsigned output_collapse = no_collapse; - - static const bool mean_by_nvert = false; - static const bool is_stack = false; - - // Optimization specs - static const unsigned reuse_factor = 64; - static const unsigned log2_reuse_factor = 6; -}; - -// vertices -> vertices -template -typename std::enable_if::type -garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { - #pragma HLS DATAFLOW - - garnet_utils::WeightsAndMeans arrays; - - garnet_utils::aggregate(data, nvtx[0], arrays); - - garnet_utils::distribute(nvtx[0], arrays, res); -} - -// vertices -> out features -template -typename std::enable_if::type -garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_out_features]) { - #pragma HLS DATAFLOW - - garnet_utils::Means arrays; - - garnet_utils::aggregate(data, nvtx[0], arrays); - - garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); - - garnet_utils::set_output(normalize_bias, arrays, res); -} - -// vertices -> vertices -template -typename std::enable_if::type -garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { - #pragma HLS DATAFLOW - - typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; - unsigned const ilast = CONFIG_T::n_sublayers - 1; - typedef typename CONFIG_T::template sublayer_t last_layer_t; - - garnet_utils::WeightsAndMeans arrays_first; - garnet_utils::Means arrays_last; - - garnet_utils::aggregate(data, nvtx[0], arrays_first); - - garnet_utils::sublayer(nvtx[0], arrays_first, - arrays_last); - - garnet_utils::distribute(nvtx[0], arrays_last, res); -} - -// vertices -> out features -template -typename std::enable_if::type -garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_out_features]) { - #pragma HLS DATAFLOW - - typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; - unsigned const ilast = CONFIG_T::n_sublayers - 1; - 
typedef typename CONFIG_T::template sublayer_t last_layer_t; - - garnet_utils::WeightsAndMeans arrays_first; - garnet_utils::Means arrays_last; - - garnet_utils::aggregate(data, nvtx[0], arrays_first); - - garnet_utils::sublayer(nvtx[0], arrays_first, - arrays_last); - - garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); - - garnet_utils::set_output(normalize_bias, arrays_last, res); -} - -/* Reference (dumb) implementation returning (Vertices, Features) */ -template -typename std::enable_if::type -garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { - typename CONFIG_T::edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; - typename CONFIG_T::aggr_t propagated_features[CONFIG_T::n_vertices * CONFIG_T::n_propagate]; - - for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { - if (iv == nvtx[0]) - break; - - for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { - unsigned const ivp = iv * CONFIG_T::n_propagate + ip; - - propagated_features[ivp] = CONFIG_T::input_transform_biases[ip]; - - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const ivx = iv * CONFIG_T::n_in_features + ix; - unsigned const ipx = ip * CONFIG_T::n_in_features + ix; - - propagated_features[ivp] += data[ivx] * CONFIG_T::input_transform_weights[ipx]; - } - } - - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - - typename CONFIG_T::aggr_t distance = CONFIG_T::aggregator_distance_biases[ia]; - - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const ivx = iv * CONFIG_T::n_in_features + ix; - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - - distance += data[ivx] * CONFIG_T::aggregator_distance_weights[iax]; - } - - edge_weights[iva] = garnet_utils::compute_edge_weight(distance); - } - } - - typename CONFIG_T::aggr_t aggregated_features[CONFIG_T::n_aggregators * CONFIG_T::n_propagate]; - - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { - unsigned const iap = ia * CONFIG_T::n_propagate + ip; - - aggregated_features[iap] = 0.; - - for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { - if (iv == nvtx[0]) - break; - - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - unsigned const ivp = iv * CONFIG_T::n_propagate + ip; - - aggregated_features[iap] += edge_weights[iva] * propagated_features[ivp]; - } - } - } - - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { - unsigned const iap = ia * CONFIG_T::n_propagate + ip; - - if (CONFIG_T::mean_by_nvert) - aggregated_features[iap] /= nvtx[0]; - else { - // Not using right shift in case aggr_t is float or double - aggregated_features[iap] /= CONFIG_T::n_vertices; - } - } - } - - for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { - if (iv == nvtx[0]) - break; - - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - unsigned const ivo = iv * CONFIG_T::n_out_features + io; - - typename CONFIG_T::aggr_t acc = CONFIG_T::output_transform_biases[io]; - - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - unsigned const ioa = io * CONFIG_T::n_aggregators + ia; - - typename CONFIG_T::aggr_t aggr = 0.; - - for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { - unsigned const iap = ia 
* CONFIG_T::n_propagate + ip; - unsigned const ioap = ioa * CONFIG_T::n_propagate + ip; - - aggr += CONFIG_T::output_transform_weights[ioap] * aggregated_features[iap]; - } - - acc += edge_weights[iva] * aggr; - } - - res[ivo] = acc; - } - } -} - -/* Reference (dumb) implementation returning (Features) - output averaged over vertices already */ -template -typename std::enable_if::type -garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_out_features]) { - typename CONFIG_T::aggr_t vertex_res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]; - - garnet_ref(data, nvtx, vertex_res); - - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - typename CONFIG_T::aggr_t acc = 0.; - - for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { - if (iv == nvtx[0]) - break; - - unsigned const ivo = iv * CONFIG_T::n_out_features + io; - - acc += vertex_res[ivo]; - } - - if (CONFIG_T::mean_by_nvert) - acc /= nvtx[0]; - else { - // Not using right shift in case aggr_t is float or double - acc /= CONFIG_T::n_vertices; - } - - res[io] = acc; - } -} - -} // namespace nnet - -#endif +#ifndef NNET_GARNET_H_ +#define NNET_GARNET_H_ + +#include "hls_math.h" +#include "hls_stream.h" +#include "nnet_common.h" + +namespace nnet { +namespace garnet_utils { + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ap_uint index_t; + + unsigned const table_size = (1 << CONFIG_T::distance_width); + + index_t index; + typename CONFIG_T::distance_t distance; + + // edge_weight_t is ap_ufixed with 0 iwidth -> let index 0 be a saturated version of 1 + edge_weights_table[0] = ap_ufixed(1.); + + for (unsigned iw = 1; iw < table_size; ++iw) { + index = iw; + distance.range(CONFIG_T::distance_width - 1, 0) = index.range(CONFIG_T::distance_width - 1, 0); + edge_weights_table[iw] = hls::exp(-distance * distance); + } +} + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. / table_size; + + typename CONFIG_T::distance_t v = -32.; + for (unsigned iw = 0; iw < table_size; ++iw) { + edge_weights_table[iw] = std::exp(-v * v); + v += step; + } +} + +template +inline typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type +get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ap_uint index_t; + + index_t index(distance.range(CONFIG_T::distance_width - 1, 0)); + + return edge_weights_table[index]; +} + +template +inline + typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type + get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. / table_size; + + int index = (distance + 32.) 
/ step; + if (index < 0) + index = 0; + else if (index >= table_size) + index = table_size - 1; + + return edge_weights_table[index]; +} + +template typename CONFIG_T::edge_weight_t compute_edge_weight(typename CONFIG_T::distance_t distance) { + if (CONFIG_T::is_stack) { + #pragma HLS INLINE OFF + } +#ifdef __SYNTHESIS__ + typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + // unsigned const reshape_factor = CONFIG_T::n_aggregators * CONFIG_T::n_in_features * (CONFIG_T::n_vertices / + // CONFIG_T::reuse_factor); + // #pragma HLS ARRAY_RESHAPE variable=edge_weights_table cyclic factor=reshape_factor dim=1 + bool initialized = false; +#else + static typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + static bool initialized = false; +#endif + if (not initialized) { + initialize_edge_weights_table(edge_weights_table); + initialized = true; + } + + return get_edge_weight(distance, edge_weights_table); +} + +template +inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + #pragma HLS INLINE + return dividend >> exponent; +} + +template +inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + #pragma HLS INLINE + return dividend / std::pow(2., exponent); +} + +template struct Means { + typedef E edge_weight_t; + + edge_weight_t edge_weight_mean[CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t weighted_feature_mean[CONFIG_T::n_aggregators * CONFIG_T::n_in_features]; + + Means() { + #pragma HLS INLINE + #pragma HLS ARRAY_PARTITION variable=edge_weight_mean complete + #pragma HLS ARRAY_PARTITION variable=weighted_feature_mean complete + #pragma HLS UNROLL region + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] = 0.; + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + weighted_feature_mean[iax] = 0.; + } + } + } + + void set_weight(unsigned, edge_weight_t const &) { + #pragma HLS INLINE + } + + void add_means_normalized(Means const &local) { + #pragma HLS INLINE + // Always called within a pipelined region - no UNROLL needed + + unsigned const log2_unroll_factor = CONFIG_T::n_vertices_width - CONFIG_T::log2_reuse_factor; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] += normalize_log2(local.edge_weight_mean[ia], log2_unroll_factor); + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + weighted_feature_mean[iax] += normalize_log2(local.weighted_feature_mean[iax], log2_unroll_factor); + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + #pragma HLS INLINE + #pragma HLS UNROLL region + + // accum comes divided by unroll factor + typename T::norm_t nvtx_norm = (T::n_vertices / T::reuse_factor) / nvtx; + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + edge_weight_mean[ia] = accum.edge_weight_mean[ia] * nvtx_norm; + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = accum.weighted_feature_mean[iax] * nvtx_norm; + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + #pragma 
HLS INLINE + #pragma HLS UNROLL region + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + + edge_weight_mean[ia] = normalize_log2(accum.edge_weight_mean[ia], T::log2_reuse_factor); + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = normalize_log2(accum.weighted_feature_mean[iax], T::log2_reuse_factor); + } + } + } +}; + +template struct WeightsAndMeans : public Means { + typedef E edge_weight_t; + + edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + + WeightsAndMeans() : Means() { + #pragma HLS INLINE + unsigned const reshape_factor = CONFIG_T::n_aggregators * (CONFIG_T::n_vertices / CONFIG_T::reuse_factor); + #pragma HLS ARRAY_PARTITION variable=edge_weights cyclic factor=reshape_factor + } + + void set_weight(unsigned iva, edge_weight_t const &weight) { + #pragma HLS INLINE + edge_weights[iva] = weight; + } +}; + +template struct OutputBiasNormalizer; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t const (&output_biases)[CONFIG_T::n_out_features]; + + OutputBiasNormalizer(nvtx_T const) : output_biases{CONFIG_T::output_transform_biases} { + #pragma HLS INLINE + } +}; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t output_biases[CONFIG_T::n_out_features]; + + OutputBiasNormalizer(nvtx_T const nvtx) { + #pragma HLS ARRAY_PARTITION variable=output_biases complete + #pragma HLS UNROLL region + + // Cannot add a loop label here due to a Vivado HLS bug, apparently + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t bias = CONFIG_T::output_transform_biases[io]; + bias *= nvtx; + output_biases[io] = normalize_log2(bias, CONFIG_T::n_vertices_width); + } + } +}; + +template struct InputDataGetter { + typedef data_T data_t; + + data_T const *dataref; + + InputDataGetter(data_T const *d) : dataref{d} { + #pragma HLS INLINE + } + data_T const &get(unsigned iv, unsigned ix) const { + #pragma HLS INLINE + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + return dataref[ivx]; + } +}; + +template struct SingleVertexDataGetter { + typedef data_T data_t; + + data_T const (&dataref)[CONFIG_T::n_in_features]; + + SingleVertexDataGetter(data_T const (&d)[CONFIG_T::n_in_features]) : dataref{d} { + #pragma HLS INLINE + } + data_T const &get(unsigned, unsigned ix) const { + #pragma HLS INLINE + return dataref[ix]; + } +}; + +template struct OutputResSetter { + typedef res_T res_t; + + res_T *resref; + + OutputResSetter(res_T *r) : resref{r} { + #pragma HLS INLINE + } + void set(unsigned iv, unsigned io, res_T const &acc) { + #pragma HLS INLINE + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + resref[ivo] = acc; + } +}; + +template struct SingleVertexResSetter { + typedef res_T res_t; + + res_T (&resref)[CONFIG_T::n_out_features]; + + SingleVertexResSetter(res_T (&r)[CONFIG_T::n_out_features]) : resref{r} { + #pragma HLS INLINE + } + void set(unsigned, unsigned io, res_T const &acc) { + #pragma HLS INLINE + resref[io] = acc; + } +}; + +template +inline void compute_weights_aggregates(data_getter_T const &data_getter, unsigned iv, arrays_local_T &arrays_local, + arrays_T &arrays) { + #pragma HLS INLINE + +Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename CONFIG_T::distance_t distance = 
CONFIG_T::aggregator_distance_biases[ia]; + + InFeatures1: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename CONFIG_T::distance_t incr = data_getter.get(iv, ix) * CONFIG_T::aggregator_distance_weights[iax]; + + distance += incr; + } + + typename CONFIG_T::edge_weight_t edge_weight = + garnet_utils::compute_edge_weight(distance); + + arrays_local.edge_weight_mean[ia] += edge_weight; + + InFeatures2: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename data_getter_T::data_t incr = data_getter.get(iv, ix) * edge_weight; + + arrays_local.weighted_feature_mean[iax] += incr; + } + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + arrays.set_weight(iva, edge_weight); + } +} + +template +inline typename CONFIG_T::aggr_t compute_output_base_core(arrays_T const &arrays, unsigned io, unsigned ia) { + #pragma HLS INLINE + #pragma HLS UNROLL region + + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + typename CONFIG_T::aggr_t aggr = arrays.edge_weight_mean[ia] * CONFIG_T::input_transform_biases[ioa]; + +InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ioax = ioa * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + aggr += arrays.weighted_feature_mean[iax] * CONFIG_T::input_transform_weights[ioax]; + } + + return aggr; +} + +template +inline void compute_output_base(arrays_T const &arrays, + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]) { + #pragma HLS INLINE + #pragma HLS UNROLL region + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + output_base[ioa] = compute_output_base_core(arrays, io, ia); + } + } +} + +template +inline void +compute_vertex_output(arrays_T const &arrays, unsigned iv, + typename CONFIG_T::aggr_t const output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators], + res_setter_T &res_setter) { + #pragma HLS INLINE + + typename arrays_T::edge_weight_t edge_weights[CONFIG_T::n_aggregators]; + #pragma HLS ARRAY_PARTITION variable=edge_weights complete + +Aggregators1: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + edge_weights[ia] = arrays.edge_weights[iva]; + } + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename res_setter_T::res_t acc = CONFIG_T::output_transform_biases[io]; + + Aggregators2: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename res_setter_T::res_t incr = edge_weights[ia] * output_base[ioa]; + acc += incr; + } + + res_setter.set(iv, io, acc); + } +} + +template +void aggregate(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx, arrays_T &arrays) { + InputDataGetter data_getter(data); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + #pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + 
ir; + + if (iv == nvtx) + break; + + compute_weights_aggregates(data_getter, iv, means_local, arrays); + } + + means_accum.add_means_normalized(means_local); + } + + arrays.set_means_normalized(nvtx, means_accum); +} + +template +void distribute(nvtx_T const nvtx, arrays_T const &arrays, res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + OutputResSetter res_setter(res); + + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]; + #pragma HLS ARRAY_PARTITION variable=output_base complete + + compute_output_base(arrays, output_base); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + #pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + compute_vertex_output(arrays, iv, output_base, res_setter); + } + } +} + +template +void set_output(output_biases_T const &output_transform_biases, arrays_T const &arrays, + res_T res[CONFIG_T::n_out_features]) { + #pragma HLS PIPELINE + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + res_T acc = output_transform_biases.output_biases[io]; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename CONFIG_T::aggr_t aggr = compute_output_base_core(arrays, io, ia); + + acc += arrays.edge_weight_mean[ia] * aggr; + } + + res[io] = acc; + } +} + +template +void distribute_aggregate(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, current_arrays_T ¤t_arrays) { + typedef typename prev_layer_t::output_t data_T; + + typename prev_layer_t::aggr_t prev_output_base[prev_layer_t::n_out_features * prev_layer_t::n_aggregators]; + #pragma HLS ARRAY_PARTITION variable=prev_output_base complete + + compute_output_base(prev_arrays, prev_output_base); + + unsigned const unroll_factor = current_layer_t::n_vertices >> current_layer_t::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < current_layer_t::reuse_factor; ++ivv) { + #pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + data_T data[prev_layer_t::n_out_features]; + #pragma HLS ARRAY_PARTITION variable=data complete + + SingleVertexResSetter res_setter(data); + + compute_vertex_output(prev_arrays, iv, prev_output_base, res_setter); + + SingleVertexDataGetter data_getter(data); + + compute_weights_aggregates(data_getter, iv, means_local, current_arrays); + } + + means_accum.add_means_normalized(means_local); + } + + current_arrays.set_means_normalized(nvtx, means_accum); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + #pragma HLS INLINE + + distribute_aggregate(nvtx, prev_arrays, last_arrays); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + #pragma HLS INLINE + + WeightsAndMeans current_arrays; + + distribute_aggregate(nvtx, prev_arrays, current_arrays); + + sublayer(nvtx, current_arrays, last_arrays); +} +} // namespace garnet_utils + +struct garnet_config { + // Layer specs + static const unsigned 
n_vertices_width = 8; + static const unsigned n_vertices = (1 << n_vertices_width); + static const unsigned n_in_features = 4; + static const unsigned n_propagate = 4; + static const unsigned n_aggregators = 4; + static const unsigned n_out_features = 4; + static const unsigned distance_width = 12; + + // Internal data type definitions + typedef float input_transform_weights_t; + typedef float input_transform_biases_t; + typedef float output_transform_weights_t; + typedef float output_transform_biases_t; + typedef float aggregator_distance_weights_t; + typedef float aggregator_distance_biases_t; + + typedef float norm_t; + typedef float distance_t; + typedef float edge_weight_t; + typedef float edge_weight_aggr_t; + typedef float aggr_t; + typedef float output_t; + + /* static const input_transform_weights_t (&input_transform_weights)[n_out_features * n_aggregators * n_in_features]; */ + /* static const input_transform_biases_t (&input_transform_biases)[n_out_features * n_aggregators]; */ + /* static const aggregator_distance_weights_t (&aggregator_distance_weights)[n_aggregators * n_in_features]; */ + /* static const aggregator_distance_biases_t (&aggregator_distance_biases)[n_aggregators]; */ + /* static const output_transform_biases_t (&output_transform_biases)[n_out_features]; */ + + enum OutputCollapse { no_collapse, collapse_mean, collapse_max }; + + static const unsigned output_collapse = no_collapse; + + static const bool mean_by_nvert = false; + static const bool is_stack = false; + + // Optimization specs + static const unsigned reuse_factor = 64; + static const unsigned log2_reuse_factor = 6; +}; + +// vertices -> vertices +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + garnet_utils::WeightsAndMeans arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::distribute(nvtx[0], arrays, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + garnet_utils::Means arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays, res); +} + +// vertices -> vertices +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + typedef typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::distribute(nvtx[0], arrays_last, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + 
typedef typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays_last, res); +} + +/* Reference (dumb) implementation returning (Vertices, Features) */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + typename CONFIG_T::edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t propagated_features[CONFIG_T::n_vertices * CONFIG_T::n_propagate]; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + propagated_features[ivp] = CONFIG_T::input_transform_biases[ip]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const ipx = ip * CONFIG_T::n_in_features + ix; + + propagated_features[ivp] += data[ivx] * CONFIG_T::input_transform_weights[ipx]; + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t distance = CONFIG_T::aggregator_distance_biases[ia]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + distance += data[ivx] * CONFIG_T::aggregator_distance_weights[iax]; + } + + edge_weights[iva] = garnet_utils::compute_edge_weight(distance); + } + } + + typename CONFIG_T::aggr_t aggregated_features[CONFIG_T::n_aggregators * CONFIG_T::n_propagate]; + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] += edge_weights[iva] * propagated_features[ivp]; + } + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + if (CONFIG_T::mean_by_nvert) + aggregated_features[iap] /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + aggregated_features[iap] /= CONFIG_T::n_vertices; + } + } + } + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + typename CONFIG_T::aggr_t acc = CONFIG_T::output_transform_biases[io]; + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t aggr = 0.; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia 
* CONFIG_T::n_propagate + ip; + unsigned const ioap = ioa * CONFIG_T::n_propagate + ip; + + aggr += CONFIG_T::output_transform_weights[ioap] * aggregated_features[iap]; + } + + acc += edge_weights[iva] * aggr; + } + + res[ivo] = acc; + } + } +} + +/* Reference (dumb) implementation returning (Features) - output averaged over vertices already */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + typename CONFIG_T::aggr_t vertex_res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]; + + garnet_ref(data, nvtx, vertex_res); + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t acc = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + acc += vertex_res[ivo]; + } + + if (CONFIG_T::mean_by_nvert) + acc /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + acc /= CONFIG_T::n_vertices; + } + + res[io] = acc; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index b8c2a48d19..1a3a3d28b5 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -1,382 +1,382 @@ -#ifndef NNET_HELPERS_H -#define NNET_HELPERS_H - -#include "hls_stream.h" -#include -#include -#include -#include -#include -#include -#include -#include - -namespace nnet { - -#ifndef __SYNTHESIS__ - -#ifndef WEIGHTS_DIR -#define WEIGHTS_DIR "weights" -#endif - -template void load_weights_from_txt(T *w, const char *fname) { - - std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); - std::ifstream infile(full_path.c_str(), std::ios::binary); - - if (infile.fail()) { - std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; - exit(1); - } - - std::string line; - if (std::getline(infile, line)) { - std::istringstream iss(line); - std::string token; - - size_t i = 0; - while (std::getline(iss, token, ',')) { - std::istringstream(token) >> w[i]; - i++; - } - - if (SIZE != i) { - std::cerr << "ERROR: Expected " << SIZE << " values"; - std::cerr << " but read only " << i << " values" << std::endl; - } - } -} - -template void load_compressed_weights_from_txt(T *w, const char *fname) { - - std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); - std::ifstream infile(full_path.c_str(), std::ios::binary); - - if (infile.fail()) { - std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; - exit(1); - } - - std::string line; - if (std::getline(infile, line)) { - std::istringstream iss(line); - std::string token; - std::string extra_chars = "} "; - - size_t i = 0; - while (std::getline(iss, token, '{')) { - if (token.length() == 0) { - continue; - } - for (char c : extra_chars) { - token.erase(std::remove(token.begin(), token.end(), c), token.end()); - } - if (token.back() == ',') { - token.erase(token.end() - 1); - } - - std::replace(token.begin(), token.end(), ',', ' '); - std::istringstream structss(token); - - if (!(structss >> w[i].row_index >> w[i].col_index >> w[i].weight)) { - std::cerr << "ERROR: Unable to parse file " << std::string(fname); - exit(1); - } - i++; - } - - if (SIZE != i) { - std::cerr << "ERROR: Expected " << SIZE << " values"; - std::cerr << " but read only " 
<< i << " values" << std::endl; - } - } -} - -template void load_exponent_weights_from_txt(T *w, const char *fname) { - - std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); - std::ifstream infile(full_path.c_str(), std::ios::binary); - - if (infile.fail()) { - std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; - exit(1); - } - - std::string line; - if (std::getline(infile, line)) { - std::istringstream iss(line); - std::string token; - std::string extra_chars = "} "; - - size_t i = 0; - while (std::getline(iss, token, '{')) { - if (token.length() == 0) { - continue; - } - for (char c : extra_chars) { - token.erase(std::remove(token.begin(), token.end(), c), token.end()); - } - if (token.back() == ',') { - token.erase(token.end() - 1); - } - - std::replace(token.begin(), token.end(), ',', ' '); - std::istringstream structss(token); - - if (!(structss >> w[i].sign >> w[i].weight)) { - std::cerr << "ERROR: Unable to parse file " << std::string(fname); - exit(1); - } - i++; - } - - if (SIZE != i) { - std::cerr << "ERROR: Expected " << SIZE << " values"; - std::cerr << " but read only " << i << " values" << std::endl; - } - } -} -template void convert_data(srcType *src, dstType *dst) { - for (size_t i = 0; i < SIZE; i++) { - dst[i] = dstType(src[i]); - } -} - -template void convert_data(srcType *src, hls::stream &dst) { - for (size_t i = 0; i < SIZE / dstType::size; i++) { - dstType ctype; - for (size_t j = 0; j < dstType::size; j++) { - ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); - } - dst.write(ctype); - } -} - -template void convert_data(hls::stream &src, dstType *dst) { - for (size_t i = 0; i < SIZE / srcType::size; i++) { - srcType ctype = src.read(); - for (size_t j = 0; j < srcType::size; j++) { - dst[i * srcType::size + j] = dstType(ctype[j]); - } - } -} - -extern bool trace_enabled; -extern std::map *trace_outputs; -extern size_t trace_type_size; - -template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { - for (int i = 0; i < layer_size; i++) { - ptr[i] = save_T(data[i]); - } -} - -template void save_output_array(hls::stream &data, save_T *ptr, size_t layer_size) { - for (size_t i = 0; i < layer_size / data_T::size; i++) { - data_T ctype = data.read(); - for (size_t j = 0; j < data_T::size; j++) { - ptr[i * data_T::size + j] = save_T(ctype[j]); - } - data.write(ctype); - } -} - -// We don't want to include save_T in this function because it will be inserted into myproject.cpp -// so a workaround with element size is used -template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { - if (!trace_enabled) - return; - - if (trace_outputs) { - if (trace_outputs->count(layer_name) > 0) { - if (trace_type_size == 4) { - save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); - } else if (trace_type_size == 8) { - save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); - } else { - std::cout << "Unknown trace type!" << std::endl; - } - } else { - std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
<< std::endl; - } - } else { - std::ostringstream filename; - filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data - std::fstream out; - out.open(filename.str(), std::ios::app); - assert(out.is_open()); - for (int i = 0; i < layer_size; i++) { - out << float(data[i]) << " "; // We don't care about precision in text files - } - out << std::endl; - out.close(); - } -} - -template void save_layer_output(hls::stream &data, const char *layer_name, size_t layer_size) { - if (!trace_enabled) - return; - - if (trace_outputs) { - if (trace_outputs->count(layer_name) > 0) { - if (trace_type_size == 4) { - save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); - } else if (trace_type_size == 8) { - save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); - } else { - std::cout << "Unknown trace type!" << std::endl; - } - } else { - std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; - } - } else { - std::ostringstream filename; - filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data - std::fstream out; - out.open(filename.str(), std::ios::app); - assert(out.is_open()); - for (size_t i = 0; i < layer_size / data_T::size; i++) { - data_T ctype = data.read(); - for (size_t j = 0; j < data_T::size; j++) { - out << float(ctype[j]) << " "; // We don't care about precision in text files - } - data.write(ctype); - } - out << std::endl; - out.close(); - } -} - -#endif - -template void copy_data(std::vector src, dst_T dst[SIZE]) { - typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; - typename std::vector::const_iterator in_end = in_begin + SIZE; - std::copy(in_begin, in_end, dst); -} - -template -void copy_data(std::vector src, hls::stream &dst) { - typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; - typename std::vector::const_iterator in_end = in_begin + SIZE; - - size_t i_pack = 0; - dst_T dst_pack; - for (typename std::vector::const_iterator i = in_begin; i != in_end; ++i) { - dst_pack[i_pack++] = typename dst_T::value_type(*i); - if (i_pack == dst_T::size) { - i_pack = 0; - dst.write(dst_pack); - } - } -} - -template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { - for (auto i = 0; i < SIZE; i++) - if (i == SIZE - 1) { - dst[i].data = src[i]; - dst[i].last = 1; - } else { - dst[i].data = src[i]; - dst[i].last = 0; - } -} - -template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { - for (int i = 0; i < SIZE; i++) { - out << result[i] << " "; - } - out << std::endl; -} - -template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { - for (int i = 0; i < SIZE / res_T::size; i++) { - res_T res_pack = result.read(); - for (int j = 0; j < res_T::size; j++) { - out << res_pack[j] << " "; - } - if (keep) - result.write(res_pack); - } - out << std::endl; -} - -template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } - -template void fill_zero(hls::stream &data) { - for (int i = 0; i < SIZE / data_T::size; i++) { - data_T data_pack; - for (int j = 0; j < data_T::size; j++) { - data_pack[j] = 0.; - } - data.write(data_pack); - } -} - -template int read_file_1D(const char *filename, dataType data[nrows]) { - FILE *fp; - fp = fopen(filename, "r"); - if (fp == 0) { - return -1; - } - // Read data from file - float newval; - for (int ii = 0; ii < nrows; ii++) { - if (fscanf(fp, "%f\n", &newval) != 
0) { - data[ii] = newval; - } else { - return -2; - } - } - fclose(fp); - return 0; -} - -template -int read_file_2D(const char *filename, dataType data[nrows][ncols]) { - FILE *fp; - fp = fopen(filename, "r"); - if (fp == 0) { - return -1; - } - // Read data from file - float newval; - for (int ii = 0; ii < nrows; ii++) { - for (int jj = 0; jj < ncols; jj++) { - if (fscanf(fp, "%f\n", &newval) != 0) { - data[ii][jj] = newval; - } else { - return -2; - } - } - } - fclose(fp); - return 0; -} - -template void change_type(hls::stream &in, hls::stream &out) { - in_T datareg; - hls::stream input_trunc; - for (int ii = 0; ii < N_IN; ii++) { - out << (out_T)in.read(); - } -} - -template void hls_stream_debug(hls::stream &data, hls::stream &res) { - data_T datareg; - for (int ii = 0; ii < N_IN; ii++) { - datareg = data.read(); - std::cout << "[" << ii << "]: " << datareg << std::endl; - res << datareg; - } -} - -constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } - -constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } - -constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } - -} // namespace nnet - -#endif +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include "hls_stream.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +#ifndef __SYNTHESIS__ + +#ifndef WEIGHTS_DIR +#define WEIGHTS_DIR "weights" +#endif + +template void load_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + + size_t i = 0; + while (std::getline(iss, token, ',')) { + std::istringstream(token) >> w[i]; + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_compressed_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].row_index >> w[i].col_index >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_exponent_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); 
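+    // Expected file format (an assumption inferred from the parsing loop below;
+    // not documented upstream): a single line of comma-separated, brace-delimited
+    // {sign, weight} pairs, e.g. "{1,0.125},{0,0.5}", one pair per element of w.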
+ + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].sign >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = dstType(src[i]); + } +} + +template void convert_data(srcType *src, hls::stream &dst) { + for (size_t i = 0; i < SIZE / dstType::size; i++) { + dstType ctype; + for (size_t j = 0; j < dstType::size; j++) { + ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); + } + dst.write(ctype); + } +} + +template void convert_data(hls::stream &src, dstType *dst) { + for (size_t i = 0; i < SIZE / srcType::size; i++) { + srcType ctype = src.read(); + for (size_t j = 0; j < srcType::size; j++) { + dst[i * srcType::size + j] = dstType(ctype[j]); + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = save_T(data[i]); + } +} + +template void save_output_array(hls::stream &data, save_T *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = save_T(ctype[j]); + } + data.write(ctype); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
<< std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << float(data[i]) << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +template void save_layer_output(hls::stream &data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + out << float(ctype[j]) << " "; // We don't care about precision in text files + } + data.write(ctype); + } + out << std::endl; + out.close(); + } +} + +#endif + +template void copy_data(std::vector src, dst_T dst[SIZE]) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + std::copy(in_begin, in_end, dst); +} + +template +void copy_data(std::vector src, hls::stream &dst) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + + size_t i_pack = 0; + dst_T dst_pack; + for (typename std::vector::const_iterator i = in_begin; i != in_end; ++i) { + dst_pack[i_pack++] = typename dst_T::value_type(*i); + if (i_pack == dst_T::size) { + i_pack = 0; + dst.write(dst_pack); + } + } +} + +template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { + for (auto i = 0; i < SIZE; i++) + if (i == SIZE - 1) { + dst[i].data = src[i]; + dst[i].last = 1; + } else { + dst[i].data = src[i]; + dst[i].last = 0; + } +} + +template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE; i++) { + out << result[i] << " "; + } + out << std::endl; +} + +template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE / res_T::size; i++) { + res_T res_pack = result.read(); + for (int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + if (keep) + result.write(res_pack); + } + out << std::endl; +} + +template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } + +template void fill_zero(hls::stream &data) { + for (int i = 0; i < SIZE / data_T::size; i++) { + data_T data_pack; + for (int j = 0; j < data_T::size; j++) { + data_pack[j] = 0.; + } + data.write(data_pack); + } +} + +template int read_file_1D(const char *filename, dataType data[nrows]) { + FILE *fp; + fp = fopen(filename, "r"); + if (fp == 0) { + return -1; + } + // Read data from file + float newval; + for (int ii = 0; ii < nrows; ii++) { + if (fscanf(fp, "%f\n", &newval) != 
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h
index ac94f22235..afed57802b 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h
@@ -1,330 +1,404 @@
-//
-// rfnoc-hls-neuralnet: Vivado HLS code for neural-net building blocks
-//
-// Copyright (C) 2017 EJ Kreinar
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program.  If not, see <http://www.gnu.org/licenses/>.
-//
-
-#ifndef NNET_LAYERNORM_H_
-#define NNET_LAYERNORM_H_
-
-#include "nnet_common.h"
-#include "nnet_dense.h"
-#include "hls_stream.h"
-#include <math.h>
-#include <iostream>
-
-#include "hls_math.h"
-// #include "ap_fixed.h"
-
-namespace nnet {
-
-struct layernorm_config
-{
-    // Internal data type definitions
-    typedef float bias_t;
-    typedef float scale_t;
-
-    // Layer Sizes
-    static const unsigned n_in = 20;
-    static const unsigned seq_len = 4;
-
-    // Resource reuse info
-    static const unsigned io_type = io_parallel;
-    static const unsigned reuse_factor = 1;
-    static const bool store_weights_in_bram = false;
-    static const unsigned n_zeros = 0;
-
-    template<class x_T, class y_T>
-    using product = nnet::product::mult<x_T, y_T>;
-};
-
-template<typename CONFIG_T, int N_TABLE>
-void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE])
-{
-    float inv_range = CONFIG_T::table_range;
-    // Inversion function:
-    //   result = 1/sqrt(x)
-    for (int ii = 0; ii < N_TABLE; ii++) {
-        // First, convert from table index to X-value (signed 8-bit, range 0 to +0.01)
-        float in_val = inv_range*ii/float(N_TABLE);
-        // Next, compute lookup table function
-        if (in_val > 0.0) table_out[ii] = 1.0/sqrt(in_val);
-        else table_out[ii] = 0.0;
-    }
-}
-
-template<typename CONFIG_T, int N_TABLE>
-void init_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE])
-{
-    float inv_range = 0.5; /// if not accurate increase this
-    // Square-root function:
-    //   result = sqrt(x)
-    for (int ii = 0; ii < N_TABLE; ii++) {
-        // First, convert from table index to X-value (signed 8-bit, range 0 to +0.01)
-        float in_val = inv_range*ii/float(N_TABLE);
-        // Next, compute lookup table function
-        if (in_val > 0.0) table_out[ii] = sqrt(in_val);
-        else table_out[ii] = 0.0;
-    }
-}
-
-
-
-// template<class data_T, class res_T, typename CONFIG_T>
-// void layernorm_1d(
-//     data_T    data[CONFIG_T::n_in/CONFIG_T::seq_len],
-//     res_T     res[CONFIG_T::n_in/CONFIG_T::seq_len],
-//     typename CONFIG_T::scale_t  scale[CONFIG_T::n_in/CONFIG_T::seq_len],
-//     typename CONFIG_T::bias_t   bias[CONFIG_T::n_in/CONFIG_T::seq_len]
-// )
-// {
-// #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-// #pragma HLS ARRAY_PARTITION variable=data complete
-// #pragma HLS ARRAY_PARTITION variable=res complete
-
-// int inv_range_inv = (int) 1/ 0.5;
-// typename CONFIG_T::table_t sqr = 0;
-// #ifdef __HLS_SYN__
-//     bool initialized = false;
-//     typename CONFIG_T::table_t sqr_table[CONFIG_T::table_size];
-// #else
-//     static bool initialized = false;
-//     static typename CONFIG_T::table_t sqr_table[CONFIG_T::table_size];
-// #endif
-//     if (!initialized) {
-//         init_sqr_table<CONFIG_T, CONFIG_T::table_size>(sqr_table);
-//         initialized = true;
-//     }
-
-//     static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len;
-//     data_T sum_cache = 0;
-//     data_T sum_cache2 = 0;
-//     data_T var, mean, diff, inv_sqr;
-//     data_T data_diff[dim];
-//     data_T data_norm[dim];
-
-//     #pragma HLS ARRAY_PARTITION variable=data_diff complete
-//     #pragma HLS ARRAY_PARTITION variable=data_diff complete
-
-//     const data_T k_inv = 1.0/dim;
-//     for (int i = 0; i < dim; ++i){
-//         sum_cache += data[i];
-//     }
-//     mean = CONFIG_T::template product<data_T, data_T>::product(sum_cache, k_inv);
-
-//     for (int i = 0; i < dim; ++i){
-//         data_diff[i] = data[i] - mean;
-//         diff = data_diff[i]*data_diff[i];
-//         sum_cache2 += diff;
-//     }
-//     var = CONFIG_T::template product<data_T, data_T>::product(sum_cache2, k_inv);
-
-//     int index = var*(CONFIG_T::table_size)*inv_range_inv;
-//     if (index < 0)   index = 0;
-//     if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1;
-//     sqr = (typename CONFIG_T::table_t) sqr_table[index];
-//     inv_sqr = 1 / sqr;
-
-//     for (int i = 0; i < dim; ++i){
-//         res[i] = data_diff[i] * inv_sqr * scale[i] + bias[i];
-//     }
-
-// }
-
-
-
-template<class data_T, class res_T, typename CONFIG_T>
-void layernorm_1d(
-    data_T    data[CONFIG_T::n_in/CONFIG_T::seq_len],
-    res_T     res[CONFIG_T::n_in/CONFIG_T::seq_len],
-    typename CONFIG_T::scale_t  scale[CONFIG_T::n_in/CONFIG_T::seq_len],
-    typename CONFIG_T::bias_t   bias[CONFIG_T::n_in/CONFIG_T::seq_len]
-)
-{
-#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-#pragma HLS ARRAY_PARTITION variable=data complete
-#pragma HLS ARRAY_PARTITION variable=res complete
-
-int inv_range_inv = (int) 1/ CONFIG_T::table_range;
-typename CONFIG_T::table_t deno_inver = 0;
-#ifdef __HLS_SYN__
-    bool initialized = false;
-    typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size];
-#else
-    static bool initialized = false;
-    static typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size];
-#endif
-    if (!initialized) {
-        init_invert_sqr_table<CONFIG_T, CONFIG_T::table_size>(invert_sqr_table);
-        initialized = true;
-    }
-
-    static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len;
-    data_T sum_cache = 0;
-    data_T sum_cache2 = 0;
-    data_T var, mean, diff;
-    data_T data_diff[dim];
-    data_T data_norm[dim];
-
-    #pragma HLS ARRAY_PARTITION variable=data_diff complete
-    #pragma HLS ARRAY_PARTITION variable=data_diff complete
-
-    const data_T k_inv = 1.0/dim;
-    for (int i = 0; i < dim; ++i){
-        sum_cache += data[i];
-    }
-    mean = CONFIG_T::template product<data_T, data_T>::product(sum_cache, k_inv);
-    // std::cout << "mean: " << std::endl;
-    // std::cout << mean << std::endl;
-
-    for (int i = 0; i < dim; ++i){
-        data_diff[i] = data[i] - mean;
-        diff = data_diff[i]*data_diff[i];
-        sum_cache2 += diff;
-        // std::cout << "data_diff: " << std::endl;
-        // std::cout << data_diff[i] << std::endl;
-        // std::cout << " " << std::endl;
-    }
-    var = CONFIG_T::template product<data_T, data_T>::product(sum_cache2, k_inv);
-    // std::cout << "var: " << std::endl;
-    // std::cout << var << std::endl;
-    // std::cout << " " << std::endl;
-
-    int index = var*(CONFIG_T::table_size)*inv_range_inv;
-    if (CONFIG_T::table_range > 1) index = var*(CONFIG_T::table_size)/ (int)CONFIG_T::table_range;
-
-    if (index < 0)   index = 0;
-    if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1;
-    deno_inver = (typename CONFIG_T::table_t) invert_sqr_table[index];
-    // std::cout << "deno_inver: " << std::endl;
-    // std::cout << deno_inver << std::endl;
-    // std::cout << " " << std::endl;
-
-    for (int i = 0; i < dim; ++i){
-        res[i] = data_diff[i] * deno_inver * scale[i] + bias[i];
-    }
-
-}
-
-
-// template<class data_T, class res_T, typename CONFIG_T>
-// void layernorm_1d(
-//     data_T    data[CONFIG_T::n_in/CONFIG_T::seq_len],
-//     res_T     res[CONFIG_T::n_in/CONFIG_T::seq_len],
-//     typename CONFIG_T::scale_t  scale[CONFIG_T::n_in/CONFIG_T::seq_len],
-//     typename CONFIG_T::bias_t   bias[CONFIG_T::n_in/CONFIG_T::seq_len]
-// )
-// {
-// #pragma HLS PIPELINE
-// #pragma HLS ARRAY_PARTITION variable=data complete
-// #pragma HLS ARRAY_PARTITION variable=res complete
-
-//     static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len;
-//     data_T sum_cache = 0;
-//     data_T sum_cache2 = 0;
-//     data_T var, mean, diff_squares, diff, var_eps_inv;
-//     data_T data_diff[dim];
-//     float sqrt_var_eps;
-
-//     #pragma HLS ARRAY_PARTITION variable=data_diff complete
-
-//     const data_T k_inv = 1.0/dim;
-//     for (int i = 0; i < dim; ++i){
-//         sum_cache += data[i];
-//     }
-//     mean = CONFIG_T::template product<data_T, data_T>::product(sum_cache, k_inv);
-//     // std::cout << "mean: " << std::endl;
-//     // std::cout << mean << std::endl;
-
-//     for (int i = 0; i < dim; ++i){
-//         diff = data[i] - mean;
-//         data_diff[i] = diff;
-//         diff_squares = diff*diff;
-//         sum_cache2 += diff_squares;
-//         // std::cout << "data_diff: " << std::endl;
-//         // std::cout << data_diff[i] << std::endl;
-//         // std::cout << " " << std::endl;
-//     }
-//     var = CONFIG_T::template product<data_T, data_T>::product(sum_cache2, k_inv);
-//     float var_f = (float)var;
-//     // std::cout << "var: ";
-//     // std::cout << var << std::endl;
-
-//     sqrt_var_eps = sqrt(var_f);
-//     var_eps_inv = (data_T) (1 / (sqrt_var_eps));
-//     // std::cout << "var_eps_inv: " << std::endl;
-//     // std::cout << var_eps_inv << std::endl;
-//     // std::cout << " " << std::endl;
-
-//     for (int i = 0; i < dim; ++i){
-//         res[i] = data_diff[i] * var_eps_inv * scale[i] + bias[i];
-//     }
-
-// }
-
-
-
-
-template<class data_T, class res_T, typename CONFIG_T>
-void layernormalize(
-    data_T    data[CONFIG_T::n_in],
-    res_T     res[CONFIG_T::n_in],
-    typename CONFIG_T::scale_t  scale[CONFIG_T::n_in/CONFIG_T::seq_len],
-    typename CONFIG_T::bias_t   bias[CONFIG_T::n_in/CONFIG_T::seq_len]
-)
-{
-    static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len;
-    data_T in_val[dim];
-    data_T outval[dim];
-    // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
-    #pragma HLS function_instantiate variable=scale,bias
-
-    // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes
-    #pragma HLS ARRAY_PARTITION variable=scale complete
-    #pragma HLS ARRAY_PARTITION variable=bias complete
-    #pragma HLS ARRAY_PARTITION variable=in_val complete
-    #pragma HLS ARRAY_PARTITION variable=outval complete
-
-    // std::cout << "one seq norm layer: " << std::endl;
-    // std::cout << " " << std::endl;
-
-    for (int j=0; j < CONFIG_T::seq_len; ++j){
-        #pragma HLS PIPELINE
-        load: for (int i=0; i < dim; ++i){
-            #pragma HLS UNROLL
-            in_val[i] = data[j*dim+i];
-        }
-        layernorm_1d<data_T, res_T, CONFIG_T>(in_val, outval, scale, bias);
-        store: for (int i=0; i < dim; ++i){
-            #pragma HLS UNROLL
-            res[j*dim+i] = outval[i];
-        }
-    }
-
-    // std::cout << "out Norm: " << std::endl;
-    // nnet::print_result<res_T, CONFIG_T::n_in>(res, std::cout);
-    // std::cout << " " << std::endl;
-
-}
-
-}
-
-#endif
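Between the removed implementation above and the rewritten header below, it is worth pinning down the reference computation this kernel implements: for each sequence step, over the feature dimension, y = (x - mean(x)) / sqrt(var(x)) * scale + bias. Note that no epsilon is added to the variance; a zero-variance frame is only saved by the lookup table returning 0 for non-positive inputs. A plain floating-point reference, useful for checking the fixed-point kernel in C simulation (the function name and use of std::vector are my own, not part of the patch):

#include <cmath>
#include <vector>

// Floating-point layer-norm reference for one sequence step; gamma/beta play
// the role of the scale/bias arrays in the HLS kernel.
std::vector<float> layernorm_ref(const std::vector<float> &x, const std::vector<float> &gamma,
                                 const std::vector<float> &beta) {
    const int dim = x.size();
    float mean = 0.f, var = 0.f;
    for (float v : x)
        mean += v / dim;
    for (float v : x)
        var += (v - mean) * (v - mean) / dim;
    const float inv = 1.f / std::sqrt(var); // the HLS kernel reads this factor from a table
    std::vector<float> y(dim);
    for (int i = 0; i < dim; ++i)
        y[i] = (x[i] - mean) * inv * gamma[i] + beta[i];
    return y;
}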
+//
+// rfnoc-hls-neuralnet: Vivado HLS code for neural-net building blocks
+//
+// Copyright (C) 2017 EJ Kreinar
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+//
+
+#ifndef NNET_LAYERNORM_H_
+#define NNET_LAYERNORM_H_
+
+#include "hls_stream.h"
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include <cmath>
+#include <iostream>
+
+#include "hls_math.h"
+// #include "ap_fixed.h"
+
+namespace nnet {
+
+struct layernorm_config {
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float scale_t;
+    typedef ap_fixed<16, 8> mean_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 20;
+    static const unsigned seq_len = 4;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0;
+
+    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+};
+
+template <typename CONFIG_T, int N_TABLE> void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) {
+    float inv_range = CONFIG_T::table_range;
+    // Inversion function:
+    //   result = 1/sqrt(x)
+    for (int ii = 0; ii < N_TABLE; ii++) {
+        // First, convert from table index to X-value (signed 8-bit, range 0 to +0.01)
+        float in_val = inv_range * ii / float(N_TABLE);
+        // Next, compute lookup table function
+        if (in_val > 0.0)
+            table_out[ii] = 1.0 / sqrt(in_val);
+        else
+            table_out[ii] = 0.0;
+    }
+}
+
+template <typename CONFIG_T, int N_TABLE> void init_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) {
+    float inv_range = 0.5; /// if not accurate increase this
+    // Square-root function:
+    //   result = sqrt(x)
+    for (int ii = 0; ii < N_TABLE; ii++) {
+        // First, convert from table index to X-value (signed 8-bit, range 0 to +0.01)
+        float in_val = inv_range * ii / float(N_TABLE);
+        // Next, compute lookup table function
+        if (in_val > 0.0)
+            table_out[ii] = sqrt(in_val);
+        else
+            table_out[ii] = 0.0;
+    }
+}
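The two init functions above bake 1/sqrt(x) (and sqrt(x)) into a table indexed linearly over [0, table_range), so the runtime index computation in layernorm_1d reduces to one multiply and a clamp. A standalone host-side model of that lookup, with illustrative table_size and table_range values (these constants are not taken from the patch):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main() {
    const int table_size = 1024;
    const float table_range = 0.5f;
    static float table[table_size];
    // Same construction as init_invert_sqr_table: entry ii holds 1/sqrt(table_range * ii / table_size)
    for (int ii = 0; ii < table_size; ii++) {
        float in_val = table_range * ii / float(table_size);
        table[ii] = in_val > 0.0f ? 1.0f / std::sqrt(in_val) : 0.0f;
    }
    float var = 0.037f; // example variance
    int index = std::min(std::max(int(var * table_size / table_range), 0), table_size - 1);
    std::printf("1/sqrt(%.3f): table %.4f vs exact %.4f\n", var, table[index], 1.0f / std::sqrt(var));
    return 0;
}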
+
+// template <class data_T, class res_T, typename CONFIG_T>
+// void layernorm_1d(
+//     data_T    data[CONFIG_T::n_in/CONFIG_T::seq_len],
+//     res_T     res[CONFIG_T::n_in/CONFIG_T::seq_len],
+//     typename CONFIG_T::scale_t  scale[CONFIG_T::n_in/CONFIG_T::seq_len],
+//     typename CONFIG_T::bias_t   bias[CONFIG_T::n_in/CONFIG_T::seq_len]
+// )
+// {
+// #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+// #pragma HLS ARRAY_PARTITION variable=data complete
+// #pragma HLS ARRAY_PARTITION variable=res complete
+
+// int inv_range_inv = (int) 1/ 0.5;
+// typename CONFIG_T::table_t sqr = 0;
+// #ifdef __HLS_SYN__
+//     bool initialized = false;
+//     typename CONFIG_T::table_t sqr_table[CONFIG_T::table_size];
+// #else
+//     static bool initialized = false;
+//     static typename CONFIG_T::table_t sqr_table[CONFIG_T::table_size];
+// #endif
+//     if (!initialized) {
+//         init_sqr_table<CONFIG_T, CONFIG_T::table_size>(sqr_table);
+//         initialized = true;
+//     }
+
+//     static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len;
+//     data_T sum_cache = 0;
+//     data_T sum_cache2 = 0;
+//     data_T var, mean, diff, inv_sqr;
+//     data_T data_diff[dim];
+//     data_T data_norm[dim];
+
+//     #pragma HLS ARRAY_PARTITION variable=data_diff complete
+//     #pragma HLS ARRAY_PARTITION variable=data_diff complete
+
+//     const data_T k_inv = 1.0/dim;
+//     for (int i = 0; i < dim; ++i){
+//         sum_cache += data[i];
+//     }
+//     mean = CONFIG_T::template product<data_T, data_T>::product(sum_cache, k_inv);
+
+//     for (int i = 0; i < dim; ++i){
+//         data_diff[i] = data[i] - mean;
+//         diff = data_diff[i]*data_diff[i];
+//         sum_cache2 += diff;
+//     }
+//     var = CONFIG_T::template product<data_T, data_T>::product(sum_cache2, k_inv);
+
+//     int index = var*(CONFIG_T::table_size)*inv_range_inv;
+//     if (index < 0)   index = 0;
+//     if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1;
+//     sqr = (typename CONFIG_T::table_t) sqr_table[index];
+//     inv_sqr = 1 / sqr;
+
+//     for (int i = 0; i < dim; ++i){
+//         res[i] = data_diff[i] * inv_sqr * scale[i] + bias[i];
+//     }
+
+// }
+
+//////////////////////
+// Dennis's version //
+//////////////////////
+template <class data_T, class res_T, typename CONFIG_T>
+void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CONFIG_T::n_in / CONFIG_T::seq_len],
+                  typename CONFIG_T::scale_t scale[CONFIG_T::n_in / CONFIG_T::seq_len],
+                  typename CONFIG_T::bias_t bias[CONFIG_T::n_in / CONFIG_T::seq_len]) {
+    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+    #pragma HLS ARRAY_PARTITION variable=data complete
+    #pragma HLS ARRAY_PARTITION variable=res complete
+    int inv_range_inv = (int)1 / CONFIG_T::table_range;
+    typename CONFIG_T::table_t deno_inver = 0;
+#ifdef __HLS_SYN__
+    bool initialized = false;
+    typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size];
+#else
+    static bool initialized = false;
+    static typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size];
+#endif
+    if (!initialized) {
+        init_invert_sqr_table<CONFIG_T, CONFIG_T::table_size>(invert_sqr_table);
+        initialized = true;
+    }
+
+    static const unsigned dim = CONFIG_T::n_in / CONFIG_T::seq_len;
+    typename CONFIG_T::mean_t sum_cache = 0;
+    typename CONFIG_T::mean_t sum_cache2 = 0;
+    typename CONFIG_T::mean_t var, mean, diff;
+    typename CONFIG_T::mean_t data_diff[dim];
+    typename CONFIG_T::mean_t data_norm[dim];
+    // data_T sum_cache = 0;
+    // data_T sum_cache2 = 0;
+    // data_T var, mean, diff;
+    //// typename CONFIG_T::mean_t mean;
+    //// typename CONFIG_T::var_t var;
+    //// typename CONFIG_T::diff_t diff;
+    // data_T data_diff[dim];
+    // data_T data_norm[dim];
+
+    #pragma HLS ARRAY_PARTITION variable=data_diff complete
+    #pragma HLS ARRAY_PARTITION variable=data_diff complete
+
+    const typename CONFIG_T::mean_t k_inv = 1.0 / dim;
+    for (int i = 0; i < dim; ++i) {
+        sum_cache += static_cast<typename CONFIG_T::mean_t>(data[i]);
+    }
+    mean = CONFIG_T::template product<typename CONFIG_T::mean_t, typename CONFIG_T::mean_t>::product(sum_cache, k_inv);
+    // std::cout << "mean: " << std::endl;
+    // std::cout << mean << std::endl;
+
+    for (int i = 0; i < dim; ++i) {
+        data_diff[i] = static_cast<typename CONFIG_T::mean_t>(data[i]) - mean;
+        diff = data_diff[i] * data_diff[i];
+        sum_cache2 += diff;
+        // std::cout << "data_diff: " << std::endl;
+        // std::cout << data_diff[i] << std::endl;
+        // std::cout << " " << std::endl;
+    }
+    var = CONFIG_T::template product<typename CONFIG_T::mean_t, typename CONFIG_T::mean_t>::product(sum_cache2, k_inv);
+    // std::cout << "var: " << std::endl;
+    // std::cout << var << std::endl;
+    // std::cout << " " << std::endl;
+
+    int index = var * (CONFIG_T::table_size)*inv_range_inv;
+    if (CONFIG_T::table_range > 1)
+        index = var * (CONFIG_T::table_size) / (int)CONFIG_T::table_range;
+
+    if (index < 0)
+        index = 0;
+    if (index > CONFIG_T::table_size - 1)
+        index = CONFIG_T::table_size - 1;
+    deno_inver = (typename CONFIG_T::table_t)invert_sqr_table[index];
+    // std::cout << "deno_inver: " << std::endl;
+    // std::cout << deno_inver << std::endl;
+    // std::cout << " " << std::endl;
+
+    // std::cout << "index: " << std::endl;
+    // std::cout << index << std::endl;
+    // std::cout << " " << std::endl;
+
+    for (int i = 0; i < dim; ++i) {
+        res[i] = data_diff[i] * deno_inver * scale[i] + bias[i];
+    }
+}
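The kernel is driven entirely by CONFIG_T. A hypothetical instantiation for four sequence steps of sixteen features follows; the field set mirrors what layernorm_1d reads, but the fixed-point widths and the input_t/result_t names are illustrative, not generated by this patch:

// Hypothetical generated config: 4 steps x 16 features, 1024-entry 1/sqrt table.
struct config_layernorm : nnet::layernorm_config {
    typedef ap_fixed<16, 6> bias_t;
    typedef ap_fixed<16, 6> scale_t;
    typedef ap_fixed<18, 8> mean_t;  // accumulation/mean/variance type
    typedef ap_fixed<18, 4> table_t; // 1/sqrt lookup-table entries
    static const unsigned n_in = 64; // seq_len * features
    static const unsigned seq_len = 4;
    static const unsigned table_size = 1024;
    static constexpr float table_range = 0.5; // variance range covered by the table
};
// One frame at a time:
//   nnet::layernorm_1d<input_t, result_t, config_layernorm>(frame_in, frame_out, scale, bias);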
+
+//////////////////////////////
+// Ethan's original version //
+//////////////////////////////
+// template <class data_T, class res_T, typename CONFIG_T>
+// void layernorm_1d(
+//     data_T data[CONFIG_T::n_in/CONFIG_T::seq_len],
+//     res_T res[CONFIG_T::n_in/CONFIG_T::seq_len],
+//     typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len],
+//     typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len]
+//)
+//{
+//#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+//#pragma HLS ARRAY_PARTITION variable=data complete
+//#pragma HLS ARRAY_PARTITION variable=res complete
+//
+// int inv_range_inv = (int) 1/ CONFIG_T::table_range;
+// typename CONFIG_T::table_t deno_inver = 0;
+//#ifdef __HLS_SYN__
+// bool initialized = false;
+// typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size];
+//#else
+// static bool initialized = false;
+// static typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size];
+//#endif
+// if (!initialized) {
+//     init_invert_sqr_table<CONFIG_T, CONFIG_T::table_size>(invert_sqr_table);
+//     initialized = true;
+// }
+//
+// static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len;
+// data_T sum_cache = 0;
+// data_T sum_cache2 = 0;
+// data_T var, mean, diff;
+// data_T data_diff[dim];
+// data_T data_norm[dim];
+//
+// #pragma HLS ARRAY_PARTITION variable=data_diff complete
+// #pragma HLS ARRAY_PARTITION variable=data_diff complete
+//
+// const data_T k_inv = 1.0/dim;
+// for (int i = 0; i < dim; ++i){
+//     sum_cache += data[i];
+// }
+//// mean = CONFIG_T::template product<data_T, data_T>::product(sum_cache, k_inv);
+//// std::cout << "mean: " << std::endl;
+//// std::cout << mean << std::endl;
+//
+// for (int i = 0; i < dim; ++i){
+//     data_diff[i] = data[i] - mean;
+//     diff = data_diff[i]*data_diff[i];
+//     sum_cache2 += diff;
+//// std::cout << "data_diff: " << std::endl;
+//// std::cout << data_diff[i] << std::endl;
+//// std::cout << " " << std::endl;
+// }
+// var = CONFIG_T::template product<data_T, data_T>::product(sum_cache2, k_inv);
+//// std::cout << "var: " << std::endl;
+//// std::cout << var << std::endl;
+//// std::cout << " " << std::endl;
+//
+// int index = var*(CONFIG_T::table_size)*inv_range_inv;
+// if (CONFIG_T::table_range > 1) index = var*(CONFIG_T::table_size)/ (int)CONFIG_T::table_range;
+//
+// if (index < 0) index = 0;
+// if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1;
+// deno_inver = (typename CONFIG_T::table_t) invert_sqr_table[index];
+//// std::cout << "deno_inver: " << std::endl;
+//// std::cout << deno_inver << std::endl;
+//// std::cout << " " << std::endl;
+//
+//// std::cout << "index: " << std::endl;
+//// std::cout << index << std::endl;
+//// std::cout << " " << std::endl;
+//
+// for (int i = 0; i < dim; ++i){
+//     res[i] = data_diff[i] * deno_inver * scale[i] + bias[i];
+// }
+//
+//}
+
+// template <class data_T, class res_T, typename CONFIG_T>
+// void layernorm_1d(
+//     data_T data[CONFIG_T::n_in/CONFIG_T::seq_len],
+//     res_T res[CONFIG_T::n_in/CONFIG_T::seq_len],
+//     typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len],
+//     typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len]
+// )
+// {
+// #pragma HLS PIPELINE
+// #pragma HLS ARRAY_PARTITION variable=data complete
+// #pragma HLS ARRAY_PARTITION variable=res complete
+
+//     static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len;
+//     data_T sum_cache = 0;
+//     data_T sum_cache2 = 0;
+//     data_T var, mean, diff_squares, diff, var_eps_inv;
+//     data_T data_diff[dim];
+//     float sqrt_var_eps;
+
+//     #pragma HLS ARRAY_PARTITION variable=data_diff complete
+
+//     const data_T k_inv = 1.0/dim;
+//     for (int i = 0; i < dim; ++i){
+//         sum_cache += data[i];
+//     }
+//     mean = CONFIG_T::template product<data_T, data_T>::product(sum_cache, k_inv);
+//     // std::cout << "mean: " << std::endl;
+//     // std::cout << mean << std::endl;
+
+//     for (int i = 0; i < dim; ++i){
+//         diff = data[i] - mean;
+//         data_diff[i] = diff;
+//         diff_squares = diff*diff;
+//         sum_cache2 += diff_squares;
+//         // std::cout << "data_diff: " << std::endl;
+//         // std::cout << data_diff[i] << std::endl;
+//         // std::cout << " " << std::endl;
+//     }
+//     var = CONFIG_T::template product<data_T, data_T>::product(sum_cache2, k_inv);
+//     float var_f = (float)var;
+//     // std::cout << "var: ";
+//     // std::cout << var << std::endl;
+
+//     sqrt_var_eps = sqrt(var_f);
+//     var_eps_inv = (data_T) (1 / (sqrt_var_eps));
+//     // std::cout << "var_eps_inv: " << std::endl;
+//     // std::cout << var_eps_inv << std::endl;
+//     // std::cout << " " << std::endl;
+
+//     for (int i = 0; i < dim; ++i){
+//         res[i] = data_diff[i] * var_eps_inv * scale[i] + bias[i];
+//     }
+
+// }
+
+template <class data_T, class res_T, typename CONFIG_T>
+void layernormalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
+                    typename CONFIG_T::scale_t scale[CONFIG_T::n_in / CONFIG_T::seq_len],
+                    typename CONFIG_T::bias_t bias[CONFIG_T::n_in / CONFIG_T::seq_len]) {
+    static const unsigned dim = CONFIG_T::n_in / CONFIG_T::seq_len;
+    data_T in_val[dim];
+    data_T outval[dim];
+    // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
+    #pragma HLS function_instantiate variable=scale,bias
+
+    // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes
+    #pragma HLS ARRAY_PARTITION variable=scale complete
+    #pragma HLS ARRAY_PARTITION variable=bias complete
+    #pragma HLS ARRAY_PARTITION variable=in_val complete
+    #pragma HLS ARRAY_PARTITION variable=outval complete
+
+    // std::cout << "one seq norm layer: " << std::endl;
+    // std::cout << " " << std::endl;
+
+    for (int j = 0; j < CONFIG_T::seq_len; ++j) {
+        #pragma HLS PIPELINE
+    load:
+        for (int i = 0; i < dim; ++i) {
+            #pragma HLS UNROLL
+            in_val[i] = data[j * dim + i];
+        }
+        layernorm_1d<data_T, res_T, CONFIG_T>(in_val, outval, scale, bias);
+    store:
+        for (int i = 0; i < dim; ++i) {
+            #pragma HLS UNROLL
+            res[j * dim + i] = outval[i];
+        }
+    }
+
+    // std::cout << "out Norm: " << std::endl;
+    // nnet::print_result<res_T, CONFIG_T::n_in>(res, std::cout);
+    // std::cout << " " << std::endl;
+}
+
+} // namespace nnet
+
+#endif
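The layernormalize wrapper above walks the flattened [seq_len x dim] buffer one frame at a time, so a full layer call hands over the whole tensor plus the per-feature scale and bias arrays. Sketched with the hypothetical config_layernorm from earlier (input_t/result_t again stand in for generated types):

input_t data[config_layernorm::n_in];
result_t res[config_layernorm::n_in];
config_layernorm::scale_t scale[config_layernorm::n_in / config_layernorm::seq_len];
config_layernorm::bias_t bias[config_layernorm::n_in / config_layernorm::seq_len];
// ... fill data, scale, bias; frame j occupies flat indices [j*dim, (j+1)*dim) ...
nnet::layernormalize<input_t, result_t, config_layernorm>(data, res, scale, bias);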
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h
index 8005682978..e0c5cb4e27 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h
@@ -1,256 +1,256 @@
-#ifndef NNET_MERGE_H_
-#define NNET_MERGE_H_
-
-#include "hls_stream.h"
-#include "nnet_common.h"
-#include "nnet_mult.h"
-#include <math.h>
-
-namespace nnet {
-
-struct merge_config {
-    static const unsigned n_elem = 10;
-};
-
-struct dot_config {
-    static const unsigned n_in = 10;
-    static const unsigned n_out = 1;
-    static const unsigned reuse_factor = 1;
-    typedef float accum_t;
-    // Product function to use
-    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
-};
-
-struct concat_config {
-    static const unsigned n_elem1_0 = 10;
-    static const unsigned n_elem1_1 = 10;
-    static const unsigned n_elem1_2 = 10;
-    static const unsigned n_elem2_0 = 10;
-    static const unsigned n_elem2_1 = 10;
-    static const unsigned n_elem2_2 = 10;
-
-    static const unsigned axis = -1;
-};
-
-template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
-void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
-    #pragma HLS PIPELINE
-
-    for (int ii = 0; ii < CONFIG_T::n_elem; ii++) {
-        res[ii] = data1[ii] + data2[ii];
-    }
-}
-
-template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
-void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
-    #pragma HLS PIPELINE
-
-    for (int ii = 0; ii < CONFIG_T::n_elem; ii++) {
-        res[ii] = data1[ii] - data2[ii];
-    }
-}
-
-template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
-void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
-    #pragma HLS PIPELINE
-
-    for (int ii = 0; ii < 
CONFIG_T::n_elem; ii++) { - res[ii] = data1[ii] * data2[ii]; - } -} - -template -void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = (data1[ii] + data2[ii]) / (res_T)2; - } -} - -template -void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = (data1[ii] > data2[ii]) ? data1[ii] : data2[ii]; - } -} - -template -void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = (data1[ii] < data2[ii]) ? data1[ii] : data2[ii]; - } -} - -template -void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - - typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; - #pragma HLS ARRAY_PARTITION variable=mult complete - typename CONFIG_T::accum_t acc = 0; - -Product: - for (int i_mult = 0; i_mult < CONFIG_T::n_in; i_mult++) { - #pragma HLS UNROLL - mult[i_mult] = CONFIG_T::template product::product(data1[i_mult], data2[i_mult]); - } - -Accum: - for (int i_acc = 0; i_acc < CONFIG_T::n_in; i_acc++) { - #pragma HLS UNROLL - acc += mult[i_acc]; - } - - res[0] = cast(acc); -} - -template -void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], - res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { - res[ii] = data1[ii]; - } - for (int ii = 0; ii < CONFIG_T::n_elem2_0; ii++) { - res[CONFIG_T::n_elem1_0 + ii] = data2[ii]; - } -} - -template -void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; ii++) { - res[ii] = data1[ii]; - } - for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; ii++) { - res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + ii] = data2[ii]; - } -} - -template -void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { - for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { - res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + jj] = data1[ii * CONFIG_T::n_elem1_1 + jj]; - } - for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { - res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + jj] = - data2[ii * CONFIG_T::n_elem2_1 + jj]; - } - } -} - -template -void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - #pragma HLS INLINE - - if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { - concatenate2d_1(data1, data2, res); - } else { - 
concatenate2d_0(data1, data2, res); - } -} - -template -void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; ii++) { - res[ii] = data1[ii]; - } - for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; ii++) { - res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + ii] = data2[ii]; - } -} - -template -void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { - for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { - for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { - int res_idx = - ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; - int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; - res[res_idx] = data1[data_idx]; - } - } - for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { - for (int kk = 0; kk < CONFIG_T::n_elem2_2; kk++) { - int res_idx = ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + - (jj + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + kk; - int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; - res[res_idx] = data2[data_idx]; - } - } - } -} - -template -void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { - for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { - for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { - int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + - jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk; - int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; - res[res_idx] = data1[data_idx]; - } - for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { - int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + - jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk + CONFIG_T::n_elem1_2; - int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; - res[res_idx] = data2[data_idx]; - } - } - } -} - -template -void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - #pragma HLS INLINE - - if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { - concatenate3d_2(data1, data2, 
res); - } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { - concatenate3d_1(data1, data2, res); - } else { - concatenate3d_0(data1, data2, res); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_MERGE_H_ +#define NNET_MERGE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct merge_config { + static const unsigned n_elem = 10; +}; + +struct dot_config { + static const unsigned n_in = 10; + static const unsigned n_out = 1; + static const unsigned reuse_factor = 1; + typedef float accum_t; + // Product function to use + template using product = nnet::product::mult; +}; + +struct concat_config { + static const unsigned n_elem1_0 = 10; + static const unsigned n_elem1_1 = 10; + static const unsigned n_elem1_2 = 10; + static const unsigned n_elem2_0 = 10; + static const unsigned n_elem2_1 = 10; + static const unsigned n_elem2_2 = 10; + + static const unsigned axis = -1; +}; + +template +void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] + data2[ii]; + } +} + +template +void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] - data2[ii]; + } +} + +template +void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] * data2[ii]; + } +} + +template +void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] + data2[ii]) / (res_T)2; + } +} + +template +void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] > data2[ii]) ? data1[ii] : data2[ii]; + } +} + +template +void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] < data2[ii]) ? 
data1[ii] : data2[ii]; + } +} + +template +void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + + typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=mult complete + typename CONFIG_T::accum_t acc = 0; + +Product: + for (int i_mult = 0; i_mult < CONFIG_T::n_in; i_mult++) { + #pragma HLS UNROLL + mult[i_mult] = CONFIG_T::template product::product(data1[i_mult], data2[i_mult]); + } + +Accum: + for (int i_acc = 0; i_acc < CONFIG_T::n_in; i_acc++) { + #pragma HLS UNROLL + acc += mult[i_acc]; + } + + res[0] = cast(acc); +} + +template +void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], + res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0; ii++) { + res[CONFIG_T::n_elem1_0 + ii] = data2[ii]; + } +} + +template +void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + ii] = data2[ii]; + } +} + +template +void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + jj] = data1[ii * CONFIG_T::n_elem1_1 + jj]; + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + jj] = + data2[ii * CONFIG_T::n_elem2_1 + jj]; + } + } +} + +template +void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma HLS INLINE + + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + ii] = data2[ii]; + } +} + +template +void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * 
CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = + ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem2_2; kk++) { + int res_idx = ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + + (jj + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk + CONFIG_T::n_elem1_2; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS INLINE + + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h index a57ec78e35..17cf4fe99c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h @@ -1,370 +1,370 @@ -#ifndef NNET_MERGE_STREAM_H_ -#define NNET_MERGE_STREAM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include - -namespace nnet { - -template -void add(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -AddLoop: - for (int i = 0; 
i < CONFIG_T::n_elem / input1_T::size; i++) { - #pragma HLS PIPELINE - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - AddPack: - for (int j = 0; j < res_T::size; j++) { - #pragma HLS UNROLL - out_data[j] = in_data1[j] + in_data2[j]; - } - - res.write(out_data); - } -} - -template -void subtract(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -SubtractLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - #pragma HLS PIPELINE - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - SubtractPack: - for (int j = 0; j < res_T::size; j++) { - #pragma HLS UNROLL - out_data[j] = in_data1[j] - in_data2[j]; - } - - res.write(out_data); - } -} - -template -void multiply(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -MultiplyLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - MultiplyPack: - for (int j = 0; j < res_T::size; j++) { - #pragma HLS UNROLL - out_data[j] = in_data1[j] * in_data2[j]; - } - - res.write(out_data); - } -} - -template -void average(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -AverageLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - AveragePack: - for (int j = 0; j < res_T::size; j++) { - #pragma HLS UNROLL - out_data[j] = (in_data1[j] + in_data2[j]) / (typename res_T::value_type)2; - } - - res.write(out_data); - } -} - -template -void maximum(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -MaximumLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - MaximumPack: - for (int j = 0; j < res_T::size; j++) { - #pragma HLS UNROLL - out_data[j] = (in_data1[j] > in_data2[j]) ? in_data1[j] : in_data2[j]; - } - - res.write(out_data); - } -} - -template -void minimum(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -MinimumLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - MinimumPack: - for (int j = 0; j < res_T::size; j++) { - #pragma HLS UNROLL - out_data[j] = (in_data1[j] < in_data2[j]) ? 
in_data1[j] : in_data2[j]; - } - - res.write(out_data); - } -} - -template -void concatenate3d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight1: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - ConcatLoopWidth1: - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - #pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - res.write(out_data); - } - } -ConcatLoopHeight2: - for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { - ConcatLoopWidth2: - for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { - #pragma HLS PIPELINE II=1 - - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data2[k]; - } - - res.write(out_data); - } - } -} - -template -void concatenate3d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - ConcatLoopWidth1: - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - #pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - res.write(out_data); - } - ConcatLoopWidth2: - for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { - #pragma HLS PIPELINE II=1 - - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data2[k]; - } - - res.write(out_data); - } - } -} - -template -void concatenate3d_2(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - ConcatLoopWidth: - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - #pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - #pragma HLS UNROLL - out_data[input1_T::size + k] = in_data2[k]; - } - - res.write(out_data); - } - } -} - -template -void concatenate3d(hls::stream &data1, hls::stream &data2, hls::stream &res) { - if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { - concatenate3d_2(data1, data2, res); - } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { - concatenate3d_1(data1, data2, res); - } else { - concatenate3d_0(data1, data2, res); - } -} - -template -void concatenate2d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight1: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - #pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - res.write(out_data); - } -ConcatLoopHeight2: - for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { - #pragma HLS PIPELINE II=1 - - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data2[k]; - } 
- - res.write(out_data); - } -} - -template -void concatenate2d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - #pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - #pragma HLS UNROLL - out_data[input1_T::size + k] = in_data2[k]; - } - - res.write(out_data); - } -} - -template -void concatenate2d(hls::stream &data1, hls::stream &data2, hls::stream &res) { - if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { - concatenate2d_1(data1, data2, res); - } else { - concatenate2d_0(data1, data2, res); - } -} - -template -void concatenate1d(hls::stream &data1, hls::stream &data2, hls::stream &res) { - res_T out_data; - PRAGMA_DATA_PACK(out_data) -ConcatLoop1: - for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { - #pragma HLS PIPELINE - input1_T in_data1 = data1.read(); - ConcatPack1: - for (int j = 0; j < input1_T::size; j++) { - #pragma HLS UNROLL - out_data[j + (i * input1_T::size)] = in_data1[j]; - } - } -ConcatLoop2: - for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) { - #pragma HLS PIPELINE - input2_T in_data2 = data2.read(); - ConcatPack2: - for (int j = 0; j < input2_T::size; j++) { - #pragma HLS UNROLL - out_data[j + (i * input2_T::size) + (CONFIG_T::n_elem1_0)] = in_data2[j]; - } - } - res.write(out_data); -} -} // namespace nnet - -#endif +#ifndef NNET_MERGE_STREAM_H_ +#define NNET_MERGE_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include + +namespace nnet { + +template +void add(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AddLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + AddPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data1[j] + in_data2[j]; + } + + res.write(out_data); + } +} + +template +void subtract(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +SubtractLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SubtractPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data1[j] - in_data2[j]; + } + + res.write(out_data); + } +} + +template +void multiply(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MultiplyLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MultiplyPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data1[j] * in_data2[j]; + } + + res.write(out_data); + } +} + +template +void average(hls::stream &data1, hls::stream &data2, hls::stream &res) { + 
assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AverageLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + AveragePack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = (in_data1[j] + in_data2[j]) / (typename res_T::value_type)2; + } + + res.write(out_data); + } +} + +template +void maximum(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MaximumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MaximumPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = (in_data1[j] > in_data2[j]) ? in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void minimum(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MinimumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MinimumPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = (in_data1[j] < in_data2[j]) ? in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void concatenate3d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_2(hls::stream &data1, hls::stream &data2, hls::stream 
&res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +template +void concatenate2d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + #pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate1d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + res_T out_data; + PRAGMA_DATA_PACK(out_data) +ConcatLoop1: + for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { + #pragma HLS PIPELINE + input1_T in_data1 = data1.read(); + ConcatPack1: + for (int j = 0; j < input1_T::size; j++) { + #pragma HLS UNROLL + out_data[j + (i * input1_T::size)] = in_data1[j]; + } + } +ConcatLoop2: + for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) { + #pragma HLS PIPELINE + input2_T in_data2 = data2.read(); + ConcatPack2: + for (int j = 0; j < input2_T::size; j++) { + #pragma HLS UNROLL + out_data[j + (i * input2_T::size) + (CONFIG_T::n_elem1_0)] = in_data2[j]; + } + } + res.write(out_data); +} +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h index 00d1c6d12b..edf8f739b9 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h +++ 
b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h @@ -1,116 +1,116 @@ -#ifndef NNET_MULT_H_ -#define NNET_MULT_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_helpers.h" -#include -#include - -namespace nnet { - -namespace product { - -/* --- - * different methods to perform the product of input and weight, depending on the - * types of each. - * --- */ - -class Product {}; - -template class both_binary : public Product { - public: - static x_T product(x_T a, w_T w) { - // specialisation for 1-bit weights and incoming data - #pragma HLS INLINE - return a == w; - } -}; - -template class weight_binary : public Product { - public: - static auto product(x_T a, w_T w) -> decltype(-a) { - // Specialisation for 1-bit weights, arbitrary data - #pragma HLS INLINE - if (w == 0) - return -a; - else - return a; - } -}; - -template class data_binary : public Product { - public: - static auto product(x_T a, w_T w) -> decltype(-w) { - // Specialisation for 1-bit data, arbitrary weight - #pragma HLS INLINE - if (a == 0) - return -w; - else - return w; - } -}; - -template class weight_ternary : public Product { - public: - static auto product(x_T a, w_T w) -> decltype(-a) { - // Specialisation for 2-bit weights, arbitrary data - #pragma HLS INLINE - if (w == 0) - return 0; - else if (w == -1) - return -a; - else - return a; // if(w == 1) - } -}; - -template class mult : public Product { - public: - static auto product(x_T a, w_T w) -> decltype(a * w) { - // 'Normal' product - #pragma HLS INLINE - return a * w; - } -}; - -template class weight_exponential : public Product { - public: - using r_T = ap_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width)>; - static r_T product(x_T a, w_T w) { - // Shift product for exponential weights - #pragma HLS INLINE - - // Shift by the exponent. Negative weights shift right - r_T y = static_cast(a) << w.weight; - - // Negate or not depending on weight sign - return w.sign == 1 ? y : static_cast(-y); - } -}; - -} // namespace product - -template -inline typename std::enable_if>::value && - std::is_same>::value, - ap_int>::type -cast(typename CONFIG_T::accum_t x) { - return (ap_int)(x - CONFIG_T::n_in / 2) * 2; -} - -template -inline typename std::enable_if< - std::is_same>::value && !std::is_same>::value, res_T>::type -cast(typename CONFIG_T::accum_t x) { - return (res_T)x; -} - -template -inline typename std::enable_if<(!std::is_same>::value), res_T>::type cast(typename CONFIG_T::accum_t x) { - return (res_T)x; -} - -} // namespace nnet - -#endif +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include +#include + +namespace nnet { + +namespace product { + +/* --- + * different methods to perform the product of input and weight, depending on the + * types of each. 
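+ *
+ * Minimal usage sketch (illustrative only; the ap_fixed widths below are
+ * placeholder assumptions, not part of this header): a kernel selects one
+ * of these classes through its CONFIG_T, e.g.
+ *   typedef nnet::product::mult<ap_fixed<16, 6>, ap_fixed<16, 6>> prod_t;
+ *   acc += prod_t::product(a, w);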
+ * --- */ + +class Product {}; + +template class both_binary : public Product { + public: + static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + #pragma HLS INLINE + return a == w; + } +}; + +template class weight_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + #pragma HLS INLINE + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + #pragma HLS INLINE + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + #pragma HLS INLINE + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + #pragma HLS INLINE + return a * w; + } +}; + +template class weight_exponential : public Product { + public: + using r_T = ap_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width)>; + static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + #pragma HLS INLINE + + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + + // Negate or not depending on weight sign + return w.sign == 1 ? y : static_cast(-y); + } +}; + +} // namespace product + +template +inline typename std::enable_if>::value && + std::is_same>::value, + ap_int>::type +cast(typename CONFIG_T::accum_t x) { + return (ap_int)(x - CONFIG_T::n_in / 2) * 2; +} + +template +inline typename std::enable_if< + std::is_same>::value && !std::is_same>::value, res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index 8548e1125b..4c42c69b67 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -1,346 +1,337 @@ -#ifndef NNET_MHT_H_ -#define NNET_MHT_H_ - -#include "nnet_common.h" -#include "nnet_mult.h" -#include "nnet_dense.h" -#include "nnet_activation.h" -#include "hls_stream.h" -#include -#include - -namespace nnet { - -struct multiheadattention_config -{ - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; - - // Layer Sizes - static const unsigned num_heads = 10; - static const unsigned head_dim_key = 10; - static const unsigned head_dim_value = 10; - static const unsigned feature_dim = 20; - static const unsigned seq_len = 500; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned strategy = latency; - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - - template - using product = nnet::product::mult; -}; - -template -struct datapack { - data_T data[PackSize]; -}; - - -template -void read_stream_array( - hls::stream data_in[size], - data_T out[size] -) -{ - 
for (int k=0; k -void matrixmul_transpose( - hls::stream> &Q, - hls::stream> &K, - res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K -{ - const data_T dk = 1.0/sqrt(CONFIG_T::head_dim_key); - data_T QK_1; - typename CONFIG_T::accum_t QKij; - data_T Qi[CONFIG_T::head_dim_key]; - data_T Product[CONFIG_T::seq_len];// seq_Q, seq_K - data_T qk_smout[CONFIG_T::seq_len]; - data_T krow[CONFIG_T::seq_len * CONFIG_T::head_dim_key]; - #pragma HLS ARRAY_PARTITION variable=Qi complete - #pragma HLS ARRAY_PARTITION variable=Product complete - #pragma HLS ARRAY_PARTITION variable=qk_smout complete - #pragma HLS ARRAY_PARTITION variable=QK complete dim=2 - #pragma HLS ARRAY_PARTITION variable=krow complete - - datapack datak_pack, dataq_pack; - #pragma HLS DATA_PACK variable=Q - #pragma HLS DATA_PACK variable=K - #pragma HLS DATA_PACK variable=datak_pack - #pragma HLS DATA_PACK variable=dataq_pack - - int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_key) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); - - prep_k: for(int i = 0; i < CONFIG_T::seq_len; ++i) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - datak_pack = K.read(); - for(int j = 0; j < CONFIG_T::head_dim_key; ++j) { - #pragma HLS UNROLL - krow[i*CONFIG_T::head_dim_key + j] = datak_pack.data[j]; - } - } - - // for each row and column of AB - row: for(int i = 0; i < CONFIG_T::seq_len; ++i) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - dataq_pack = Q.read(); - - q: for(int q_i = 0; q_i < CONFIG_T::head_dim_key; ++q_i){ - #pragma HLS UNROLL - Qi[q_i]=dataq_pack.data[q_i]; - } - col: for(int j = 0; j < CONFIG_T::seq_len; ++j) { - // compute (QK)i,j - QKij = 0; - product: for(int k = 0; k < CONFIG_T::head_dim_key; ++k) { - QK_1 = CONFIG_T::template product::product(Qi[k],krow[j*CONFIG_T::head_dim_key + k]); - QKij += QK_1; - } - Product[j] = QKij * dk; - } - softmax(Product, qk_smout); - for(int n = 0; n < CONFIG_T::seq_len; ++n) { - #pragma HLS UNROLL - QK[i][n]=qk_smout[n]; - } - } -} - -///////// -template -void matrixmul( - data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], - hls::stream> &V, - hls::stream S[CONFIG_T::head_dim_value]) // S: attention score -{ - #pragma HLS DATA_PACK variable=V - #pragma HLS ARRAY_PARTITION variable=QK complete dim=2 - #pragma HLS ARRAY_PARTITION variable=S complete dim=1 - - datapack datav_pack; - #pragma HLS DATA_PACK variable=datav_pack - - int multiplier_limit = ceil(float(CONFIG_T::seq_len*CONFIG_T::head_dim_value) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); - - data_T dataV[CONFIG_T::seq_len*CONFIG_T::head_dim_value]; - # pragma HLS ARRAY_PARTITION variable=dataV complete dim=1 - - for (int j=0; j ::product(QKi[k], dataV[j*CONFIG_T::seq_len + k]); - Sij += S_1; - } - S[j].write(Sij); - } - } -} - - -template -void lin_projection( - hls::stream data_q[CONFIG_T::feature_dim], - hls::stream data_vk[CONFIG_T::feature_dim], - hls::stream> &k_proj, - hls::stream> &q_proj, - hls::stream> &v_proj, - typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], - typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_key], - typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], - typename CONFIG_T::bias_t query_bias[CONFIG_T::head_dim_key], - typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value], - typename CONFIG_T::bias_t value_bias[CONFIG_T::head_dim_value] - ) - -{ - #pragma HLS 
DATA_PACK variable=k_proj - #pragma HLS DATA_PACK variable=q_proj - #pragma HLS DATA_PACK variable=v_proj - - #pragma HLS ARRAY_PARTITION variable=data_q complete dim=1 - #pragma HLS ARRAY_PARTITION variable=data_vk complete dim=1 - - k_h: for (int j=0; j proj_k_pack; - datapack proj_q_pack; - datapack proj_v_pack; - #pragma HLS DATA_PACK variable=proj_k_pack - #pragma HLS DATA_PACK variable=proj_q_pack - #pragma HLS DATA_PACK variable=proj_v_pack - - read_stream_array(data_q, in_q); - read_stream_array(data_vk, in_v); - - dense(in_v, proj_k_pack.data, key_weight, key_bias); - dense(in_q, proj_q_pack.data, query_weight, query_bias); - dense(in_v, proj_v_pack.data, value_weight, value_bias); - - k_proj.write(proj_k_pack); - q_proj.write(proj_q_pack); - v_proj.write(proj_v_pack); - } -} - - -template -void dense_out( - hls::stream data_in[CONFIG_T::num_heads][CONFIG_T::head_dim_value], - res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim], - typename CONFIG_T::weight_t attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value * CONFIG_T::feature_dim], - typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim]) -{ - data_T mat_res_con[CONFIG_T::num_heads*CONFIG_T::head_dim_value]; - res_T dense_out[CONFIG_T::feature_dim]; - #pragma HLS ARRAY_PARTITION variable=mat_res_con complete dim=1 - #pragma HLS ARRAY_PARTITION variable=dense_out complete dim=1 - output_dense: for (int k=0; k (mat_res_con, dense_out, attention_output_weight, attention_output_bias); - for (int i=0;i -void data_prep( - data_T data[CONFIG_T::seq_len * CONFIG_T::feature_dim], - hls::stream d[CONFIG_T::feature_dim]) -{ - #pragma HLS ARRAY_PARTITION variable=d complete dim=1 - for (int j=0; j -void multiheadattention( - data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], - data_T data_vk[CONFIG_T::seq_len * CONFIG_T::feature_dim], - res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim], - typename CONFIG_T::weight_t attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value * CONFIG_T::feature_dim], // num_heads,head_size_v,dim - typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim], - typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], // n_head,dim,head_dim - typename CONFIG_T::bias_t key_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key], - typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], //same shape as key - typename CONFIG_T::bias_t query_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key], - typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_value], - typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) -{ - hls::stream d_value[CONFIG_T::num_heads][CONFIG_T::feature_dim]; - hls::stream d_query[CONFIG_T::num_heads][CONFIG_T::feature_dim]; - hls::stream> q_proj[CONFIG_T::num_heads]; - hls::stream> k_proj[CONFIG_T::num_heads]; - hls::stream> v_proj[CONFIG_T::num_heads]; - data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; - hls::stream matr_out[CONFIG_T::num_heads][CONFIG_T::head_dim_value]; - - #pragma HLS DATAFLOW - #pragma HLS ARRAY_PARTITION variable=d_query complete dim=1 - #pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 - #pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 - #pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 - #pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 - #pragma HLS 
ARRAY_PARTITION variable=matr_out complete dim=1 - // std::cout << "input to MHA: " << std::endl; - // nnet::print_result(data_q, std::cout); - // std::cout << " " << std::endl; - - prepq: for (int i=0;i(data_q, d_query[i]); - } - prepvk: for (int i=0;i(data_vk, d_value[i]); - } - - // linear projection - lin_proj: for (int i=0;i( - d_query[i], d_value[i], - k_proj[i], q_proj[i], v_proj[i], - key_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), key_bias+(CONFIG_T::head_dim_key*i), - query_weight+(CONFIG_T::head_dim_key*CONFIG_T::feature_dim*i), query_bias+(CONFIG_T::head_dim_key*i), - value_weight+(CONFIG_T::head_dim_value*CONFIG_T::feature_dim*i), value_bias+(CONFIG_T::head_dim_value*i)); - } - - maxtrixmul1: for (int i=0; i < CONFIG_T::num_heads; ++i){ - #pragma HLS UNROLL - nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); - } - - maxtrixmul2: for (int i=0; i < CONFIG_T::num_heads; ++i){ - #pragma HLS UNROLL - nnet::matrixmul(qk_mul[i], v_proj[i], matr_out[i]);//stream - } - - nnet::dense_out(matr_out, res, attention_output_weight, attention_output_bias); - // std::cout << "out MHA: " << std::endl; - // nnet::print_result(res, std::cout); - // std::cout << " " << std::endl; - -} -} - -#endif +#ifndef NNET_MHT_H_ +#define NNET_MHT_H_ + +#include "hls_stream.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_mult.h" +#include +#include + +namespace nnet { + +struct multiheadattention_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned num_heads = 10; + static const unsigned head_dim_key = 10; + static const unsigned head_dim_value = 10; + static const unsigned feature_dim = 20; + static const unsigned seq_len = 500; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned strategy = latency; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + + template using product = nnet::product::mult; +}; + +template struct datapack { data_T data[PackSize]; }; + +template void read_stream_array(hls::stream data_in[size], data_T out[size]) { + for (int k = 0; k < size; ++k) { + #pragma HLS UNROLL + out[k] = data_in[k].read(); + } +} + +template +void matrixmul_transpose(hls::stream> &Q, + hls::stream> &K, + res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K +{ + const data_T dk = 1.0 / sqrt(CONFIG_T::head_dim_key); + data_T QK_1; + typename CONFIG_T::accum_t QKij; + data_T Qi[CONFIG_T::head_dim_key]; + data_T Product[CONFIG_T::seq_len]; // seq_Q, seq_K + data_T qk_smout[CONFIG_T::seq_len]; + data_T krow[CONFIG_T::seq_len * CONFIG_T::head_dim_key]; + #pragma HLS ARRAY_PARTITION variable=Qi complete + #pragma HLS ARRAY_PARTITION variable=Product complete + #pragma HLS ARRAY_PARTITION variable=qk_smout complete + #pragma HLS ARRAY_PARTITION variable=QK complete dim=2 + #pragma HLS ARRAY_PARTITION variable=krow complete + + datapack datak_pack, dataq_pack; + #pragma HLS DATA_PACK variable=Q + #pragma HLS DATA_PACK variable=K + #pragma HLS DATA_PACK variable=datak_pack + #pragma HLS DATA_PACK variable=dataq_pack + + int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_key) / float(CONFIG_T::reuse_factor)); + CONFIG_T::template product::limit(multiplier_limit); + +prep_k: + for (int i = 0; i < CONFIG_T::seq_len; ++i) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + datak_pack = K.read(); + for (int j = 0; j < 
CONFIG_T::head_dim_key; ++j) {
+            #pragma HLS UNROLL
+            krow[i * CONFIG_T::head_dim_key + j] = datak_pack.data[j];
+        }
+    }
+
+// for each row and column of AB
+row:
+    for (int i = 0; i < CONFIG_T::seq_len; ++i) {
+        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+        dataq_pack = Q.read();
+
+    q:
+        for (int q_i = 0; q_i < CONFIG_T::head_dim_key; ++q_i) {
+            #pragma HLS UNROLL
+            Qi[q_i] = dataq_pack.data[q_i];
+        }
+    col:
+        for (int j = 0; j < CONFIG_T::seq_len; ++j) {
+            // compute (QK)i,j
+            QKij = 0;
+        product:
+            for (int k = 0; k < CONFIG_T::head_dim_key; ++k) {
+                QK_1 = CONFIG_T::template product<data_T, data_T>::product(Qi[k], krow[j * CONFIG_T::head_dim_key + k]);
+                QKij += QK_1;
+            }
+            Product[j] = QKij * dk;
+        }
+        softmax<data_T, data_T, typename CONFIG_T::softmax_config1>(Product, qk_smout);
+        for (int n = 0; n < CONFIG_T::seq_len; ++n) {
+            #pragma HLS UNROLL
+            QK[i][n] = qk_smout[n];
+        }
+    }
+}
+
+/////////
+template <class data_T, class res_T, typename CONFIG_T>
+void matrixmul(data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], hls::stream<datapack<data_T, CONFIG_T::head_dim_value>> &V,
+               hls::stream<res_T> S[CONFIG_T::head_dim_value]) // S: attention score
+{
+    #pragma HLS DATA_PACK variable=V
+    #pragma HLS ARRAY_PARTITION variable=QK complete dim=2
+    #pragma HLS ARRAY_PARTITION variable=S complete dim=1
+
+    datapack<data_T, CONFIG_T::head_dim_value> datav_pack;
+    #pragma HLS DATA_PACK variable=datav_pack
+
+    int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_value) / float(CONFIG_T::reuse_factor));
+    CONFIG_T::template product<data_T, data_T>::limit(multiplier_limit);
+
+    data_T dataV[CONFIG_T::seq_len * CONFIG_T::head_dim_value];
+    #pragma HLS ARRAY_PARTITION variable=dataV complete dim=1
+
+    for (int j = 0; j < CONFIG_T::seq_len; ++j) {
+        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+        datav_pack = V.read();
+        for (int i = 0; i < CONFIG_T::head_dim_value; ++i) {
+            #pragma HLS UNROLL
+            dataV[CONFIG_T::seq_len * i + j] = datav_pack.data[i];
+        }
+    }
+
+    // for each row and column of AB
+    data_T Sij, S_1;
+    data_T QKi[CONFIG_T::seq_len];
+    #pragma HLS ARRAY_PARTITION variable=QKi complete
+row:
+    for (int i = 0; i < CONFIG_T::seq_len; ++i) {
+        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+    qk:
+        for (int q_i = 0; q_i < CONFIG_T::seq_len; ++q_i) {
+            #pragma HLS UNROLL
+            QKi[q_i] = QK[i][q_i];
+        }
+    col:
+        for (int j = 0; j < CONFIG_T::head_dim_value; ++j) {
+            // compute (S)i,j
+            Sij = 0;
+        product:
+            for (int k = 0; k < CONFIG_T::seq_len; ++k) {
+                S_1 = CONFIG_T::template product<data_T, data_T>::product(QKi[k], dataV[j * CONFIG_T::seq_len + k]);
+                Sij += S_1;
+            }
+            S[j].write(Sij);
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void lin_projection(hls::stream<data_T> data_q[CONFIG_T::feature_dim], hls::stream<data_T> data_vk[CONFIG_T::feature_dim],
+                    hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> &k_proj,
+                    hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> &q_proj,
+                    hls::stream<datapack<data_T, CONFIG_T::head_dim_value>> &v_proj,
+                    typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key],
+                    typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_key],
+                    typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key],
+                    typename CONFIG_T::bias_t query_bias[CONFIG_T::head_dim_key],
+                    typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value],
+                    typename CONFIG_T::bias_t value_bias[CONFIG_T::head_dim_value])
+
+{
+    #pragma HLS DATA_PACK variable=k_proj
+    #pragma HLS DATA_PACK variable=q_proj
+    #pragma HLS DATA_PACK variable=v_proj
+
+    #pragma HLS ARRAY_PARTITION variable=data_q complete dim=1
+    #pragma HLS ARRAY_PARTITION variable=data_vk complete dim=1
+
+k_h:
+    for (int j = 0; j < CONFIG_T::seq_len; ++j) {
+        #pragma HLS PIPELINE
+
+        data_T proj_k[CONFIG_T::head_dim_key];
+        data_T proj_q[CONFIG_T::head_dim_key];
+        data_T proj_v[CONFIG_T::head_dim_value];
+
data_T in_q[CONFIG_T::feature_dim]; + data_T in_v[CONFIG_T::feature_dim]; + #pragma HLS ARRAY_PARTITION variable=proj_k complete dim=1 + #pragma HLS ARRAY_PARTITION variable=proj_q complete dim=1 + #pragma HLS ARRAY_PARTITION variable=proj_v complete dim=1 + #pragma HLS ARRAY_PARTITION variable=in_q complete dim=1 + #pragma HLS ARRAY_PARTITION variable=in_v complete dim=1 + + datapack proj_k_pack; + datapack proj_q_pack; + datapack proj_v_pack; + #pragma HLS DATA_PACK variable=proj_k_pack + #pragma HLS DATA_PACK variable=proj_q_pack + #pragma HLS DATA_PACK variable=proj_v_pack + + read_stream_array(data_q, in_q); + read_stream_array(data_vk, in_v); + + dense(in_v, proj_k_pack.data, key_weight, key_bias); + dense(in_q, proj_q_pack.data, query_weight, query_bias); + dense(in_v, proj_v_pack.data, value_weight, value_bias); + + k_proj.write(proj_k_pack); + q_proj.write(proj_q_pack); + v_proj.write(proj_v_pack); + } +} + +template +void dense_out(hls::stream data_in[CONFIG_T::num_heads][CONFIG_T::head_dim_value], + res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim], + typename CONFIG_T::weight_t + attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value * CONFIG_T::feature_dim], + typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim]) { + data_T mat_res_con[CONFIG_T::num_heads * CONFIG_T::head_dim_value]; + res_T dense_out[CONFIG_T::feature_dim]; +#pragma HLS ARRAY_PARTITION variable=mat_res_con complete dim=1 +#pragma HLS ARRAY_PARTITION variable=dense_out complete dim=1 +output_dense: + for (int k = 0; k < CONFIG_T::seq_len; ++k) { + + #pragma HLS PIPELINE + for (int i = 0; i < CONFIG_T::num_heads; ++i) { + #pragma HLS UNROLL + for (int j = 0; j < CONFIG_T::head_dim_value; ++j) { + #pragma HLS UNROLL + mat_res_con[CONFIG_T::head_dim_value * i + j] = data_in[i][j].read(); + } + } + dense(mat_res_con, dense_out, attention_output_weight, + attention_output_bias); + for (int i = 0; i < CONFIG_T::feature_dim; ++i) { + #pragma HLS UNROLL + res[CONFIG_T::feature_dim * k + i] = dense_out[i]; + } + } +} + +template +void data_prep(data_T data[CONFIG_T::seq_len * CONFIG_T::feature_dim], hls::stream d[CONFIG_T::feature_dim]) { + #pragma HLS ARRAY_PARTITION variable=d complete dim=1 + for (int j = 0; j < CONFIG_T::seq_len; ++j) { + for (int k = 0; k < CONFIG_T::feature_dim; ++k) { + #pragma HLS UNROLL + d[k].write(data[j * CONFIG_T::feature_dim + k]); + } + } +} + +template +void multiheadattention( + data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], data_T data_vk[CONFIG_T::seq_len * CONFIG_T::feature_dim], + res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim], + typename CONFIG_T::weight_t attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value * + CONFIG_T::feature_dim], // num_heads,head_size_v,dim + typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim], + typename CONFIG_T::weight_t + key_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], // n_head,dim,head_dim + typename CONFIG_T::bias_t key_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key], + typename CONFIG_T::weight_t + query_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], // same shape as key + typename CONFIG_T::bias_t query_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key], + typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_value], + typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { + hls::stream 
d_value[CONFIG_T::num_heads][CONFIG_T::feature_dim];
+    hls::stream<data_T> d_query[CONFIG_T::num_heads][CONFIG_T::feature_dim];
+    hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> q_proj[CONFIG_T::num_heads];
+    hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> k_proj[CONFIG_T::num_heads];
+    hls::stream<datapack<data_T, CONFIG_T::head_dim_value>> v_proj[CONFIG_T::num_heads];
+    data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len];
+    hls::stream<data_T> matr_out[CONFIG_T::num_heads][CONFIG_T::head_dim_value];
+
+    #pragma HLS DATAFLOW
+    #pragma HLS ARRAY_PARTITION variable=d_query complete dim=1
+    #pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1
+    #pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1
+    #pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1
+    #pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1
+    #pragma HLS ARRAY_PARTITION variable=matr_out complete dim=1
+    // std::cout << "input to MHA: " << std::endl;
+    // nnet::print_result<data_T, CONFIG_T::seq_len * CONFIG_T::feature_dim>(data_q, std::cout);
+    // std::cout << " " << std::endl;
+
+prepq:
+    for (int i = 0; i < CONFIG_T::num_heads; ++i) {
+        #pragma HLS UNROLL
+        nnet::data_prep<data_T, CONFIG_T>(data_q, d_query[i]);
+    }
+prepvk:
+    for (int i = 0; i < CONFIG_T::num_heads; ++i) {
+        #pragma HLS UNROLL
+        nnet::data_prep<data_T, CONFIG_T>(data_vk, d_value[i]);
+    }
+
+// linear projection
+lin_proj:
+    for (int i = 0; i < CONFIG_T::num_heads; ++i) {
+        #pragma HLS UNROLL
+        nnet::lin_projection<data_T, res_T, CONFIG_T>(
+            d_query[i], d_value[i], k_proj[i], q_proj[i], v_proj[i],
+            key_weight + (CONFIG_T::head_dim_key * CONFIG_T::feature_dim * i), key_bias + (CONFIG_T::head_dim_key * i),
+            query_weight + (CONFIG_T::head_dim_key * CONFIG_T::feature_dim * i), query_bias + (CONFIG_T::head_dim_key * i),
+            value_weight + (CONFIG_T::head_dim_value * CONFIG_T::feature_dim * i),
+            value_bias + (CONFIG_T::head_dim_value * i));
+    }
+
+matrixmul1:
+    for (int i = 0; i < CONFIG_T::num_heads; ++i) {
+        #pragma HLS UNROLL
+        nnet::matrixmul_transpose<data_T, data_T, CONFIG_T>(q_proj[i], k_proj[i], qk_mul[i]);
+    }
+
+matrixmul2:
+    for (int i = 0; i < CONFIG_T::num_heads; ++i) {
+        #pragma HLS UNROLL
+        nnet::matrixmul<data_T, data_T, CONFIG_T>(qk_mul[i], v_proj[i], matr_out[i]); // stream
+    }
+
+    nnet::dense_out<data_T, res_T, CONFIG_T>(matr_out, res, attention_output_weight, attention_output_bias);
+    // std::cout << "out MHA: " << std::endl;
+    // nnet::print_result<res_T, CONFIG_T::seq_len * CONFIG_T::feature_dim>(res, std::cout);
+    // std::cout << " " << std::endl;
+}
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_padding.h b/hls4ml/templates/vivado/nnet_utils/nnet_padding.h
index e48a2fb47e..2df5a00705 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_padding.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_padding.h
@@ -1,145 +1,145 @@
-#ifndef NNET_PADDING_H_
-#define NNET_PADDING_H_
-
-#include <math.h>
-
-namespace nnet {
-
-struct padding1d_config {
-    static const unsigned n_chan = 10;
-    static const unsigned in_width = 10;
-    static const unsigned out_width = 10;
-    static const unsigned pad_left = 0;
-    static const unsigned pad_right = 0;
-};
-
-template <class data_T, class res_T, typename CONFIG_T>
-void zeropad1d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], data_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) {
-    #pragma HLS PIPELINE
-
-    for (int j = 0; j < CONFIG_T::n_chan; j++) {
-        for (int i = 0; i < CONFIG_T::pad_left; i++) {
-            *(res++) = 0;
-        }
-
-        for (int i = 0; i < CONFIG_T::in_width; i++) {
-            *(res++) = (res_T) * (data++);
-        }
-
-        for (int i = 0; i < CONFIG_T::pad_right; i++) {
-            *(res++) = 0;
-        }
-    }
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) {
-    #pragma HLS PIPELINE
-
-    for (int i = 0; i < CONFIG_T::pad_left; i++) {
-        for (int j = 0; j < CONFIG_T::n_chan; j++) {
-
*(res++) = 0; - } - } - - for (int i = 0; i < CONFIG_T::in_width; i++) { - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = (res_T) * (data++); - } - } - - for (int i = 0; i < CONFIG_T::pad_right; i++) { - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = 0; - } - } -} - -struct padding2d_config { - static const unsigned n_chan = 10; - static const unsigned in_height = 10; - static const unsigned in_width = 10; - static const unsigned out_height = 10; - static const unsigned out_width = 10; - static const unsigned pad_top = 0; - static const unsigned pad_bottom = 0; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; -}; - -template -void zeropad2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], - data_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { - #pragma HLS PIPELINE - - for (int k = 0; k < CONFIG_T::n_chan; k++) { - - for (int i = 0; i < CONFIG_T::pad_top; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - *(res++) = 0; - } - } - - for (int i = 0; i < CONFIG_T::in_height; i++) { - for (int j = 0; j < CONFIG_T::pad_left; j++) { - *(res++) = 0; - } - for (int j = 0; j < CONFIG_T::in_width; j++) { - *(res++) = (res_T) * (data++); - } - for (int j = 0; j < CONFIG_T::pad_right; j++) { - *(res++) = 0; - } - } - - for (int i = 0; i < CONFIG_T::pad_bottom; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - *(res++) = 0; - } - } - } -} - -template -void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], - res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { - #pragma HLS PIPELINE - - for (int i = 0; i < CONFIG_T::pad_top; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } - - for (int i = 0; i < CONFIG_T::in_height; i++) { - for (int j = 0; j < CONFIG_T::pad_left; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - for (int j = 0; j < CONFIG_T::in_width; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = (res_T) * (data++); - } - } - for (int j = 0; j < CONFIG_T::pad_right; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } - - for (int i = 0; i < CONFIG_T::pad_bottom; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } -} - -} // namespace nnet - -#endif +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +#include + +namespace nnet { + +struct padding1d_config { + static const unsigned n_chan = 10; + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad1d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], data_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int j = 0; j < CONFIG_T::n_chan; j++) { + for (int i = 0; i < CONFIG_T::pad_left; i++) { + *(res++) = 0; + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + *(res++) = (res_T) * (data++); + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + *(res++) = 0; + } + } +} + +template +void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::pad_left; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < 
CONFIG_T::in_width; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = (res_T) * (data++); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned n_chan = 10; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + data_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int k = 0; k < CONFIG_T::n_chan; k++) { + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + *(res++) = 0; + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + *(res++) = (res_T) * (data++); + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + } +} + +template +void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = (res_T) * (data++); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h index bb9f0b3f05..c6bec85d40 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h @@ -1,313 +1,313 @@ -#ifndef NNET_POOLING_H_ -#define NNET_POOLING_H_ - -#include "nnet_helpers.h" -#include - -namespace nnet { - -// Return the maximum value from an array -template accum_t max(T x[N]) { - T y = x[0]; - for (int i = 1; i < N; i++) { - y = x[i] > y ? 
x[i] : y; - } - return y; -} - -// Return the mean value of an array -template accum_t avg(T (&x)[N], unsigned length) { - accum_t y = 0; - for (int i = 0; i < N; i++) { - y += x[i]; - } - y /= length; - return y; -} - -// Enumeration for pooling operation (max, avg, l2norm pooling) -enum Pool_Op { Max, Average }; // L2Norm }; -template accum_t pool_op(T (&x)[N], unsigned length) { - switch (op) { - case Max: - return max(x); - case Average: - return avg(x, length); - // case L2Norm: return l2norm(x); - } -} - -template accum_t pool_op(T (&x)[N]) { - return pool_op(x, N); -} - -template T pad_val() { - /*--- - *- In Tensorflow, pooling ignores the value in the padded cells - *- For Avg pooling, return 0 (the divisior is modified to the - *- area overlapping the unpadded image. - *- For max pooling, return the most negative value for the type. - *- TODO this is not really generic, it assumes fixed point or integer T - ---*/ - switch (op) { - case Max: { - T x = 0; - x[x.width - 1] = 1; - return x; - break; - } - case Average: - return 0; - } -} - -struct pooling1d_config { - // IO size - static const unsigned n_in = 10; - static const unsigned pool_width = 2; - static const unsigned stride_width = 2; - static const unsigned n_out = (n_in - pool_width) / stride_width + 1; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const bool count_pad = false; - // Pooling function - static const Pool_Op pool_op = Max; -}; - -template constexpr int pool_op_limit_1d() { - return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor; -} - -template -void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - // TODO partition the arrays according to the reuse factor - const int limit = pool_op_limit_1d(); - #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - // Add any necessary padding - - // Add padding and reduce input width to area covered by pooling function - static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; - static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; - - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - // Loop over input image x in steps of stride - for (int ii = 0; ii < restricted_padded_width; ii += CONFIG_T::stride_width) { - unsigned overlap_pixel = 0; - data_T pool[CONFIG_T::pool_width]; - #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 - - for (int jj = 0; jj < CONFIG_T::pool_width; jj++) { - if (ii + jj >= CONFIG_T::pad_left && ii + jj < CONFIG_T::n_in + CONFIG_T::pad_left) { - pool[jj] = data[(ii + jj - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; - overlap_pixel++; - } else - pool[jj] = pad_val(); - } - - int patch_size = CONFIG_T::count_pad ? 
CONFIG_T::stride_width : overlap_pixel; - - res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = - pool_op(pool, patch_size); - } - } -} - -template -void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); - assert(CONFIG_T::pool_width == CONFIG_T::stride_width); - - // TODO partition the arrays according to the reuse factor - const int limit = pool_op_limit_1d(); - #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - data_T pool[CONFIG_T::n_in]; - #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 - for (int jj = 0; jj < CONFIG_T::n_in; jj++) { - pool[jj] = data[jj * CONFIG_T::n_filt + ff]; - } - // do the pooling - res[ff] = pool_op(pool); - } -} - -struct pooling2d_config { - // IO size - static const unsigned in_height = 10; - static const unsigned in_width = 10; - static const unsigned n_filt = 4; - static const unsigned stride_height = 2; - static const unsigned stride_width = 2; - static const unsigned pool_height = 2; - static const unsigned pool_width = 2; - static const unsigned out_height = (in_height - pool_height) / stride_height + 1; - static const unsigned out_width = (in_width - pool_width) / stride_width + 1; - // Padding - static const unsigned pad_top = 0; - static const unsigned pad_bottom = 0; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const bool count_pad = false; - // Pooling function - static const Pool_Op pool_op = Max; - // Reuse factor - static const unsigned reuse_factor = 1; - - // Internal data type definitions - typedef float accum_t; -}; - -template constexpr int pool_op_limit() { - return (CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt / CONFIG_T::reuse_factor; -} - -template -void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - // TODO partition the arrays according to the reuse factor - const int limit = pool_op_limit(); - #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - - // Add padding and reduce input width to area covered by pooling function - static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; - static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; - static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; - static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; - - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - - // Loop over input image y in steps of stride - for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { - // Loop over input image x in steps of stride - for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { - data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; - #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 - - unsigned overlap_pixel = 0; - - // Loop over pool window y - for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { - // Loop over pool window x - for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { - bool cond1 = ii + kk >= CONFIG_T::pad_top 
&& ii + kk < CONFIG_T::in_height + CONFIG_T::pad_top; - bool cond2 = jj + ll >= CONFIG_T::pad_left && jj + ll < CONFIG_T::in_width + CONFIG_T::pad_left; - if (cond1 && cond2) { - unsigned data_idx = - ((ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + (jj + ll - CONFIG_T::pad_left)) * - CONFIG_T::n_filt + - ff; - pool[kk * CONFIG_T::stride_width + ll] = data[data_idx]; - overlap_pixel++; - } else - pool[kk * CONFIG_T::stride_width + ll] = pad_val(); - } - } - - int patch_size = CONFIG_T::count_pad ? CONFIG_T::stride_width * CONFIG_T::stride_height : overlap_pixel; - - res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + - (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = - pool_op(pool, patch_size); - } - } - } -} - -template -void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - // TODO partition the arrays according to the reuse factor - const int limit = pool_op_limit(); - #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - // Add padding and reduce input width to area covered by pooling function - static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; - static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; - static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; - static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; - - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - // Loop over input image y in steps of stride - for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { - // Loop over input image x in steps of stride - for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { - data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; - #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 - // Keep track of number of pixels in image vs padding region - unsigned img_overlap = 0; - // Loop over pool window y - for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { - // Loop over pool window x - for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { - if (ii + kk < CONFIG_T::pad_top || ii + kk >= (full_padded_height - CONFIG_T::pad_bottom) || - jj + ll < CONFIG_T::pad_left || jj + ll >= (full_padded_width - CONFIG_T::pad_right)) { - // Add padding - pool[kk * CONFIG_T::stride_width + ll] = pad_val(); - if (CONFIG_T::count_pad) - img_overlap++; - } else { - pool[kk * CONFIG_T::stride_width + ll] = - data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + - ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; - img_overlap++; - } - } - } - // do the pooling - // TODO in the case of average pooling, need to reduce height * width to area of pool window - // not overlapping padding region - res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + - ff * CONFIG_T::out_height * CONFIG_T::out_width] = - pool_op(pool); - // If the pool op is Average, the zero-padding needs to be removed from the results - if (CONFIG_T::pool_op == Average) { - data_T rescale = - static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; - res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + - 
ff * CONFIG_T::out_height * CONFIG_T::out_width] *= rescale;
-                }
-            }
-        }
-    }
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt],
-                         res_T res[CONFIG_T::n_filt]) {
-    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
-    assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0);
-    assert(CONFIG_T::pool_width == CONFIG_T::stride_width);
-    assert(CONFIG_T::pool_height == CONFIG_T::stride_height);
-
-    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-
-    const int limit = pool_op_limit<CONFIG_T>();
-    #pragma HLS ALLOCATION instances=pool_op limit=limit function
-
-FiltLoop:
-    for (int filt = 0; filt < CONFIG_T::n_filt; filt++) {
-        data_T pool[CONFIG_T::in_height * CONFIG_T::in_width];
-
-    InputLoop:
-        for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) {
-            pool[i] = data[i * CONFIG_T::n_filt + filt];
-        }
-
-        res[filt] = static_cast<res_T>(
-            pool_op<data_T, CONFIG_T::in_height * CONFIG_T::in_width, CONFIG_T::pool_op, typename CONFIG_T::accum_t>(pool));
-    }
-}
-
-} // namespace nnet
-
-#endif
+#ifndef NNET_POOLING_H_
+#define NNET_POOLING_H_
+
+#include "nnet_helpers.h"
+#include <limits>
+
+namespace nnet {
+
+// Return the maximum value from an array
+template <typename T, int N, typename accum_t> accum_t max(T x[N]) {
+    T y = x[0];
+    for (int i = 1; i < N; i++) {
+        y = x[i] > y ? x[i] : y;
+    }
+    return y;
+}
+
+// Return the mean value of an array
+template <typename T, int N, typename accum_t> accum_t avg(T (&x)[N], unsigned length) {
+    accum_t y = 0;
+    for (int i = 0; i < N; i++) {
+        y += x[i];
+    }
+    y /= length;
+    return y;
+}
+
+// Enumeration for pooling operation (max, avg, l2norm pooling)
+enum Pool_Op { Max, Average }; // L2Norm };
+template <typename T, int N, Pool_Op op, typename accum_t> accum_t pool_op(T (&x)[N], unsigned length) {
+    switch (op) {
+    case Max:
+        return max<T, N, accum_t>(x);
+    case Average:
+        return avg<T, N, accum_t>(x, length);
+        // case L2Norm: return l2norm<T, N>(x);
+    }
+}
+
+template <typename T, int N, Pool_Op op, typename accum_t> accum_t pool_op(T (&x)[N]) {
+    return pool_op<T, N, op, accum_t>(x, N);
+}
+
+template <typename T, Pool_Op op> T pad_val() {
+    /*---
+    *- In Tensorflow, pooling ignores the value in the padded cells
+    *- For Avg pooling, return 0 (the divisor is modified to the
+    *- area overlapping the unpadded image).
+    *- For max pooling, return the most negative value for the type.
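+    *- (As a sketch of why the max branch below works: for a signed
+    *- fixed-point or integer T, clearing every bit and then setting the
+    *- sign bit via x[x.width - 1] = 1 yields the most negative
+    *- representable value, the identity element for max.)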
+ *- TODO this is not really generic, it assumes fixed point or integer T + ---*/ + switch (op) { + case Max: { + T x = 0; + x[x.width - 1] = 1; + return x; + break; + } + case Average: + return 0; + } +} + +struct pooling1d_config { + // IO size + static const unsigned n_in = 10; + static const unsigned pool_width = 2; + static const unsigned stride_width = 2; + static const unsigned n_out = (n_in - pool_width) / stride_width + 1; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template constexpr int pool_op_limit_1d() { + return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any necessary padding + + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image x in steps of stride + for (int ii = 0; ii < restricted_padded_width; ii += CONFIG_T::stride_width) { + unsigned overlap_pixel = 0; + data_T pool[CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + + for (int jj = 0; jj < CONFIG_T::pool_width; jj++) { + if (ii + jj >= CONFIG_T::pad_left && ii + jj < CONFIG_T::n_in + CONFIG_T::pad_left) { + pool[jj] = data[(ii + jj - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; + overlap_pixel++; + } else + pool[jj] = pad_val(); + } + + int patch_size = CONFIG_T::count_pad ? 
CONFIG_T::stride_width : overlap_pixel; + + res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = + pool_op(pool, patch_size); + } + } +} + +template +void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + data_T pool[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + for (int jj = 0; jj < CONFIG_T::n_in; jj++) { + pool[jj] = data[jj * CONFIG_T::n_filt + ff]; + } + // do the pooling + res[ff] = pool_op(pool); + } +} + +struct pooling2d_config { + // IO size + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_filt = 4; + static const unsigned stride_height = 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - pool_width) / stride_width + 1; + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + // Pooling function + static const Pool_Op pool_op = Max; + // Reuse factor + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef float accum_t; +}; + +template constexpr int pool_op_limit() { + return (CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + + // Loop over input image y in steps of stride + for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + + unsigned overlap_pixel = 0; + + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + bool cond1 = ii + kk >= CONFIG_T::pad_top 
&& ii + kk < CONFIG_T::in_height + CONFIG_T::pad_top; + bool cond2 = jj + ll >= CONFIG_T::pad_left && jj + ll < CONFIG_T::in_width + CONFIG_T::pad_left; + if (cond1 && cond2) { + unsigned data_idx = + ((ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + (jj + ll - CONFIG_T::pad_left)) * + CONFIG_T::n_filt + + ff; + pool[kk * CONFIG_T::stride_width + ll] = data[data_idx]; + overlap_pixel++; + } else + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + } + } + + int patch_size = CONFIG_T::count_pad ? CONFIG_T::stride_width * CONFIG_T::stride_height : overlap_pixel; + + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = + pool_op(pool, patch_size); + } + } + } +} + +template +void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image y in steps of stride + for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (full_padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (full_padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + pool[kk * CONFIG_T::stride_width + ll] = + data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + + ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + ff * CONFIG_T::out_height * CONFIG_T::out_width] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) { + data_T rescale = + static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + 
ff * CONFIG_T::out_height * CONFIG_T::out_width] *= rescale; + } + } + } + } +} + +template +void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION instances=pool_op limit=limit function + +FiltLoop: + for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; + + InputLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast( + pool_op(pool)); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h b/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h index f68d80663b..3e1ebb225d 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h @@ -1,56 +1,56 @@ -#ifndef NNET_RECR_ACTIVATION_H_ -#define NNET_RECR_ACTIVATION_H_ - -#include "hls_stream.h" -#include "nnet_activation.h" -#include "nnet_common.h" -#include "nnet_helpers.h" -#include - -namespace nnet { - -namespace activation { - -template class Activation { - public: - // ************************************************* - // Blank Activation - // ************************************************* - static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {} // Nothing to do here -}; - -template class relu : public Activation { - public: - // ************************************************* - // Relu Activation - // ************************************************* - static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - nnet::relu(data, res); - } -}; - -template class sigmoid : public Activation { - public: - // ************************************************* - // Sigmoid Activation - // ************************************************* - static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - nnet::sigmoid(data, res); - } -}; - -template class tanh : public Activation { - public: - // ************************************************* - // TanH Activation - // ************************************************* - static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - nnet::tanh(data, res); - } -}; - -} // namespace activation - -} // namespace nnet - -#endif +#ifndef NNET_RECR_ACTIVATION_H_ +#define NNET_RECR_ACTIVATION_H_ + +#include "hls_stream.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include + +namespace nnet { + +namespace activation { + +template class Activation { + public: + // ************************************************* + // Blank Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {} // Nothing to do here +}; + +template class relu : public Activation { + public: + // ************************************************* + // Relu Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T 
res[CONFIG_T::n_in]) { + nnet::relu(data, res); + } +}; + +template class sigmoid : public Activation { + public: + // ************************************************* + // Sigmoid Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::sigmoid(data, res); + } +}; + +template class tanh : public Activation { + public: + // ************************************************* + // TanH Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::tanh(data, res); + } +}; + +} // namespace activation + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h index d3b96ba5fb..bd8c0e05a9 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h @@ -1,586 +1,586 @@ -#ifndef NNET_RECURSIVE_H_ -#define NNET_RECURSIVE_H_ - -#include "hls_stream.h" -#include "nnet_activation.h" -#include "nnet_common.h" -#include "nnet_dense.h" -#include "nnet_recr_activations.h" - -namespace nnet { - -struct lstm_config { - // Internal data type definitions - typedef float weight_t; - typedef float bias_t; - - // Layer Sizes - static const unsigned n_in = 2; - static const unsigned n_parts = 20; - static const unsigned n_out = 2; - static const unsigned n_state = 2; - static const unsigned n_4state = 8; - static const unsigned table_size = 1024; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const unsigned n_zeros = 0; - static const bool store_weights_in_bram = false; - static const bool use_static = true; - - template using activation_recr = nnet::activation::relu; - template using activation = nnet::activation::relu; -}; -// Long Short term Memory NN (LSTM) -// Resources: -// https://github.com/nicodjimenez/lstm/blob/master/lstm.py -// https://github.com/llSourcell/LSTM_Networks/blob/master/LSTM%20Demo.ipynb -// https://en.wikipedia.org/wiki/Long_short-term_memory -// Notes: -// - LSTM naming conventions adopted from the above links -// - s_newstate = activation(U*input + W*state) -// - h_output = activation(U*input + W*state)*activation(s_newstate) -// - If softmax is needed on output, perform *outside* this operations -// Originall had a version allows for the state in each layer to be saved, moved this to above (this requires are LARGE -// dense network at the end) -template -void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - res_T s_newstate[CONFIG_T::n_state], typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { - // Initialize the state variable -- will maintain state between function calls - - typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; - typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; - typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) - typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) - typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras 
notation) - typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) - typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; - - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - #pragma HLS ARRAY_PARTITION variable=s_newstate complete - #pragma HLS ARRAY_PARTITION variable=tmpres complete - #pragma HLS ARRAY_PARTITION variable=tmpres_state complete - #pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete - #pragma HLS ARRAY_PARTITION variable=tmpres_c complete - #pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete - #pragma HLS ARRAY_PARTITION variable=inputacc_c complete - #pragma HLS ARRAY_PARTITION variable=s_actstate complete - - nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_newstate, tmpres_state, param_r, param_br); - - for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc; - if (iacc > 2 * CONFIG_T::n_state - 1) - index = iacc + CONFIG_T::n_state; - inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; - } - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc + CONFIG_T::n_state * 2; - inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; - } - - CONFIG_T::template activation_recr::activation( - inputacc_ifo, tmpres_ifo); - - // Now for the confusion matrix - CONFIG_T::template activation::activation( - inputacc_c, tmpres_c); - - // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - s_newstate[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_newstate[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; - } - // Operation: h=act(s)*o - CONFIG_T::template activation::activation( - s_newstate, s_actstate); - - for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { - #pragma HLS UNROLL - h_newstate[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; - } -} - -template -void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - res_T s_newstate[CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { - static res_T h_state[CONFIG_T::n_state]; - static res_T s_state[CONFIG_T::n_state]; - // Initialize the state variable -- will maintain state between function calls - typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; - typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; - typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) - typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) - typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) - typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) - typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; - - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - #pragma HLS ARRAY_PARTITION variable=s_newstate complete - #pragma HLS ARRAY_PARTITION variable=h_state complete - #pragma HLS ARRAY_PARTITION variable=s_state complete - #pragma HLS ARRAY_PARTITION variable=tmpres complete - #pragma HLS ARRAY_PARTITION variable=tmpres_state complete - #pragma HLS ARRAY_PARTITION variable=tmpres_ifo 
complete - #pragma HLS ARRAY_PARTITION variable=tmpres_c complete - #pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete - #pragma HLS ARRAY_PARTITION variable=inputacc_c complete - #pragma HLS ARRAY_PARTITION variable=s_actstate complete - - if (reset_state) { - for (int i_state = 0; i_state < (CONFIG_T::n_state); i_state++) { - #pragma HLS UNROLL - s_state[i_state] = 0; - h_state[i_state] = 0; - } - } - - nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_state, tmpres_state, param_r, - param_br); - - for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc; - if (iacc > 2 * CONFIG_T::n_state - 1) - index = iacc + CONFIG_T::n_state; - inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; - } - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc + CONFIG_T::n_state * 2; - inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; - } - - CONFIG_T::template activation_recr::activation( - inputacc_ifo, tmpres_ifo); - - // Now for the confusion matrix - CONFIG_T::template activation::activation( - inputacc_c, tmpres_c); - - // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - s_state[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_state[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; - s_newstate[iacc] = s_state[iacc]; - } - // Operation: h=act(s)*o - CONFIG_T::template activation::activation( - s_state, s_actstate); - - for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { - #pragma HLS UNROLL - h_state[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; - h_newstate[iacc] = h_state[iacc]; - } -} - -template -void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { - - res_T h_newstate[CONFIG_T::n_state]; - res_T s_newstate[CONFIG_T::n_state]; - data_T data_in[CONFIG_T::n_in]; - bool reset_state = true; - - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - #pragma HLS ARRAY_PARTITION variable=s_newstate complete - - for (int ii = 0; ii < CONFIG_T::n_state; ii++) { - #pragma HLS UNROLL - h_newstate[ii] = 0; - s_newstate[ii] = 0; - } - for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { - for (int j = 0; j < CONFIG_T::n_in; j++) { - #pragma HLS UNROLL - data_in[j] = data[j + iloop * CONFIG_T::n_in]; - } - if (CONFIG_T::use_static) - nnet::lstm_static(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, - param_br); - else - nnet::lstm(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, - param_br); - if (CONFIG_T::n_sequence_out > 1) - for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { - #pragma HLS UNROLL - res[i] = h_newstate[j]; - } - reset_state = false; - } - if (CONFIG_T::n_sequence_out == 1) - for (int i = 0; i < (CONFIG_T::n_state); i++) { - #pragma HLS UNROLL - res[i] = h_newstate[i]; - } -} - -template -void lstm_stack(hls::stream &data_stream, hls::stream &res_stream, - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * 
CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { - - typename res_T::value_type h_newstate[CONFIG_T::n_state]; - typename res_T::value_type s_newstate[CONFIG_T::n_state]; - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - #pragma HLS ARRAY_PARTITION variable=s_newstate complete - - for (int ii = 0; ii < CONFIG_T::n_state; ii++) { - #pragma HLS UNROLL - h_newstate[ii] = 0; - s_newstate[ii] = 0; - } - - typename data_T::value_type data_in[CONFIG_T::n_in]; - bool reset_state = true; - -DataPropagation: - for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { - if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { - // #pragma HLS PIPELINE - } - data_T data_pack = data_stream.read(); - DataPack: - for (int i_pack = 0; i_pack < data_T::size; i_pack++) { - #pragma HLS UNROLL - data_in[i_pack] = data_pack[i_pack]; - } - if (CONFIG_T::use_static) - nnet::lstm_static( - reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); - else - nnet::lstm( - reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); - if (CONFIG_T::n_sequence_out > 1) { - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - ResPack_sequences: - for (int i_pack = 0; i_pack < res_T::size; i_pack++) { - #pragma HLS UNROLL - res_pack[i_pack] = h_newstate[i_pack]; - } - res_stream.write(res_pack); - } - reset_state = false; - } - - if (CONFIG_T::n_sequence_out == 1) { - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - ResPack: - for (int i_pack = 0; i_pack < res_T::size; i_pack++) { - #pragma HLS UNROLL - res_pack[i_pack] = h_newstate[i_pack]; - } - res_stream.write(res_pack); - } -} - -// Struct for the GRU template - -struct gru_config { - // Internal data type definitions - typedef float weight_t; - typedef float bias_t; - typedef float accum_t; - - // Layer Sizes - static const unsigned n_in = 2; - static const unsigned n_out = 2; - static const unsigned n_state = 2; - static const unsigned n_sequence = 2; - static const unsigned n_4state = 8; - static const unsigned table_size = 1024; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const bool use_static = true; - static const bool pytorch_order = false; - static const unsigned n_zeros = 0; - - template using activation_recr = nnet::activation::relu; - template using activation = nnet::activation::relu; -}; - -template -void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], // TODO - Check the layout of the param - // weights - refer page in copy!! 
- typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { - // Initialize the state variable -- will maintain state between function calls - typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; - typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; - typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; - typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated i,f,o matrices (keras notation) - typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated c-matrix (keras notation) - typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // i,f,o matrices (keras notation) - typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // c-matrix (keras notation) - - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - #pragma HLS ARRAY_PARTITION variable=tmpres complete - #pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete - #pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete - #pragma HLS ARRAY_PARTITION variable=tmpres_zr complete - #pragma HLS ARRAY_PARTITION variable=tmpres_h complete - #pragma HLS ARRAY_PARTITION variable=inputacc_zr complete - #pragma HLS ARRAY_PARTITION variable=inputacc_h complete - - nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_newstate, tmpres_state_zr, param_zr, - param_br); - - // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres - // initialized with biases -- DONE - for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc; - inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; - } - - // Activation function Sub layer -- START - CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); - - // Activation function Sub layer -- END - - // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - if (CONFIG_T::pytorch_order) - tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; - else - tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; - } - - // Assuming reset_after is false - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc + CONFIG_T::n_state * 2; - inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; - } - - // Now run the activation on this guy - CONFIG_T::template activation::activation(inputacc_h, tmpres_h); - - // Mix the stat with the previous state - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - if (CONFIG_T::pytorch_order) - h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) + - h_newstate[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]); - else - h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_newstate[iacc] * tmpres_zr[iacc]); - } -} - -template -void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { - // Initialize the state 
variable -- will maintain state between function calls - - static res_T h_state[CONFIG_T::n_state]; - typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; - typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; - typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; - typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated i,f,o matrices (keras notation) - typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated c-matrix (keras notation) - typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // i,f,o matrices (keras notation) - typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // c-matrix (keras notation) - - #pragma HLS ARRAY_PARTITION variable=h_state complete - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - #pragma HLS ARRAY_PARTITION variable=tmpres complete - #pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete - #pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete - #pragma HLS ARRAY_PARTITION variable=tmpres_zr complete - #pragma HLS ARRAY_PARTITION variable=tmpres_h complete - #pragma HLS ARRAY_PARTITION variable=inputacc_zr complete - #pragma HLS ARRAY_PARTITION variable=inputacc_h complete - - if (reset_state) { - for (int i_h_state = 0; i_h_state < (CONFIG_T::n_state); i_h_state++) { - #pragma HLS UNROLL - h_state[i_h_state] = 0; - } - } - - nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_state, tmpres_state_zr, param_zr, - param_br); - - // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres - // initialized with biases -- DONE - for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc; - inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; - } - - // Activation function Sub layer -- START - CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); - - // Activation function Sub layer -- END - - // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - if (CONFIG_T::pytorch_order) - tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; - else - tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; - } - - // Assuming reset_after is false - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc + CONFIG_T::n_state * 2; - inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; - } - - // Now run the activation on this guy - CONFIG_T::template activation::activation(inputacc_h, tmpres_h); - - // Mix the stat with the previous state - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - if (CONFIG_T::pytorch_order) - h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) + - h_state[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]); - else - h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_state[iacc] * tmpres_zr[iacc]); - h_newstate[iacc] = h_state[iacc]; - } -} - -template -void gru_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename 
CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { - - res_T h_state[CONFIG_T::n_state]; - data_T data_in[CONFIG_T::n_in]; - bool reset_state = true; - - #pragma HLS ARRAY_PARTITION variable=h_state complete - #pragma HLS ARRAY_PARTITION variable=data_in complete - - for (int ii = 0; ii < CONFIG_T::n_state; ii++) { - #pragma HLS UNROLL - h_state[ii] = 0; - } - for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { - for (int j = 0; j < CONFIG_T::n_in; j++) { - #pragma HLS UNROLL - data_in[j] = data[j + iloop * CONFIG_T::n_in]; - } - if (CONFIG_T::use_static) - nnet::gru_static(reset_state, data_in, h_state, param, param_zr, param_b, param_br); - else - nnet::gru(reset_state, data_in, h_state, param, param_zr, param_b, param_br); - if (CONFIG_T::n_sequence_out > 1) - for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { - #pragma HLS UNROLL - res[i] = h_state[j]; - } - reset_state = false; - } - if (CONFIG_T::n_sequence_out == 1) - for (int i = 0; i < (CONFIG_T::n_state); i++) { - #pragma HLS UNROLL - res[i] = h_state[i]; - } -} - -template -void gru_stack(hls::stream &data_stream, hls::stream &res_stream, - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { - - typename res_T::value_type h_newstate[CONFIG_T::n_state]; - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - for (int ii = 0; ii < CONFIG_T::n_state; ii++) { - #pragma HLS UNROLL - h_newstate[ii] = 0; - } - - typename data_T::value_type data_in[CONFIG_T::n_in]; - bool reset_state = true; - -DataPropagation: - for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { - if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { - // #pragma HLS PIPELINE - } - data_T data_pack = data_stream.read(); - DataPack: - for (int i_pack = 0; i_pack < data_T::size; i_pack++) { - #pragma HLS UNROLL - data_in[i_pack] = data_pack[i_pack]; - } - if (CONFIG_T::use_static) - nnet::gru_static( - reset_state, data_in, h_newstate, param, param_zr, param_b, param_br); - else - nnet::gru(reset_state, data_in, h_newstate, - param, param_zr, param_b, param_br); - if (CONFIG_T::n_sequence_out > 1) { - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - ResPack_sequences: - for (int i_pack = 0; i_pack < res_T::size; i_pack++) { - #pragma HLS UNROLL - res_pack[i_pack] = h_newstate[i_pack]; - } - res_stream.write(res_pack); - } - reset_state = false; - } - - if (CONFIG_T::n_sequence_out == 1) { - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - ResPack: - for (int i_pack = 0; i_pack < res_T::size; i_pack++) { - #pragma HLS UNROLL - res_pack[i_pack] = h_newstate[i_pack]; - } - res_stream.write(res_pack); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_RECURSIVE_H_ +#define NNET_RECURSIVE_H_ + +#include "hls_stream.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_recr_activations.h" + +namespace nnet { + +struct lstm_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + + // Layer Sizes + static const unsigned n_in = 2; + static const unsigned n_parts = 20; + static const unsigned n_out = 2; + static const unsigned n_state = 2; + static const unsigned n_4state = 8; + static const unsigned table_size = 1024; + + // Resource reuse info + static const 
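+    // The config struct is the compile-time contract between the Python
+    // converter and this kernel; generated projects specialize it instead of
+    // passing runtime parameters. A hypothetical concrete specialization
+    // (field values are illustrative only):
+    //
+    //   struct lstm_config_1 : nnet::lstm_config {
+    //       typedef ap_fixed<16, 6> weight_t;
+    //       typedef ap_fixed<16, 6> bias_t;
+    //       typedef ap_fixed<16, 6> accum_t;   // used by the kernel below
+    //       static const unsigned n_in = 8;    // features per timestep
+    //       static const unsigned n_state = 16;
+    //       static const unsigned reuse_factor = 4;
+    //   };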
unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; + static const bool store_weights_in_bram = false; + static const bool use_static = true; + + template using activation_recr = nnet::activation::relu; + template using activation = nnet::activation::relu; +}; +// Long Short term Memory NN (LSTM) +// Resources: +// https://github.com/nicodjimenez/lstm/blob/master/lstm.py +// https://github.com/llSourcell/LSTM_Networks/blob/master/LSTM%20Demo.ipynb +// https://en.wikipedia.org/wiki/Long_short-term_memory +// Notes: +// - LSTM naming conventions adopted from the above links +// - s_newstate = activation(U*input + W*state) +// - h_output = activation(U*input + W*state)*activation(s_newstate) +// - If softmax is needed on output, perform *outside* this operations +// Originall had a version allows for the state in each layer to be saved, moved this to above (this requires are LARGE +// dense network at the end) +template +void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + res_T s_newstate[CONFIG_T::n_state], typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + // Initialize the state variable -- will maintain state between function calls + + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) + typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; + + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + #pragma HLS ARRAY_PARTITION variable=tmpres complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state complete + #pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete + #pragma HLS ARRAY_PARTITION variable=tmpres_c complete + #pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete + #pragma HLS ARRAY_PARTITION variable=inputacc_c complete + #pragma HLS ARRAY_PARTITION variable=s_actstate complete + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_newstate, tmpres_state, param_r, param_br); + + for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc; + if (iacc > 2 * CONFIG_T::n_state - 1) + index = iacc + CONFIG_T::n_state; + inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; + } + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; + } + + CONFIG_T::template activation_recr::activation( + inputacc_ifo, tmpres_ifo); + + // Now for the confusion matrix + CONFIG_T::template activation::activation( + inputacc_c, tmpres_c); + + // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + s_newstate[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + 
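+        // Gate layout in tmpres follows the Keras kernel ordering
+        // [ i | f | c | o ], each block n_state wide. The index remap above
+        // skips the candidate block c when gathering i, f, o:
+        //   iacc in [0, 2*n_state)         -> index = iacc            (i, f)
+        //   iacc in [2*n_state, 3*n_state) -> index = iacc + n_state  (o)
+        // while c is read separately at index = iacc + 2*n_state. With
+        // n_state = 2 that is tmpres[0..3] for i and f, tmpres[6..7] for o,
+        // and tmpres[4..5] for c.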
s_newstate[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; + } + // Operation: h=act(s)*o + CONFIG_T::template activation::activation( + s_newstate, s_actstate); + + for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { + #pragma HLS UNROLL + h_newstate[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + } +} + +template +void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + res_T s_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + static res_T h_state[CONFIG_T::n_state]; + static res_T s_state[CONFIG_T::n_state]; + // Initialize the state variable -- will maintain state between function calls + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) + typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; + + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + #pragma HLS ARRAY_PARTITION variable=h_state complete + #pragma HLS ARRAY_PARTITION variable=s_state complete + #pragma HLS ARRAY_PARTITION variable=tmpres complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state complete + #pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete + #pragma HLS ARRAY_PARTITION variable=tmpres_c complete + #pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete + #pragma HLS ARRAY_PARTITION variable=inputacc_c complete + #pragma HLS ARRAY_PARTITION variable=s_actstate complete + + if (reset_state) { + for (int i_state = 0; i_state < (CONFIG_T::n_state); i_state++) { + #pragma HLS UNROLL + s_state[i_state] = 0; + h_state[i_state] = 0; + } + } + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_state, tmpres_state, param_r, + param_br); + + for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc; + if (iacc > 2 * CONFIG_T::n_state - 1) + index = iacc + CONFIG_T::n_state; + inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; + } + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; + } + + CONFIG_T::template activation_recr::activation( + inputacc_ifo, tmpres_ifo); + + // Now for the confusion matrix + CONFIG_T::template activation::activation( + inputacc_c, tmpres_c); + + // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + s_state[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_state[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; + s_newstate[iacc] = s_state[iacc]; + } + // Operation: h=act(s)*o + CONFIG_T::template activation::activation( + s_state, s_actstate); + + for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { + #pragma HLS UNROLL + 
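+        // In the *_static variants the recurrent state lives in static arrays
+        // (h_state, s_state), so it persists across calls; reset_state, asserted
+        // only on the first timestep, clears it. A rough sketch of the calling
+        // pattern for a 3-step sequence:
+        //
+        //   lstm_static<...>(true,  x0, h, s, w, wr, b, br);  // state zeroed
+        //   lstm_static<...>(false, x1, h, s, w, wr, b, br);  // state carried
+        //   lstm_static<...>(false, x2, h, s, w, wr, b, br);
+        //
+        // Since the storage is static, each layer must get its own hardware
+        // instance of this function; sharing one would mix up layer states.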
h_state[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + h_newstate[iacc] = h_state[iacc]; + } +} + +template +void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + res_T h_newstate[CONFIG_T::n_state]; + res_T s_newstate[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + #pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::lstm_static(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + else + nnet::lstm(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + #pragma HLS UNROLL + res[i] = h_newstate[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + #pragma HLS UNROLL + res[i] = h_newstate[i]; + } +} + +template +void lstm_stack(hls::stream &data_stream, hls::stream &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + typename res_T::value_type s_newstate[CONFIG_T::n_state]; + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // #pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::lstm_static( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + else + nnet::lstm( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack_sequences: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } + reset_state = false; + } + + if (CONFIG_T::n_sequence_out == 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack: + for (int i_pack = 0; i_pack < 
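+        // Streaming variant: data_T is a packed array type (for example
+        // nnet::array<ap_fixed<16, 6>, N>) carrying data_T::size elements per
+        // stream word. The DataPropagation bound n_sequence * n_in / data_T::size
+        // together with the per-read LSTM call effectively requires
+        // data_T::size == n_in, i.e. one stream word per timestep:
+        //   read 0 -> x[0][0..n_in-1], read 1 -> x[1][0..n_in-1], ...
+        // PRAGMA_DATA_PACK on res_pack maps to the HLS data-packing directive so
+        // a full hidden state leaves as a single stream word as well.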
res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } +} + +// Struct for the GRU template + +struct gru_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 2; + static const unsigned n_out = 2; + static const unsigned n_state = 2; + static const unsigned n_sequence = 2; + static const unsigned n_4state = 8; + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const bool use_static = true; + static const bool pytorch_order = false; + static const unsigned n_zeros = 0; + + template using activation_recr = nnet::activation::relu; + template using activation = nnet::activation::relu; +}; + +template +void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], // TODO - Check the layout of the param + // weights - refer page in copy!! + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + // Initialize the state variable -- will maintain state between function calls + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; + typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // c-matrix (keras notation) + + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=tmpres complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete + #pragma HLS ARRAY_PARTITION variable=tmpres_zr complete + #pragma HLS ARRAY_PARTITION variable=tmpres_h complete + #pragma HLS ARRAY_PARTITION variable=inputacc_zr complete + #pragma HLS ARRAY_PARTITION variable=inputacc_h complete + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_newstate, tmpres_state_zr, param_zr, + param_br); + + // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres + // initialized with biases -- DONE + for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc; + inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; + } + + // Activation function Sub layer -- START + CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); + + // Activation function Sub layer -- END + + // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + if (CONFIG_T::pytorch_order) + tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + else + tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * 
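+        // GRU recap for this kernel, using the Keras weight ordering [ z | r | h ]:
+        //   z  = act_recr(W_z x + U_z h_prev)          (update gate)
+        //   r  = act_recr(W_r x + U_r h_prev)          (reset gate)
+        //   h~ = act(W_h x + r .* (U_h h_prev))        (candidate)
+        //   h  = (1 - z) .* h~ + z .* h_prev
+        // pytorch_order swaps the first two blocks to [ r | z ], which is why the
+        // two branches index tmpres_zr with and without the n_state offset.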
CONFIG_T::n_state)]; + } + + // Assuming reset_after is false + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; + } + + // Now run the activation on this guy + CONFIG_T::template activation::activation(inputacc_h, tmpres_h); + + // Mix the stat with the previous state + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + if (CONFIG_T::pytorch_order) + h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) + + h_newstate[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]); + else + h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_newstate[iacc] * tmpres_zr[iacc]); + } +} + +template +void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + // Initialize the state variable -- will maintain state between function calls + + static res_T h_state[CONFIG_T::n_state]; + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; + typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // c-matrix (keras notation) + + #pragma HLS ARRAY_PARTITION variable=h_state complete + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=tmpres complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete + #pragma HLS ARRAY_PARTITION variable=tmpres_zr complete + #pragma HLS ARRAY_PARTITION variable=tmpres_h complete + #pragma HLS ARRAY_PARTITION variable=inputacc_zr complete + #pragma HLS ARRAY_PARTITION variable=inputacc_h complete + + if (reset_state) { + for (int i_h_state = 0; i_h_state < (CONFIG_T::n_state); i_h_state++) { + #pragma HLS UNROLL + h_state[i_h_state] = 0; + } + } + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_state, tmpres_state_zr, param_zr, + param_br); + + // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres + // initialized with biases -- DONE + for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc; + inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; + } + + // Activation function Sub layer -- START + CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); + + // Activation function Sub layer -- END + + // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + if (CONFIG_T::pytorch_order) + tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + else + tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * 
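+        // Both matrix products above reuse the generic dense kernel, with the
+        // biases folded into the same calls, so each timestep costs
+        //   x (n_in)     * W (3*n_state x n_in)    + b   -> tmpres
+        //   h (n_state)  * U (3*n_state x n_state) + br  -> tmpres_state_zr
+        // i.e. 3*n_state*(n_in + n_state) multiplies before the element-wise
+        // gate math; this is where reuse_factor has the most leverage.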
tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + } + + // Assuming reset_after is false + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; + } + + // Now run the activation on this guy + CONFIG_T::template activation::activation(inputacc_h, tmpres_h); + + // Mix the stat with the previous state + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + if (CONFIG_T::pytorch_order) + h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) + + h_state[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]); + else + h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_state[iacc] * tmpres_zr[iacc]); + h_newstate[iacc] = h_state[iacc]; + } +} + +template +void gru_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + + res_T h_state[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + #pragma HLS ARRAY_PARTITION variable=h_state complete + #pragma HLS ARRAY_PARTITION variable=data_in complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_state[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + #pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::gru_static(reset_state, data_in, h_state, param, param_zr, param_b, param_br); + else + nnet::gru(reset_state, data_in, h_state, param, param_zr, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + #pragma HLS UNROLL + res[i] = h_state[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + #pragma HLS UNROLL + res[i] = h_state[i]; + } +} + +template +void gru_stack(hls::stream &data_stream, hls::stream &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // #pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::gru_static( + reset_state, data_in, h_newstate, param, param_zr, param_b, param_br); + else + nnet::gru(reset_state, data_in, h_newstate, + param, param_zr, param_b, 
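+        // n_sequence_out encodes Keras' return_sequences: when it equals
+        // n_sequence, every timestep's hidden state is emitted (timestep-major),
+        // and when it is 1 only the final state is. For example, n_sequence = 4
+        // and n_state = 3 with return_sequences=True gives
+        //   res[0..2] = h(t0), res[3..5] = h(t1), ..., res[9..11] = h(t3).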
param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack_sequences: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } + reset_state = false; + } + + if (CONFIG_T::n_sequence_out == 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/vivado_synth.tcl b/hls4ml/templates/vivado/vivado_synth.tcl index 5a2bcfc453..fba1387c5a 100644 --- a/hls4ml/templates/vivado/vivado_synth.tcl +++ b/hls4ml/templates/vivado/vivado_synth.tcl @@ -1,6 +1,6 @@ -set tcldir [file dirname [info script]] -source [file join $tcldir project.tcl] - -add_files ${project_name}_prj/solution1/syn/verilog -synth_design -top ${project_name} -part $part -report_utilization -file vivado_synth.rpt +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +add_files ${project_name}_prj/solution1/syn/verilog +synth_design -top ${project_name} -part $part +report_utilization -file vivado_synth.rpt diff --git a/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_control_s_axi.v b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_control_s_axi.v index c4a76ef0c3..c532d2fa14 100644 --- a/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_control_s_axi.v +++ b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_control_s_axi.v @@ -1,422 +1,422 @@ -/** -* Copyright (C) 2019-2021 Xilinx, Inc -* -* Licensed under the Apache License, Version 2.0 (the "License"). You may -* not use this file except in compliance with the License. A copy of the -* License is located at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. 
-*/ - -`timescale 1ns/1ps -module krnl_rtl_control_s_axi -#(parameter - C_S_AXI_ADDR_WIDTH = 6, - C_S_AXI_DATA_WIDTH = 32 -)( - // axi4 lite slave signals - input wire ACLK, - input wire ARESET, - input wire ACLK_EN, - input wire [C_S_AXI_ADDR_WIDTH-1:0] AWADDR, - input wire AWVALID, - output wire AWREADY, - input wire [C_S_AXI_DATA_WIDTH-1:0] WDATA, - input wire [C_S_AXI_DATA_WIDTH/8-1:0] WSTRB, - input wire WVALID, - output wire WREADY, - output wire [1:0] BRESP, - output wire BVALID, - input wire BREADY, - input wire [C_S_AXI_ADDR_WIDTH-1:0] ARADDR, - input wire ARVALID, - output wire ARREADY, - output wire [C_S_AXI_DATA_WIDTH-1:0] RDATA, - output wire [1:0] RRESP, - output wire RVALID, - input wire RREADY, - output wire interrupt, - // user signals - output wire ap_start, - input wire ap_done, - input wire ap_ready, - input wire ap_idle, - output wire [63:0] fifo_in, - output wire [63:0] fifo_out, - output wire [31:0] length_r_in, - output wire [31:0] length_r_out -); -//------------------------Address Info------------------- -// 0x00 : Control signals -// bit 0 - ap_start (Read/Write/COH) -// bit 1 - ap_done (Read/COR) -// bit 2 - ap_idle (Read) -// bit 3 - ap_ready (Read) -// bit 7 - auto_restart (Read/Write) -// others - reserved -// 0x04 : Global Interrupt Enable Register -// bit 0 - Global Interrupt Enable (Read/Write) -// others - reserved -// 0x08 : IP Interrupt Enable Register (Read/Write) -// bit 0 - Channel 0 (ap_done) -// bit 1 - Channel 1 (ap_ready) -// others - reserved -// 0x0c : IP Interrupt Status Register (Read/TOW) -// bit 0 - Channel 0 (ap_done) -// bit 1 - Channel 1 (ap_ready) -// others - reserved -// 0x10 : Data signal of fifo_in -// bit 31~0 - a[31:0] (Read/Write) -// 0x14 : Data signal of fifo_in -// bit 31~0 - a[63:32] (Read/Write) -// 0x18 : reserved -// 0x1c : Data signal of fifo_out -// bit 31~0 - b[31:0] (Read/Write) -// 0x20 : Data signal of fifo_out -// bit 31~0 - b[63:32] (Read/Write) -// 0x24 : reserved -// 0x28 : Data signal of length_r_in -// bit 31~0 - length_r[31:0] (Read/Write) -// 0x2c : reserved -// 0x30 : Data signal of length_r_out -// bit 31~0 - length_r[31:0] (Read/Write) -// 0x34 : reserved -// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake) - -//------------------------Parameter---------------------- -localparam - ADDR_AP_CTRL = 6'h00, - ADDR_GIE = 6'h04, - ADDR_IER = 6'h08, - ADDR_ISR = 6'h0c, - ADDR_FIFO_IN_DATA_0 = 6'h10, - ADDR_FIFO_IN_DATA_1 = 6'h14, - ADDR_FIFO_IN_CTRL = 6'h18, - ADDR_FIFO_OUT_DATA_0 = 6'h1c, - ADDR_FIFO_OUT_DATA_1 = 6'h20, - ADDR_FIFO_OUT_CTRL = 6'h24, - ADDR_LENGTH_R_IN_DATA_0 = 6'h28, - ADDR_LENGTH_R_IN_CTRL = 6'h2c, - ADDR_LENGTH_R_OUT_DATA_0 = 6'h30, - ADDR_LENGTH_R_OUT_CTRL = 6'h34, - WRIDLE = 2'd0, - WRDATA = 2'd1, - WRRESP = 2'd2, - RDIDLE = 2'd0, - RDDATA = 2'd1, - ADDR_BITS = 6; - -//------------------------Local signal------------------- - reg [1:0] wstate = WRIDLE; - reg [1:0] wnext; - reg [ADDR_BITS-1:0] waddr; - wire [31:0] wmask; - wire aw_hs; - wire w_hs; - reg [1:0] rstate = RDIDLE; - reg [1:0] rnext; - reg [31:0] rdata; - wire ar_hs; - wire [ADDR_BITS-1:0] raddr; - // internal registers - wire int_ap_idle; - wire int_ap_ready; - reg int_ap_done = 1'b0; - reg int_ap_start = 1'b0; - reg int_auto_restart = 1'b0; - reg int_gie = 2'b0; - reg [1:0] int_ier = 2'b0; - reg [1:0] int_isr = 2'b0; - reg [63:0] int_fifo_in = 64'b0; - reg [63:0] int_fifo_out = 64'b0; - reg [63:0] int_length_r_in = 32'b0; - reg [31:0] int_length_r_out = 32'b0; - 
-//------------------------Instantiation------------------ - -//------------------------AXI write fsm------------------ -assign AWREADY = (~ARESET) & (wstate == WRIDLE); -assign WREADY = (wstate == WRDATA); -assign BRESP = 2'b00; // OKAY -assign BVALID = (wstate == WRRESP); -assign wmask = { {8{WSTRB[3]}}, {8{WSTRB[2]}}, {8{WSTRB[1]}}, {8{WSTRB[0]}} }; -assign aw_hs = AWVALID & AWREADY; -assign w_hs = WVALID & WREADY; - -// wstate -always @(posedge ACLK) begin - if (ARESET) - wstate <= WRIDLE; - else if (ACLK_EN) - wstate <= wnext; -end - -// wnext -always @(*) begin - case (wstate) - WRIDLE: - if (AWVALID) - wnext = WRDATA; - else - wnext = WRIDLE; - WRDATA: - if (WVALID) - wnext = WRRESP; - else - wnext = WRDATA; - WRRESP: - if (BREADY) - wnext = WRIDLE; - else - wnext = WRRESP; - default: - wnext = WRIDLE; - endcase -end - -// waddr -always @(posedge ACLK) begin - if (ACLK_EN) begin - if (aw_hs) - waddr <= AWADDR[ADDR_BITS-1:0]; - end -end - -//------------------------AXI read fsm------------------- -assign ARREADY = (~ARESET) && (rstate == RDIDLE); -assign RDATA = rdata; -assign RRESP = 2'b00; // OKAY -assign RVALID = (rstate == RDDATA); -assign ar_hs = ARVALID & ARREADY; -assign raddr = ARADDR[ADDR_BITS-1:0]; - -// rstate -always @(posedge ACLK) begin - if (ARESET) - rstate <= RDIDLE; - else if (ACLK_EN) - rstate <= rnext; -end - -// rnext -always @(*) begin - case (rstate) - RDIDLE: - if (ARVALID) - rnext = RDDATA; - else - rnext = RDIDLE; - RDDATA: - if (RREADY & RVALID) - rnext = RDIDLE; - else - rnext = RDDATA; - default: - rnext = RDIDLE; - endcase -end - -// rdata -always @(posedge ACLK) begin - if (ACLK_EN) begin - if (ar_hs) begin - rdata <= 1'b0; - case (raddr) - ADDR_AP_CTRL: begin - rdata[0] <= int_ap_start; - rdata[1] <= int_ap_done; - rdata[2] <= int_ap_idle; - rdata[3] <= int_ap_ready; - rdata[7] <= int_auto_restart; - end - ADDR_GIE: begin - rdata <= int_gie; - end - ADDR_IER: begin - rdata <= int_ier; - end - ADDR_ISR: begin - rdata <= int_isr; - end - ADDR_FIFO_IN_DATA_0: begin - rdata <= int_fifo_in[31:0]; - end - ADDR_FIFO_IN_DATA_1: begin - rdata <= int_fifo_in[63:32]; - end - ADDR_FIFO_OUT_DATA_0: begin - rdata <= int_fifo_out[31:0]; - end - ADDR_FIFO_OUT_DATA_1: begin - rdata <= int_fifo_out[63:32]; - end - ADDR_LENGTH_R_IN_DATA_0: begin - rdata <= int_length_r_in[31:0]; - end - ADDR_LENGTH_R_OUT_DATA_0: begin - rdata <= int_length_r_out[31:0]; - end - endcase - end - end -end - - -//------------------------Register logic----------------- -assign interrupt = int_gie & (|int_isr); -assign ap_start = int_ap_start; -assign int_ap_idle = ap_idle; -assign int_ap_ready = ap_ready; -assign fifo_in = int_fifo_in; -assign fifo_out = int_fifo_out; -assign length_r_in = int_length_r_in; -assign length_r_out = int_length_r_out; -// int_ap_start -always @(posedge ACLK) begin - if (ARESET) - int_ap_start <= 1'b0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_AP_CTRL && WSTRB[0] && WDATA[0]) - int_ap_start <= 1'b1; - else if (int_ap_ready) - int_ap_start <= int_auto_restart; // clear on handshake/auto restart - end -end - -// int_ap_done -always @(posedge ACLK) begin - if (ARESET) - int_ap_done <= 1'b0; - else if (ACLK_EN) begin - if (ap_done) - int_ap_done <= 1'b1; - else if (ar_hs && raddr == ADDR_AP_CTRL) - int_ap_done <= 1'b0; // clear on read - end -end - -// int_auto_restart -always @(posedge ACLK) begin - if (ARESET) - int_auto_restart <= 1'b0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_AP_CTRL && WSTRB[0]) - int_auto_restart <= WDATA[7]; - end -end - 
-// int_gie -always @(posedge ACLK) begin - if (ARESET) - int_gie <= 1'b0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_GIE && WSTRB[0]) - int_gie <= WDATA[0]; - end -end - -// int_ier -always @(posedge ACLK) begin - if (ARESET) - int_ier <= 1'b0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_IER && WSTRB[0]) - int_ier <= WDATA[1:0]; - end -end - -// int_isr[0] -always @(posedge ACLK) begin - if (ARESET) - int_isr[0] <= 1'b0; - else if (ACLK_EN) begin - if (int_ier[0] & ap_done) - int_isr[0] <= 1'b1; - else if (w_hs && waddr == ADDR_ISR && WSTRB[0]) - int_isr[0] <= int_isr[0] ^ WDATA[0]; // toggle on write - end -end - -// int_isr[1] -always @(posedge ACLK) begin - if (ARESET) - int_isr[1] <= 1'b0; - else if (ACLK_EN) begin - if (int_ier[1] & ap_ready) - int_isr[1] <= 1'b1; - else if (w_hs && waddr == ADDR_ISR && WSTRB[0]) - int_isr[1] <= int_isr[1] ^ WDATA[1]; // toggle on write - end -end - -// int_fifo_in[31:0] -always @(posedge ACLK) begin - if (ARESET) - int_fifo_in[31:0] <= 0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_FIFO_IN_DATA_0) - int_fifo_in[31:0] <= (WDATA[31:0] & wmask) | (int_fifo_in[31:0] & ~wmask); - end -end - -// int_fifo_in[63:32] -always @(posedge ACLK) begin - if (ARESET) - int_fifo_in[63:32] <= 0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_FIFO_IN_DATA_1) - int_fifo_in[63:32] <= (WDATA[31:0] & wmask) | (int_fifo_in[63:32] & ~wmask); - end -end - -// int_fifo_out[31:0] -always @(posedge ACLK) begin - if (ARESET) - int_fifo_out[31:0] <= 0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_FIFO_OUT_DATA_0) - int_fifo_out[31:0] <= (WDATA[31:0] & wmask) | (int_fifo_out[31:0] & ~wmask); - end -end - -// int_fifo_out[63:32] -always @(posedge ACLK) begin - if (ARESET) - int_fifo_out[63:32] <= 0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_FIFO_OUT_DATA_1) - int_fifo_out[63:32] <= (WDATA[31:0] & wmask) | (int_fifo_out[63:32] & ~wmask); - end -end - -// int_length_r_in[31:0] -always @(posedge ACLK) begin - if (ARESET) - int_length_r_in[31:0] <= 0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_LENGTH_R_IN_DATA_0) - int_length_r_in[31:0] <= (WDATA[31:0] & wmask) | (int_length_r_in[31:0] & ~wmask); - end -end - - -// int_length_r_out[31:0] -always @(posedge ACLK) begin - if (ARESET) - int_length_r_out[31:0] <= 0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_LENGTH_R_OUT_DATA_0) - int_length_r_out[31:0] <= (WDATA[31:0] & wmask) | (int_length_r_out[31:0] & ~wmask); - end -end - - -//------------------------Memory logic------------------- - -endmodule +/** +* Copyright (C) 2019-2021 Xilinx, Inc +* +* Licensed under the Apache License, Version 2.0 (the "License"). You may +* not use this file except in compliance with the License. A copy of the +* License is located at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +* License for the specific language governing permissions and limitations +* under the License. 
+*/ + +`timescale 1ns/1ps +module krnl_rtl_control_s_axi +#(parameter + C_S_AXI_ADDR_WIDTH = 6, + C_S_AXI_DATA_WIDTH = 32 +)( + // axi4 lite slave signals + input wire ACLK, + input wire ARESET, + input wire ACLK_EN, + input wire [C_S_AXI_ADDR_WIDTH-1:0] AWADDR, + input wire AWVALID, + output wire AWREADY, + input wire [C_S_AXI_DATA_WIDTH-1:0] WDATA, + input wire [C_S_AXI_DATA_WIDTH/8-1:0] WSTRB, + input wire WVALID, + output wire WREADY, + output wire [1:0] BRESP, + output wire BVALID, + input wire BREADY, + input wire [C_S_AXI_ADDR_WIDTH-1:0] ARADDR, + input wire ARVALID, + output wire ARREADY, + output wire [C_S_AXI_DATA_WIDTH-1:0] RDATA, + output wire [1:0] RRESP, + output wire RVALID, + input wire RREADY, + output wire interrupt, + // user signals + output wire ap_start, + input wire ap_done, + input wire ap_ready, + input wire ap_idle, + output wire [63:0] fifo_in, + output wire [63:0] fifo_out, + output wire [31:0] length_r_in, + output wire [31:0] length_r_out +); +//------------------------Address Info------------------- +// 0x00 : Control signals +// bit 0 - ap_start (Read/Write/COH) +// bit 1 - ap_done (Read/COR) +// bit 2 - ap_idle (Read) +// bit 3 - ap_ready (Read) +// bit 7 - auto_restart (Read/Write) +// others - reserved +// 0x04 : Global Interrupt Enable Register +// bit 0 - Global Interrupt Enable (Read/Write) +// others - reserved +// 0x08 : IP Interrupt Enable Register (Read/Write) +// bit 0 - Channel 0 (ap_done) +// bit 1 - Channel 1 (ap_ready) +// others - reserved +// 0x0c : IP Interrupt Status Register (Read/TOW) +// bit 0 - Channel 0 (ap_done) +// bit 1 - Channel 1 (ap_ready) +// others - reserved +// 0x10 : Data signal of fifo_in +// bit 31~0 - a[31:0] (Read/Write) +// 0x14 : Data signal of fifo_in +// bit 31~0 - a[63:32] (Read/Write) +// 0x18 : reserved +// 0x1c : Data signal of fifo_out +// bit 31~0 - b[31:0] (Read/Write) +// 0x20 : Data signal of fifo_out +// bit 31~0 - b[63:32] (Read/Write) +// 0x24 : reserved +// 0x28 : Data signal of length_r_in +// bit 31~0 - length_r[31:0] (Read/Write) +// 0x2c : reserved +// 0x30 : Data signal of length_r_out +// bit 31~0 - length_r[31:0] (Read/Write) +// 0x34 : reserved +// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake) + +//------------------------Parameter---------------------- +localparam + ADDR_AP_CTRL = 6'h00, + ADDR_GIE = 6'h04, + ADDR_IER = 6'h08, + ADDR_ISR = 6'h0c, + ADDR_FIFO_IN_DATA_0 = 6'h10, + ADDR_FIFO_IN_DATA_1 = 6'h14, + ADDR_FIFO_IN_CTRL = 6'h18, + ADDR_FIFO_OUT_DATA_0 = 6'h1c, + ADDR_FIFO_OUT_DATA_1 = 6'h20, + ADDR_FIFO_OUT_CTRL = 6'h24, + ADDR_LENGTH_R_IN_DATA_0 = 6'h28, + ADDR_LENGTH_R_IN_CTRL = 6'h2c, + ADDR_LENGTH_R_OUT_DATA_0 = 6'h30, + ADDR_LENGTH_R_OUT_CTRL = 6'h34, + WRIDLE = 2'd0, + WRDATA = 2'd1, + WRRESP = 2'd2, + RDIDLE = 2'd0, + RDDATA = 2'd1, + ADDR_BITS = 6; + +//------------------------Local signal------------------- + reg [1:0] wstate = WRIDLE; + reg [1:0] wnext; + reg [ADDR_BITS-1:0] waddr; + wire [31:0] wmask; + wire aw_hs; + wire w_hs; + reg [1:0] rstate = RDIDLE; + reg [1:0] rnext; + reg [31:0] rdata; + wire ar_hs; + wire [ADDR_BITS-1:0] raddr; + // internal registers + wire int_ap_idle; + wire int_ap_ready; + reg int_ap_done = 1'b0; + reg int_ap_start = 1'b0; + reg int_auto_restart = 1'b0; + reg int_gie = 2'b0; + reg [1:0] int_ier = 2'b0; + reg [1:0] int_isr = 2'b0; + reg [63:0] int_fifo_in = 64'b0; + reg [63:0] int_fifo_out = 64'b0; + reg [63:0] int_length_r_in = 32'b0; + reg [31:0] int_length_r_out = 32'b0; + 
+//------------------------Instantiation------------------ + +//------------------------AXI write fsm------------------ +assign AWREADY = (~ARESET) & (wstate == WRIDLE); +assign WREADY = (wstate == WRDATA); +assign BRESP = 2'b00; // OKAY +assign BVALID = (wstate == WRRESP); +assign wmask = { {8{WSTRB[3]}}, {8{WSTRB[2]}}, {8{WSTRB[1]}}, {8{WSTRB[0]}} }; +assign aw_hs = AWVALID & AWREADY; +assign w_hs = WVALID & WREADY; + +// wstate +always @(posedge ACLK) begin + if (ARESET) + wstate <= WRIDLE; + else if (ACLK_EN) + wstate <= wnext; +end + +// wnext +always @(*) begin + case (wstate) + WRIDLE: + if (AWVALID) + wnext = WRDATA; + else + wnext = WRIDLE; + WRDATA: + if (WVALID) + wnext = WRRESP; + else + wnext = WRDATA; + WRRESP: + if (BREADY) + wnext = WRIDLE; + else + wnext = WRRESP; + default: + wnext = WRIDLE; + endcase +end + +// waddr +always @(posedge ACLK) begin + if (ACLK_EN) begin + if (aw_hs) + waddr <= AWADDR[ADDR_BITS-1:0]; + end +end + +//------------------------AXI read fsm------------------- +assign ARREADY = (~ARESET) && (rstate == RDIDLE); +assign RDATA = rdata; +assign RRESP = 2'b00; // OKAY +assign RVALID = (rstate == RDDATA); +assign ar_hs = ARVALID & ARREADY; +assign raddr = ARADDR[ADDR_BITS-1:0]; + +// rstate +always @(posedge ACLK) begin + if (ARESET) + rstate <= RDIDLE; + else if (ACLK_EN) + rstate <= rnext; +end + +// rnext +always @(*) begin + case (rstate) + RDIDLE: + if (ARVALID) + rnext = RDDATA; + else + rnext = RDIDLE; + RDDATA: + if (RREADY & RVALID) + rnext = RDIDLE; + else + rnext = RDDATA; + default: + rnext = RDIDLE; + endcase +end + +// rdata +always @(posedge ACLK) begin + if (ACLK_EN) begin + if (ar_hs) begin + rdata <= 1'b0; + case (raddr) + ADDR_AP_CTRL: begin + rdata[0] <= int_ap_start; + rdata[1] <= int_ap_done; + rdata[2] <= int_ap_idle; + rdata[3] <= int_ap_ready; + rdata[7] <= int_auto_restart; + end + ADDR_GIE: begin + rdata <= int_gie; + end + ADDR_IER: begin + rdata <= int_ier; + end + ADDR_ISR: begin + rdata <= int_isr; + end + ADDR_FIFO_IN_DATA_0: begin + rdata <= int_fifo_in[31:0]; + end + ADDR_FIFO_IN_DATA_1: begin + rdata <= int_fifo_in[63:32]; + end + ADDR_FIFO_OUT_DATA_0: begin + rdata <= int_fifo_out[31:0]; + end + ADDR_FIFO_OUT_DATA_1: begin + rdata <= int_fifo_out[63:32]; + end + ADDR_LENGTH_R_IN_DATA_0: begin + rdata <= int_length_r_in[31:0]; + end + ADDR_LENGTH_R_OUT_DATA_0: begin + rdata <= int_length_r_out[31:0]; + end + endcase + end + end +end + + +//------------------------Register logic----------------- +assign interrupt = int_gie & (|int_isr); +assign ap_start = int_ap_start; +assign int_ap_idle = ap_idle; +assign int_ap_ready = ap_ready; +assign fifo_in = int_fifo_in; +assign fifo_out = int_fifo_out; +assign length_r_in = int_length_r_in; +assign length_r_out = int_length_r_out; +// int_ap_start +always @(posedge ACLK) begin + if (ARESET) + int_ap_start <= 1'b0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_AP_CTRL && WSTRB[0] && WDATA[0]) + int_ap_start <= 1'b1; + else if (int_ap_ready) + int_ap_start <= int_auto_restart; // clear on handshake/auto restart + end +end + +// int_ap_done +always @(posedge ACLK) begin + if (ARESET) + int_ap_done <= 1'b0; + else if (ACLK_EN) begin + if (ap_done) + int_ap_done <= 1'b1; + else if (ar_hs && raddr == ADDR_AP_CTRL) + int_ap_done <= 1'b0; // clear on read + end +end + +// int_auto_restart +always @(posedge ACLK) begin + if (ARESET) + int_auto_restart <= 1'b0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_AP_CTRL && WSTRB[0]) + int_auto_restart <= WDATA[7]; + end +end + 
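+// The per-register write logic below follows one idiom: a register is only
+// updated when the write handshake (w_hs) hits its address, and wmask
+// (built from WSTRB above) merges the new bytes with the old value as
+// new = (WDATA & wmask) | (old & ~wmask), so partial-word writes are safe.
+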
+// int_gie +always @(posedge ACLK) begin + if (ARESET) + int_gie <= 1'b0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_GIE && WSTRB[0]) + int_gie <= WDATA[0]; + end +end + +// int_ier +always @(posedge ACLK) begin + if (ARESET) + int_ier <= 1'b0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_IER && WSTRB[0]) + int_ier <= WDATA[1:0]; + end +end + +// int_isr[0] +always @(posedge ACLK) begin + if (ARESET) + int_isr[0] <= 1'b0; + else if (ACLK_EN) begin + if (int_ier[0] & ap_done) + int_isr[0] <= 1'b1; + else if (w_hs && waddr == ADDR_ISR && WSTRB[0]) + int_isr[0] <= int_isr[0] ^ WDATA[0]; // toggle on write + end +end + +// int_isr[1] +always @(posedge ACLK) begin + if (ARESET) + int_isr[1] <= 1'b0; + else if (ACLK_EN) begin + if (int_ier[1] & ap_ready) + int_isr[1] <= 1'b1; + else if (w_hs && waddr == ADDR_ISR && WSTRB[0]) + int_isr[1] <= int_isr[1] ^ WDATA[1]; // toggle on write + end +end + +// int_fifo_in[31:0] +always @(posedge ACLK) begin + if (ARESET) + int_fifo_in[31:0] <= 0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_FIFO_IN_DATA_0) + int_fifo_in[31:0] <= (WDATA[31:0] & wmask) | (int_fifo_in[31:0] & ~wmask); + end +end + +// int_fifo_in[63:32] +always @(posedge ACLK) begin + if (ARESET) + int_fifo_in[63:32] <= 0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_FIFO_IN_DATA_1) + int_fifo_in[63:32] <= (WDATA[31:0] & wmask) | (int_fifo_in[63:32] & ~wmask); + end +end + +// int_fifo_out[31:0] +always @(posedge ACLK) begin + if (ARESET) + int_fifo_out[31:0] <= 0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_FIFO_OUT_DATA_0) + int_fifo_out[31:0] <= (WDATA[31:0] & wmask) | (int_fifo_out[31:0] & ~wmask); + end +end + +// int_fifo_out[63:32] +always @(posedge ACLK) begin + if (ARESET) + int_fifo_out[63:32] <= 0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_FIFO_OUT_DATA_1) + int_fifo_out[63:32] <= (WDATA[31:0] & wmask) | (int_fifo_out[63:32] & ~wmask); + end +end + +// int_length_r_in[31:0] +always @(posedge ACLK) begin + if (ARESET) + int_length_r_in[31:0] <= 0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_LENGTH_R_IN_DATA_0) + int_length_r_in[31:0] <= (WDATA[31:0] & wmask) | (int_length_r_in[31:0] & ~wmask); + end +end + + +// int_length_r_out[31:0] +always @(posedge ACLK) begin + if (ARESET) + int_length_r_out[31:0] <= 0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_LENGTH_R_OUT_DATA_0) + int_length_r_out[31:0] <= (WDATA[31:0] & wmask) | (int_length_r_out[31:0] & ~wmask); + end +end + + +//------------------------Memory logic------------------- + +endmodule diff --git a/hls4ml/templates/vivado_accelerator/alveo/python_drivers/axi_stream_driver.py b/hls4ml/templates/vivado_accelerator/alveo/python_drivers/axi_stream_driver.py index c589bcf057..b823a7a2e7 100644 --- a/hls4ml/templates/vivado_accelerator/alveo/python_drivers/axi_stream_driver.py +++ b/hls4ml/templates/vivado_accelerator/alveo/python_drivers/axi_stream_driver.py @@ -1,101 +1,101 @@ -from datetime import datetime - -import numpy as np -from pynq import Overlay, allocate - - -class NeuralNetworkOverlay(Overlay): - def __init__(self, xclbin_name, dtbo=None, download=True, ignore_version=False, device=None): - super().__init__(xclbin_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device) - self.input_buffer = None - self.output_buffer = None - - def allocate_mem(self, X_shape, y_shape, dtype=np.float32, trg_in=None, trg_out=None): - """Buffer allocation in the accelerator's memory. - - Args: - X_shape (list): Input buffer shape. 
- y_shape (list): Output buffer shape. - dtype (dtype, optional): The data type of the elements of the input/output tensors. Must be an instance of - numpy dtype. Defaults to np.float32. - - It should be set depending on the interface of the accelerator; if it uses 'float' - data type for the 'data' AXI-Stream field, 'np.float32' dtype must be used. Instead if it uses - 'ap_fixed', 'np.intA' is the correct dtype to use. Note that A cannot any integer value, but it can - assume power of 2 values, i.e., {..., 8, 16, 32, ...}. Check `numpy` documentation for more information. - In this case the encoding/decoding has to be computed by the host machine. For example for - 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode - 'float' -> 'ap_fixed<16,6>':: - - def encode(xi): - return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B) - def decode(yi): - return yi * 2**-10 - encode_v = np.vectorize(encode) # to apply them element-wise - decode_v = np.vectorize(decode) - - trg_in (optional): Input buffer target memory. By default the v++ command set it to HBM[0] for - alveo-u50. Defaults to None. - trg_out (optional): Output buffer target memory. By default the v++ command set it to HBM[0] for - alveo-u50. Defaults to None. - """ - self.input_buffer = allocate(shape=X_shape, dtype=dtype, target=trg_in) - self.output_buffer = allocate(shape=y_shape, dtype=dtype, target=trg_out) - - def predict(self, X, y_shape, dtype=np.float32, debug=False, profile=False, encode=None, decode=None): - """Obtain the predictions of the NN implemented in the FPGA. - - Args: - X (ndarray): The input tensor. - y_shape (list): The shape of the output tensor, needed by the accelerator to set the TLAST bit properly. - dtype (dtype, optional): The data type of the elements of the input/output tensors. Must be an instance of - numpy dtype. Defaults to np.float32. - debug (bool, optional): If set, the function will print information about the data transfers status. - Defaults to False. - profile (bool, optional): If set, the function will print the performance of the algorithm in terms of - inference/s. Defaults to False. - encode (Callable, optional): Function to transform the input tensor. Defaults to None. - decode (Callable, optional): Function to transform the output tensor. Defaults to None. - - Returns: - _type_: A ``np.ndarray`` with a shape equal of ``y_shape`` and ``dtype`` data type. 
- """ - self.allocate_mem(X_shape=X.shape, y_shape=y_shape, dtype=dtype) - if profile: - timea = datetime.now() - if encode is not None: - X = encode(X) - in_size = np.prod(X.shape) - out_size = np.prod(y_shape) - self.input_buffer[:] = X - self.input_buffer.sync_to_device() - if debug: - print("Send OK") - self.krnl_rtl_1.call(self.input_buffer, self.output_buffer, in_size, out_size) - if debug: - print("Kernel call OK") - self.output_buffer.sync_from_device() - if debug: - print("Recieve OK") - result = self.output_buffer.copy() - if profile: - timeb = datetime.now() - dts, rate = self._print_dt(timea, timeb, len(X)) - self.input_buffer.flush() - self.output_buffer.flush() - self.free() - return result, dts, rate - self.input_buffer.flush() - self.output_buffer.flush() - return result - - def free_overlay(self): - self.free() - - def _print_dt(self, timea, timeb, N): - dt = timeb - timea - dts = dt.seconds + dt.microseconds * 10**-6 - rate = N / dts - print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)") - print(f"Or {1 / rate * 1e6} us / inferences") - return dts, rate +from datetime import datetime + +import numpy as np +from pynq import Overlay, allocate + + +class NeuralNetworkOverlay(Overlay): + def __init__(self, xclbin_name, dtbo=None, download=True, ignore_version=False, device=None): + super().__init__(xclbin_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device) + self.input_buffer = None + self.output_buffer = None + + def allocate_mem(self, X_shape, y_shape, dtype=np.float32, trg_in=None, trg_out=None): + """Buffer allocation in the accelerator's memory. + + Args: + X_shape (list): Input buffer shape. + y_shape (list): Output buffer shape. + dtype (dtype, optional): The data type of the elements of the input/output tensors. Must be an instance of + numpy dtype. Defaults to np.float32. + + It should be set depending on the interface of the accelerator; if it uses 'float' + data type for the 'data' AXI-Stream field, 'np.float32' dtype must be used. Instead if it uses + 'ap_fixed', 'np.intA' is the correct dtype to use. Note that A cannot any integer value, but it can + assume power of 2 values, i.e., {..., 8, 16, 32, ...}. Check `numpy` documentation for more information. + In this case the encoding/decoding has to be computed by the host machine. For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode + 'float' -> 'ap_fixed<16,6>':: + + def encode(xi): + return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B) + def decode(yi): + return yi * 2**-10 + encode_v = np.vectorize(encode) # to apply them element-wise + decode_v = np.vectorize(decode) + + trg_in (optional): Input buffer target memory. By default the v++ command set it to HBM[0] for + alveo-u50. Defaults to None. + trg_out (optional): Output buffer target memory. By default the v++ command set it to HBM[0] for + alveo-u50. Defaults to None. + """ + self.input_buffer = allocate(shape=X_shape, dtype=dtype, target=trg_in) + self.output_buffer = allocate(shape=y_shape, dtype=dtype, target=trg_out) + + def predict(self, X, y_shape, dtype=np.float32, debug=False, profile=False, encode=None, decode=None): + """Obtain the predictions of the NN implemented in the FPGA. + + Args: + X (ndarray): The input tensor. + y_shape (list): The shape of the output tensor, needed by the accelerator to set the TLAST bit properly. + dtype (dtype, optional): The data type of the elements of the input/output tensors. 
+ numpy dtype. Defaults to np.float32.
+ debug (bool, optional): If set, the function will print information about the data transfer status.
+ Defaults to False.
+ profile (bool, optional): If set, the function will print the performance of the algorithm in terms of
+ inference/s. Defaults to False.
+ encode (Callable, optional): Function to transform the input tensor. Defaults to None.
+ decode (Callable, optional): Function to transform the output tensor. Defaults to None.
+
+ Returns:
+ _type_: A ``np.ndarray`` with a shape equal to ``y_shape`` and ``dtype`` data type.
+ """
+ self.allocate_mem(X_shape=X.shape, y_shape=y_shape, dtype=dtype)
+ if profile:
+ timea = datetime.now()
+ if encode is not None:
+ X = encode(X)
+ in_size = np.prod(X.shape)
+ out_size = np.prod(y_shape)
+ self.input_buffer[:] = X
+ self.input_buffer.sync_to_device()
+ if debug:
+ print("Send OK")
+ self.krnl_rtl_1.call(self.input_buffer, self.output_buffer, in_size, out_size)
+ if debug:
+ print("Kernel call OK")
+ self.output_buffer.sync_from_device()
+ if debug:
+ print("Receive OK")
+ result = self.output_buffer.copy()
+ if profile:
+ timeb = datetime.now()
+ dts, rate = self._print_dt(timea, timeb, len(X))
+ self.input_buffer.flush()
+ self.output_buffer.flush()
+ self.free()
+ return result, dts, rate
+ self.input_buffer.flush()
+ self.output_buffer.flush()
+ return result
+
+ def free_overlay(self):
+ self.free()
+
+ def _print_dt(self, timea, timeb, N):
+ dt = timeb - timea
+ dts = dt.seconds + dt.microseconds * 10**-6
+ rate = N / dts
+ print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)")
+ print(f"Or {1 / rate * 1e6} us / inference")
+ return dts, rate
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl
index c14aafb8cb..b704c2e0a8 100644
--- a/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl
+++ b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl
@@ -1,26 +1,26 @@
-set tcldir [file dirname [info script]]
-source [file join $tcldir project.tcl]
-
-create_project project_1 ${project_name}_vivado_accelerator -part xc7z020clg400-1 -force
-
-set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
-set_property ip_repo_paths ${project_name}_prj [current_project]
-update_ip_catalog
-
-# Create Block Designer design
-create_bd_design "design_1"
-create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
-apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0]
-create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0
-apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/${project_name}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins ${project_name}_axi_0/s_axi_AXILiteS]
-
-make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
-add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
-
-reset_run impl_1
-reset_run synth_1
-launch_runs impl_1 -to_step write_bitstream -jobs 6
-wait_on_run -timeout 360 impl_1
-
-open_run impl_1
-report_utilization -file util.rpt -hierarchical -hierarchical_percentages
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+create_project project_1 ${project_name}_vivado_accelerator -part xc7z020clg400-1 -force
+
+set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
+set_property ip_repo_paths ${project_name}_prj [current_project]
+update_ip_catalog
+
+# Create Block Design
+create_bd_design "design_1"
+create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
+apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0]
+create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/${project_name}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins ${project_name}_axi_0/s_axi_AXILiteS]
+
+make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl
index c5549dc256..de86ff4b74 100644
--- a/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl
+++ b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl
@@ -1,59 +1,59 @@
-#@todo: try to remove startgroup and endgroup and see if it work
-set tcldir [file dirname [info script]]
-source [file join $tcldir project.tcl]
-
-create_project project_1 ${project_name}_vivado_accelerator -part xc7z020clg400-1 -force
-
-set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
-set_property ip_repo_paths ${project_name}_prj [current_project]
-update_ip_catalog
-
-create_bd_design "design_1"
-
-startgroup
-create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
-endgroup
-
-apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0]
-
-startgroup
-set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0]
-endgroup
-
-startgroup
-create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
-endgroup
-
-set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0]
-set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0]
-
-startgroup
-apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] - -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] -endgroup - -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] - -startgroup -create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 -endgroup - -connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] -connect_bd_intf_net [get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] - -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] - -group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] - -make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top - -add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v - -reset_run impl_1 -reset_run synth_1 -launch_runs impl_1 -to_step write_bitstream -jobs 6 -wait_on_run -timeout 360 impl_1 - -open_run impl_1 -report_utilization -file util.rpt -hierarchical -hierarchical_percentages +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${project_name}_vivado_accelerator -part xc7z020clg400-1 -force + +set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] +set_property ip_repo_paths ${project_name}_prj [current_project] +update_ip_catalog + +create_bd_design "design_1" + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0 +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] + +startgroup +set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0] +endgroup + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup + +set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup 
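+# Note on the two automation calls that follow (summary comments, not
+# Vivado-generated output): the first wires the DMA control port
+# (S_AXI_LITE) to the PS general-purpose master M_AXI_GP0; the second
+# connects the DMA read master (M_AXI_MM2S) to the PS high-performance
+# slave port S_AXI_HP0 so the DMA can fetch input samples from DDR.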
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 +endgroup + +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] +connect_bd_intf_net [get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] + +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] + +group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] + +make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top + +add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vivado_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl index 5d886c6f25..033a12d913 100644 --- a/hls4ml/templates/vivado_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vivado_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl @@ -1,58 +1,58 @@ -#@todo: try to remove startgroup and endgroup and see if it work -set tcldir [file dirname [info script]] -source [file join $tcldir project.tcl] - -create_project project_1 ${project_name}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force - -set_property board_part xilinx.com:zcu102:part0:3.3 [current_project] -set_property ip_repo_paths ${project_name}_prj [current_project] -update_ip_catalog - -create_bd_design "design_1" -set_property ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_project] -update_ip_catalog - -startgroup -create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 -endgroup - -apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0] - -set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] - -startgroup -create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 -endgroup -set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] 
[get_bd_cells axi_dma_0] -set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] - -startgroup -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] -endgroup - -startgroup -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] -endgroup - -startgroup -create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 -endgroup -connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] -connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r] - -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] -group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] - -make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top - -add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v - -reset_run impl_1 -reset_run synth_1 -launch_runs impl_1 -to_step write_bitstream -jobs 6 -wait_on_run -timeout 360 impl_1 - -open_run impl_1 -report_utilization -file util.rpt -hierarchical -hierarchical_percentages +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${project_name}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force + +set_property board_part xilinx.com:zcu102:part0:3.3 [current_project] +set_property ip_repo_paths ${project_name}_prj [current_project] +update_ip_catalog + +create_bd_design "design_1" +set_property ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_project] +update_ip_catalog + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 +endgroup + +apply_bd_automation -rule 
xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0]
+
+set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0]
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
+endgroup
+set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0]
+set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0]
+
+startgroup
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD]
+endgroup
+
+startgroup
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD]
+endgroup
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0
+endgroup
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r]
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r]
+
+apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk]
+group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0]
+
+make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+
+add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py
index 5cd17d02e9..8a88493ca2 100644
--- a/hls4ml/utils/config.py
+++ b/hls4ml/utils/config.py
@@ -72,7 +72,11 @@ def _get_precision_from_quantizer(quantizer):
     overflow = "AP_WRAP"
 
     if quantizer['class_name'] in supported_quantizers:
-        bits = int(quantizer['config']['bits'])
+        bits = quantizer['config']['bits']
+        if isinstance(bits, list):
+            bits = int(bits[0])
+        else:
+            bits = int(bits)
         # if integer isn't specified, it should be the same as bits
         integer = int(quantizer['config'].get('integer', bits - 1)) + 1
         # for quantizers use the following default rounding and overflow
diff --git a/hls4ml/utils/plot.py b/hls4ml/utils/plot.py
index e3424bb1ad..24dd4f1cb3 100644
--- a/hls4ml/utils/plot.py
+++ b/hls4ml/utils/plot.py
@@ -1,224 +1,224 @@
-# Heavily inspired by Keras's plot_model
-"""Utilities related to model visualization."""
-
-import os
-import sys
-
-try:
- import pydot
-except ImportError:
- pydot = None
-
-
-def check_pydot():
- """Returns True if PyDot and Graphviz are available."""
- if pydot is None:
- return False
- try:
- # Attempt to create an image of a blank graph
- # to check the pydot/graphviz installation.
- pydot.Dot.create(pydot.Dot())
- return True
- except OSError:
- return False
-
-
-def add_edge(dot, src, dst):
- if not dot.get_edge(src, dst):
- dot.add_edge(pydot.Edge(src, dst))
-
-
-def model_to_dot(
- model, show_shapes=False, show_layer_names=True, show_precision=False, rankdir='TB', dpi=96, subgraph=False
-):
- """Convert a HLS model to dot format.
-
- Arguments:
- model: A HLS model instance.
- show_shapes: whether to display shape information.
- show_layer_names: whether to display layer names.
- show_precision: whether to display precision of layer's variables.
- rankdir: `rankdir` argument passed to PyDot,
- a string specifying the format of the plot:
- 'TB' creates a vertical plot;
- 'LR' creates a horizontal plot.
- dpi: Dots per inch.
- subgraph: whether to return a `pydot.Cluster` instance.
-
- Returns:
- A `pydot.Dot` instance representing the HLS model or
- a `pydot.Cluster` instance representing nested model if
- `subgraph=True`.
-
- Raises:
- ImportError: if graphviz or pydot are not available.
- """
-
- if not check_pydot():
- if 'IPython.core.magics.namespace' in sys.modules:
- # We don't raise an exception here in order to avoid crashing notebook
- # tests where graphviz is not available.
- print('Failed to import pydot. You must install pydot' ' and graphviz for `pydotprint` to work.')
- return
- else:
- raise ImportError('Failed to import pydot. You must install pydot' ' and graphviz for `pydotprint` to work.')
-
- if subgraph:
- dot = pydot.Cluster(style='dashed', graph_name=model.name)
- dot.set('label', model.name)
- dot.set('labeljust', 'l')
- else:
- dot = pydot.Dot()
- dot.set('rankdir', rankdir)
- dot.set('concentrate', True)
- dot.set('dpi', dpi)
- dot.set_node_defaults(shape='record')
-
- layers = model.get_layers()
-
- # Create graph nodes.
- for i, layer in enumerate(layers):
- # layer_id = str(id(layer))
- layer_id = str(layer.index)
-
- # Append a wrapped layer's label to node's label, if it exists.
- layer_name = layer.name
- class_name = layer.class_name
-
- # Create node's label.
- if show_layer_names:
- # label = '{}: {}'.format(class_name, layer_name)
- # label = '{}\\l{}\\l'.format(class_name, layer_name)
- label = f'{class_name}<br/>{layer_name}'
- else:
- label = class_name
-
- # Rebuild the label as a table including input/output shapes.
- if show_shapes:
-
- def format_shape(shape):
- return str(tuple(shape)).replace(str(None), '?')
-
- input_labels = '?'
- try:
- output_labels = format_shape(layer.get_output_variable().shape)
- except AttributeError:
- output_labels = '?'
- if class_name != 'Input':
- if len(layer.inputs) > 1:
- input_shapes = []
- for i in layer.inputs:
- input_layer = layer.get_input_variable(i)
- if input_layer is not None:
- input_shapes.append(input_layer.shape)
- else:
- input_shapes.append('?')
- formatted_shapes = [format_shape(ishape) for ishape in input_shapes]
- input_labels = ', '.join(formatted_shapes)
- else:
- input_layer = layer.get_input_variable()
- if input_layer is not None:
- input_labels = format_shape(input_layer.shape)
- label = f'{label}\n|{{input: {input_labels}|output: {output_labels}}}'
-
- # Rebuild the label as a table including tensor precision.
- if show_precision:
-
- def format_precision(precision):
- return str(precision).replace('<', '&lt;').replace('>', '&gt;')
-
- precision_labels = []
- tensors = {}
- tensors.update(layer.weights)
- if len(layer.variables) == 1:
- # A bit cleaner output
- tensors['output'] = layer.get_output_variable()
- else:
- tensors.update(layer.variables)
- for tensor_name, var in tensors.items():
- if show_shapes:
- # tensor_label = '{} {}: {}'.format(tensor_name,
- tensor_label = '{} {}:{}'.format(
- tensor_name, format_shape(var.shape), format_precision(var.type.precision)
- )
- else:
- # tensor_label = '{}: {}'.format(tensor_name,
- tensor_label = '{}:{}'.format(
- tensor_name, format_precision(var.type.precision)
- )
- precision_labels.append(tensor_label)
- # precision_label = '<br/> '.join(precision_labels)
- precision_label = '<br/>'.join(precision_labels)
- precision_label = '<sub>' + precision_label + '</sub>'
- label = f'{label}|{{{precision_label}}}'
-
- label = '<' + label + '>'
- node = pydot.Node(layer_id, label=label)
- dot.add_node(node)
-
- # Connect nodes with edges.
- for layer in layers:
- layer_id = str(layer.index)
- for input_name in layer.inputs:
- input_layer = layer.get_input_node(input_name)
- if input_layer is not None:
- input_layer_id = str(input_layer.index)
- add_edge(dot, input_layer_id, layer_id)
-
- return dot
-
-
-def plot_model(
- model, to_file='model.png', show_shapes=False, show_layer_names=True, show_precision=False, rankdir='TB', dpi=96
-):
- """Converts a HLS model to dot format and save to a file.
-
- Arguments:
- model: A HLS model instance
- to_file: File name of the plot image.
- show_shapes: whether to display shape information.
- show_layer_names: whether to display layer names.
- show_precision: whether to display precision of layer's variables.
- rankdir: `rankdir` argument passed to PyDot,
- a string specifying the format of the plot:
- 'TB' creates a vertical plot;
- 'LR' creates a horizontal plot.
- dpi: Dots per inch.
-
- Returns:
- A Jupyter notebook Image object if Jupyter is installed.
- This enables in-line display of the model plots in notebooks.
- """
- dot = model_to_dot(
- model,
- show_shapes=show_shapes,
- show_layer_names=show_layer_names,
- show_precision=show_precision,
- rankdir=rankdir,
- dpi=dpi,
- )
- if dot is None:
- return
-
- if to_file is not None:
- _, extension = os.path.splitext(to_file)
- if not extension:
- extension = 'png'
- else:
- extension = extension[1:]
- # Save image to disk.
- dot.write(to_file, format=extension)
- else:
- # Return the image as a Jupyter Image object, to be displayed in-line.
- # Note that we cannot easily detect whether the code is running in a
- # notebook, and thus we always return the Image if Jupyter is available.
- try:
- import tempfile
-
- from IPython import display
-
- temp = tempfile.NamedTemporaryFile(suffix='.png')
- dot.write(temp.name, format='png')
- return display.Image(filename=temp.name)
- except ImportError:
- pass
+# Heavily inspired by Keras's plot_model
+"""Utilities related to model visualization."""
+
+import os
+import sys
+
+try:
+ import pydot
+except ImportError:
+ pydot = None
+
+
+def check_pydot():
+ """Returns True if PyDot and Graphviz are available."""
+ if pydot is None:
+ return False
+ try:
+ # Attempt to create an image of a blank graph
+ # to check the pydot/graphviz installation.
+ pydot.Dot.create(pydot.Dot())
+ return True
+ except OSError:
+ return False
+
+
+def add_edge(dot, src, dst):
+ if not dot.get_edge(src, dst):
+ dot.add_edge(pydot.Edge(src, dst))
+
+
+def model_to_dot(
+ model, show_shapes=False, show_layer_names=True, show_precision=False, rankdir='TB', dpi=96, subgraph=False
+):
+ """Convert a HLS model to dot format.
+
+ Arguments:
+ model: A HLS model instance.
+ show_shapes: whether to display shape information.
+ show_layer_names: whether to display layer names.
+ show_precision: whether to display precision of layer's variables.
+ rankdir: `rankdir` argument passed to PyDot,
+ a string specifying the format of the plot:
+ 'TB' creates a vertical plot;
+ 'LR' creates a horizontal plot.
+ dpi: Dots per inch.
+ subgraph: whether to return a `pydot.Cluster` instance.
+
+ Returns:
+ A `pydot.Dot` instance representing the HLS model or
+ a `pydot.Cluster` instance representing nested model if
+ `subgraph=True`.
+
+ Raises:
+ ImportError: if graphviz or pydot are not available.
+ """ + + if not check_pydot(): + if 'IPython.core.magics.namespace' in sys.modules: + # We don't raise an exception here in order to avoid crashing notebook + # tests where graphviz is not available. + print('Failed to import pydot. You must install pydot' ' and graphviz for `pydotprint` to work.') + return + else: + raise ImportError('Failed to import pydot. You must install pydot' ' and graphviz for `pydotprint` to work.') + + if subgraph: + dot = pydot.Cluster(style='dashed', graph_name=model.name) + dot.set('label', model.name) + dot.set('labeljust', 'l') + else: + dot = pydot.Dot() + dot.set('rankdir', rankdir) + dot.set('concentrate', True) + dot.set('dpi', dpi) + dot.set_node_defaults(shape='record') + + layers = model.get_layers() + + # Create graph nodes. + for i, layer in enumerate(layers): + # layer_id = str(id(layer)) + layer_id = str(layer.index) + + # Append a wrapped layer's label to node's label, if it exists. + layer_name = layer.name + class_name = layer.class_name + + # Create node's label. + if show_layer_names: + # label = '{}: {}'.format(class_name, layer_name) + # label = '{}\\l{}\\l'.format(class_name, layer_name) + label = f'{class_name}
{layer_name}' + else: + label = class_name + + # Rebuild the label as a table including input/output shapes. + if show_shapes: + + def format_shape(shape): + return str(tuple(shape)).replace(str(None), '?') + + input_labels = '?' + try: + output_labels = format_shape(layer.get_output_variable().shape) + except AttributeError: + output_labels = '?' + if class_name != 'Input': + if len(layer.inputs) > 1: + input_shapes = [] + for i in layer.inputs: + input_layer = layer.get_input_variable(i) + if input_layer is not None: + input_shapes.append(input_layer.shape) + else: + input_shapes.append('?') + formatted_shapes = [format_shape(ishape) for ishape in input_shapes] + input_labels = ', '.join(formatted_shapes) + else: + input_layer = layer.get_input_variable() + if input_layer is not None: + input_labels = format_shape(input_layer.shape) + label = f'{label}\n|{{input: {input_labels}|output: {output_labels}}}' + + # Rebuild the label as a table including tensor precision. + if show_precision: + + def format_precision(precision): + return str(precision).replace('<', '<').replace('>', '>') + + precision_labels = [] + tensors = {} + tensors.update(layer.weights) + if len(layer.variables) == 1: + # A bit cleaner output + tensors['output'] = layer.get_output_variable() + else: + tensors.update(layer.variables) + for tensor_name, var in tensors.items(): + if show_shapes: + # tensor_label = '{} {}: {}'.format(tensor_name, + tensor_label = '{} {}:{}'.format( + tensor_name, format_shape(var.shape), format_precision(var.type.precision) + ) + else: + # tensor_label = '{}: {}'.format(tensor_name, + tensor_label = '{}:{}'.format( + tensor_name, format_precision(var.type.precision) + ) + precision_labels.append(tensor_label) + # precision_label = '
'.join(precision_labels) + precision_label = ''.join(precision_labels) + precision_label = '' + precision_label + '
' + label = f'{label}|{{{precision_label}}}' + + label = '<' + label + '>' + node = pydot.Node(layer_id, label=label) + dot.add_node(node) + + # Connect nodes with edges. + for layer in layers: + layer_id = str(layer.index) + for input_name in layer.inputs: + input_layer = layer.get_input_node(input_name) + if input_layer is not None: + input_layer_id = str(input_layer.index) + add_edge(dot, input_layer_id, layer_id) + + return dot + + +def plot_model( + model, to_file='model.png', show_shapes=False, show_layer_names=True, show_precision=False, rankdir='TB', dpi=96 +): + """Converts a HLS model to dot format and save to a file. + + Arguments: + model: A HLS model instance + to_file: File name of the plot image. + show_shapes: whether to display shape information. + show_layer_names: whether to display layer names. + show_precision: whether to display precision of layer's variables. + rankdir: `rankdir` argument passed to PyDot, + a string specifying the format of the plot: + 'TB' creates a vertical plot; + 'LR' creates a horizontal plot. + dpi: Dots per inch. + + Returns: + A Jupyter notebook Image object if Jupyter is installed. + This enables in-line display of the model plots in notebooks. + """ + dot = model_to_dot( + model, + show_shapes=show_shapes, + show_layer_names=show_layer_names, + show_precision=show_precision, + rankdir=rankdir, + dpi=dpi, + ) + if dot is None: + return + + if to_file is not None: + _, extension = os.path.splitext(to_file) + if not extension: + extension = 'png' + else: + extension = extension[1:] + # Save image to disk. + dot.write(to_file, format=extension) + else: + # Return the image as a Jupyter Image object, to be displayed in-line. + # Note that we cannot easily detect whether the code is running in a + # notebook, and thus we always return the Image if Jupyter is available. 
+ try: + import tempfile + + from IPython import display + + temp = tempfile.NamedTemporaryFile(suffix='.png') + dot.write(temp.name, format='png') + return display.Image(filename=temp.name) + except ImportError: + pass diff --git a/hs_err_pid6927.log b/hs_err_pid6927.log deleted file mode 100644 index 108140956b..0000000000 --- a/hs_err_pid6927.log +++ /dev/null @@ -1,17 +0,0 @@ -# -# An unexpected error has occurred (11) -# -Stack: -/lib/x86_64-linux-gnu/libc.so.6(+0x3ef10) [0x7fe2289e5f10] -/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(+0xb9519) [0x7fe22428e519] -/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(Tcl_ResetResult+0x10) [0x7fe22428ef20] -/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(+0x34429) [0x7fe224209429] -/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(Tcl_EvalEx+0x13) [0x7fe22420a0a3] -/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(Tcl_Eval+0x15) [0x7fe22420a0c5] -/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libhls_support.so(+0x75ac0) [0x7fe21094bac0] -/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(Tcl_Finalize+0x49) [0x7fe224249199] -/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(Tcl_Exit+0x4a) [0x7fe22424934a] -/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/libtcl8.5.so(Tcl_Main+0x24e) [0x7fe22427528e] -/opt/Xilinx/Vivado/2019.2/lib/lnx64.o/librdi_common.so(+0x8b30cb) [0x7fe229dfd0cb] -/lib/x86_64-linux-gnu/libpthread.so.0(+0x76db) [0x7fe2221936db] -/lib/x86_64-linux-gnu/libc.so.6(clone+0x3f) [0x7fe228ac861f] diff --git a/test/pytest/test_precision_parsing.py b/test/pytest/test_precision_parsing.py new file mode 100644 index 0000000000..5569a3a6ad --- /dev/null +++ b/test/pytest/test_precision_parsing.py @@ -0,0 +1,29 @@ +import pytest + +import hls4ml + + +@pytest.mark.parametrize( + 'prec_pair', + [ + ('ap_fixed<3, 2>', True), + ('ap_ufixed<3, 2>', False), + ('ac_fixed<3, 2, true>', True), + ('ac_fixed<3, 2, false>', False), + ('ac_fixed<3, 2, 1>', True), + ('ac_fixed<3, 2, 0>', False), + ('ap_int<3, 2>', True), + ('ap_uint<3>', False), + ('ac_int<3, TRue>', True), + ('ac_int<3, FALse>', False), + ('ac_int<3, 1>', True), + ('ac_int<3, 0>', False), + ], +) +def test_sign_parsing(prec_pair): + '''Test that convert_precions_string determines the signedness correctly''' + strprec = prec_pair[0] + signed = prec_pair[1] + + evalprec = hls4ml.backends.fpga.fpga_backend.FPGABackend.convert_precision_string(strprec) + assert evalprec.signed == signed diff --git a/vivado.jou b/vivado.jou deleted file mode 100644 index 3828028f65..0000000000 --- a/vivado.jou +++ /dev/null @@ -1,21 +0,0 @@ -#----------------------------------------------------------- -# Vivado v2019.2 (64-bit) -# SW Build 2708876 on Wed Nov 6 21:39:14 MST 2019 -# IP Build 2700528 on Thu Nov 7 00:09:20 MST 2019 -# Start of session at: Mon Apr 17 20:02:54 2023 -# Process ID: 963 -# Current directory: /home/ej/workspace/hls4ml/hls4ml -# Command line: vivado -# Log file: /home/ej/workspace/hls4ml/hls4ml/vivado.log -# Journal file: /home/ej/workspace/hls4ml/hls4ml/vivado.jou -#----------------------------------------------------------- -start_gui -create_project BDT_vivado /home/ej/workspace/fwX/BDT_vivado -part xcvu9p-flga2104-2L-e -set_property board_part xilinx.com:vcu118:part0:2.3 [current_project] -add_files -norecurse -scan_for_includes {/home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mlbW_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mrcU_rom.dat 
/home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mjbC_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores4_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mtde_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_meOg_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mncg_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores7_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_msc4_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mfYi_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mocq_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores9_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mpcA_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mqcK_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores0_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores3_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mcud_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores6_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mg8j_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mhbi_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mdEe_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores2_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores5_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mkbM_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mibs_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mmb6_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores1_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mbkb_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores8_V_rom.dat} -add_files -norecurse -scan_for_includes {/home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores3_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mpcA.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mfYi.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mg8j.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mqcK.v 
/home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_mux_32_12_udo.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_msc4.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores4_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mjbC.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mbkb.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores5_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mdEe.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_mac_muladdwdI.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mrcU.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mmb6.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores6_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mtde.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores7_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mocq.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_mux_42_12_vdy.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mhbi.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mcud.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_meOg.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores8_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mncg.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores0_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mlbW.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores9_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores1_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mkbM.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores2_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mibs.v} -update_compile_order -fileset sources_1 -update_compile_order -fileset sources_1 -launch_runs impl_1 -jobs 8 -wait_on_run impl_1 -open_run impl_1 diff --git a/vivado.log b/vivado.log deleted file mode 100644 index 3a5702e20b..0000000000 --- a/vivado.log +++ /dev/null @@ -1,52 +0,0 @@ -#----------------------------------------------------------- -# Vivado v2019.2 (64-bit) -# SW Build 2708876 on Wed Nov 6 21:39:14 MST 2019 -# IP Build 2700528 on Thu Nov 7 00:09:20 MST 2019 -# Start of session at: Mon Apr 17 20:02:54 2023 -# Process ID: 963 -# Current directory: 
/home/ej/workspace/hls4ml/hls4ml -# Command line: vivado -# Log file: /home/ej/workspace/hls4ml/hls4ml/vivado.log -# Journal file: /home/ej/workspace/hls4ml/hls4ml/vivado.jou -#----------------------------------------------------------- -start_gui -create_project BDT_vivado /home/ej/workspace/fwX/BDT_vivado -part xcvu9p-flga2104-2L-e -INFO: [IP_Flow 19-234] Refreshing IP repositories -INFO: [IP_Flow 19-1704] No user IP repositories specified -INFO: [IP_Flow 19-2313] Loaded Vivado IP repository '/opt/Xilinx/Vivado/2019.2/data/ip'. -create_project: Time (s): cpu = 00:00:07 ; elapsed = 00:00:12 . Memory (MB): peak = 6734.895 ; gain = 33.020 ; free physical = 15830 ; free virtual = 33929 -set_property board_part xilinx.com:vcu118:part0:2.3 [current_project] -add_files -norecurse -scan_for_includes {/home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mlbW_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mrcU_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mjbC_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores4_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mtde_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_meOg_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mncg_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores7_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_msc4_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mfYi_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mocq_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores9_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mpcA_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mqcK_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores0_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores3_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mcud_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores6_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mg8j_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mhbi_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mdEe_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores2_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores5_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mkbM_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mibs_rom.dat 
/home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mmb6_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores1_V_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mbkb_rom.dat /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores8_V_rom.dat} -add_files -norecurse -scan_for_includes {/home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores3_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mpcA.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mfYi.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mg8j.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mqcK.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_mux_32_12_udo.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_msc4.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores4_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mjbC.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mbkb.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores5_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mdEe.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_mac_muladdwdI.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mrcU.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mmb6.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores6_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mtde.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores7_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mocq.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_mux_42_12_vdy.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mhbi.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mcud.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_meOg.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores8_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mncg.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores0_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mlbW.v 
/home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores9_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores1_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mkbM.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/fwXbdt_scores2_V.v /home/ej/workspace/fwX/2_20_bdt/bdt_binary/bdt_hls_firmware/solution1/syn/verilog/getValue_binPtr_mibs.v} -update_compile_order -fileset sources_1 -update_compile_order -fileset sources_1 -launch_runs impl_1 -jobs 8 -[Mon Apr 17 20:07:26 2023] Launched synth_1... -Run output will be captured here: /home/ej/workspace/fwX/BDT_vivado/BDT_vivado.runs/synth_1/runme.log -[Mon Apr 17 20:07:26 2023] Launched impl_1... -Run output will be captured here: /home/ej/workspace/fwX/BDT_vivado/BDT_vivado.runs/impl_1/runme.log -CRITICAL WARNING: [Common 17-1649] The Vivado message database '/home/ej/workspace/fwX/BDT_vivado/BDT_vivado.runs/synth_1/vivado.pb' contains 24975 messages. Restoring all messages from this message database will impact Vivado performance, so only WARNING, CRITICAL WARNING, and ERROR messages will be restored. To restore all messages from this file use the tcl command 'set_param messaging.loadPbLimit 24976' and re-open the project. -open_run impl_1 -INFO: [Device 21-403] Loading part xcvu9p-flga2104-2L-e -Netlist sorting complete. Time (s): cpu = 00:00:00.01 ; elapsed = 00:00:00 . Memory (MB): peak = 7651.996 ; gain = 0.000 ; free physical = 13960 ; free virtual = 32677 -INFO: [Netlist 29-17] Analyzing 272 Unisim elements for replacement -INFO: [Netlist 29-28] Unisim Transformation completed in 0 CPU seconds -WARNING: [Netlist 29-101] Netlist 'fwXbdt' is not ideal for floorplanning, since the cellview 'fwXbdt' contains a large number of primitives. Please consider enabling hierarchy in synthesis if you want to do floorplanning. -INFO: [Project 1-479] Netlist was created with Vivado 2019.2 -INFO: [Project 1-570] Preparing netlist for logic optimization -Reading XDEF placement. -Reading placer database... -Reading XDEF routing. -Read XDEF File: Time (s): cpu = 00:00:00.08 ; elapsed = 00:00:00.09 . Memory (MB): peak = 7708.898 ; gain = 2.000 ; free physical = 13855 ; free virtual = 32573 -Restored from archive | CPU: 0.080000 secs | Memory: 2.751320 MB | -Finished XDEF File Restore: Time (s): cpu = 00:00:00.08 ; elapsed = 00:00:00.09 . Memory (MB): peak = 7708.898 ; gain = 2.000 ; free physical = 13855 ; free virtual = 32573 -Netlist sorting complete. Time (s): cpu = 00:00:00 ; elapsed = 00:00:00 . Memory (MB): peak = 7732.711 ; gain = 0.000 ; free physical = 13855 ; free virtual = 32573 -INFO: [Project 1-111] Unisim Transformation Summary: - A total of 40 instances were transformed. - DSP48E2 => DSP48E2 (DSP_ALU, DSP_A_B_DATA, DSP_C_DATA, DSP_MULTIPLIER, DSP_M_DATA, DSP_OUTPUT, DSP_PREADD, DSP_PREADD_DATA): 2 instances - IBUF => IBUF (IBUFCTRL, INBUF): 38 instances - -open_run: Time (s): cpu = 00:00:19 ; elapsed = 00:00:25 . Memory (MB): peak = 8235.492 ; gain = 1086.348 ; free physical = 13483 ; free virtual = 32204 -WARNING: [Timing 38-313] There are no user specified timing constraints. Timing constraints are needed for proper timing analysis. -exit -INFO: [Common 17-206] Exiting Vivado at Mon Apr 17 20:24:36 2023... 
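The patch above also adds test/pytest/test_precision_parsing.py, which exercises FPGABackend.convert_precision_string, the helper that parses an ap_*/ac_* precision string into an hls4ml precision type. As a minimal usage sketch (a hypothetical call site, not part of the patch; it assumes the returned precision object exposes width and integer alongside the signed attribute the test asserts on):

import hls4ml

# Parse a fixed-point precision string into an hls4ml precision type object.
# convert_precision_string is the same classmethod the new test calls.
prec = hls4ml.backends.fpga.fpga_backend.FPGABackend.convert_precision_string('ac_fixed<16, 6, true>')
print(prec.signed)  # True: the explicit 'true' flag marks the type as signed
print(prec.width)   # 16 total bits (attribute name assumed from FixedPrecisionType)

As the parametrized cases imply, signedness for ap_* types follows the type name (ap_fixed/ap_int signed, ap_ufixed/ap_uint unsigned), while ac_* types carry an explicit boolean or 0/1 flag that is parsed case-insensitively (hence the 'TRue'/'FALse' cases).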
From d28b24c181a62b20e809d92c414266a1a24ae35e Mon Sep 17 00:00:00 2001 From: LostEcho365 Date: Mon, 7 Aug 2023 21:44:54 +0800 Subject: [PATCH 31/55] Added support on QMultiHeadAttention, QLayerNormalization, and quantized_softmax --- .idea/hls4ml.iml | 14 + .idea/misc.xml | 7 + .idea/workspace.xml | 164 ++++ hls4ml/.idea/.gitignore | 3 + hls4ml/.idea/hls4ml.iml | 12 + .../inspectionProfiles/profiles_settings.xml | 6 + hls4ml/.idea/misc.xml | 10 + hls4ml/.idea/modules.xml | 8 + hls4ml/.idea/vcs.xml | 6 + hls4ml/converters/keras/qkeras_layers.py | 143 +++ hls4ml/converters/tf_to_hls.py | 0 .../model/optimizer/passes/precision_merge.py | 0 .../templates/quartus/firmware/defines.h.bak | 47 + .../quartus/firmware/myproject.cpp.bak | 48 ++ .../quartus/firmware/myproject.h.bak | 48 ++ .../firmware/nnet_utils/nnet_batchnorm.h.bak | 104 +++ .../firmware/nnet_utils/nnet_common.h.bak | 71 ++ .../firmware/nnet_utils/nnet_conv1d.h.bak | 64 ++ .../firmware/nnet_utils/nnet_dense.h.bak | 169 ++++ .../nnet_utils/nnet_dense_compressed.h.bak | 80 ++ .../firmware/nnet_utils/nnet_helpers.h.bak | 140 +++ .../firmware/nnet_utils/nnet_merge.h.bak | 249 ++++++ .../firmware/nnet_utils/nnet_mult.h.bak | 113 +++ .../firmware/nnet_utils/nnet_padding.h.bak | 99 +++ .../quartus/myproject_test_parallel.cpp.bak | 112 +++ .../vivado/firmware/myproject.cpp.bak | 23 + .../templates/vivado/firmware/myproject.h.bak | 19 + .../templates/vivado/myproject_test.cpp.bak | 94 ++ .../vivado/nnet_utils/nnet_activation.h.bak | 795 +++++++++++++++++ .../vivado/nnet_utils/nnet_array.h.bak | 52 ++ .../vivado/nnet_utils/nnet_batchnorm.h.bak | 124 +++ .../nnet_utils/nnet_batchnorm_stream.h.bak | 123 +++ .../vivado/nnet_utils/nnet_common.h.bak | 75 ++ .../vivado/nnet_utils/nnet_conv1d.h.bak | 66 ++ .../nnet_utils/nnet_conv1d_stream.h.bak | 89 ++ .../vivado/nnet_utils/nnet_conv2d.h.bak | 75 ++ .../nnet_utils/nnet_conv2d_latency.h.bak | 89 ++ .../vivado/nnet_utils/nnet_dense.h.bak | 60 ++ .../nnet_utils/nnet_dense_compressed.h.bak | 90 ++ .../nnet_utils/nnet_dense_latency.h.bak | 72 ++ .../nnet_utils/nnet_dense_resource.h.bak | 263 ++++++ .../vivado/nnet_utils/nnet_dense_seq.h.bak | 44 + .../vivado/nnet_utils/nnet_garnet.h.bak | 816 ++++++++++++++++++ .../vivado/nnet_utils/nnet_helpers.h.bak | 382 ++++++++ .../vivado/nnet_utils/nnet_layernorm.h.bak | 404 +++++++++ .../vivado/nnet_utils/nnet_merge.h.bak | 256 ++++++ .../vivado/nnet_utils/nnet_merge_stream.h.bak | 370 ++++++++ .../vivado/nnet_utils/nnet_mult.h.bak | 116 +++ .../nnet_utils/nnet_multiheadattention.h.bak | 337 ++++++++ .../vivado/nnet_utils/nnet_padding.h.bak | 145 ++++ .../vivado/nnet_utils/nnet_pooling.h.bak | 313 +++++++ .../nnet_utils/nnet_recr_activations.h.bak | 56 ++ .../vivado/nnet_utils/nnet_recurrent.h.bak | 586 +++++++++++++ test/docker/README.md | 72 ++ test/docker/install_config-2017.2.txt | 28 + test/docker/install_config.txt | 28 + test/pytest/test_cnn_mnist.py | 93 ++ 57 files changed, 7872 insertions(+) create mode 100644 .idea/hls4ml.iml create mode 100644 .idea/misc.xml create mode 100644 .idea/workspace.xml create mode 100644 hls4ml/.idea/.gitignore create mode 100644 hls4ml/.idea/hls4ml.iml create mode 100644 hls4ml/.idea/inspectionProfiles/profiles_settings.xml create mode 100644 hls4ml/.idea/misc.xml create mode 100644 hls4ml/.idea/modules.xml create mode 100644 hls4ml/.idea/vcs.xml create mode 100644 hls4ml/converters/keras/qkeras_layers.py create mode 100644 hls4ml/converters/tf_to_hls.py create mode 100644 hls4ml/model/optimizer/passes/precision_merge.py 
create mode 100644 hls4ml/templates/quartus/firmware/defines.h.bak create mode 100644 hls4ml/templates/quartus/firmware/myproject.cpp.bak create mode 100644 hls4ml/templates/quartus/firmware/myproject.h.bak create mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h.bak create mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h.bak create mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h.bak create mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h.bak create mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h.bak create mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h.bak create mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h.bak create mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h.bak create mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h.bak create mode 100644 hls4ml/templates/quartus/myproject_test_parallel.cpp.bak create mode 100644 hls4ml/templates/vivado/firmware/myproject.cpp.bak create mode 100644 hls4ml/templates/vivado/firmware/myproject.h.bak create mode 100644 hls4ml/templates/vivado/myproject_test.cpp.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_activation.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_array.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_common.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_dense.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_garnet.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_helpers.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_merge.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_mult.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_padding.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_pooling.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h.bak create mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h.bak create mode 100644 test/docker/README.md create mode 100644 test/docker/install_config-2017.2.txt create mode 100644 test/docker/install_config.txt create mode 100644 test/pytest/test_cnn_mnist.py diff --git a/.idea/hls4ml.iml b/.idea/hls4ml.iml new file mode 100644 index 0000000000..57be99f6ff --- /dev/null +++ b/.idea/hls4ml.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + \ No newline at end of file diff --git 
a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000000..3295bcdab3 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml new file mode 100644 index 0000000000..e7025ec2d0 --- /dev/null +++ b/.idea/workspace.xml @@ -0,0 +1,164 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 1684281161699 + + + + \ No newline at end of file diff --git a/hls4ml/.idea/.gitignore b/hls4ml/.idea/.gitignore new file mode 100644 index 0000000000..eaf91e2ac6 --- /dev/null +++ b/hls4ml/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/hls4ml/.idea/hls4ml.iml b/hls4ml/.idea/hls4ml.iml new file mode 100644 index 0000000000..435d23406d --- /dev/null +++ b/hls4ml/.idea/hls4ml.iml @@ -0,0 +1,12 @@ + + + + + + + + + + \ No newline at end of file diff --git a/hls4ml/.idea/inspectionProfiles/profiles_settings.xml b/hls4ml/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000000..105ce2da2d --- /dev/null +++ b/hls4ml/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/hls4ml/.idea/misc.xml b/hls4ml/.idea/misc.xml new file mode 100644 index 0000000000..c6af3c0bae --- /dev/null +++ b/hls4ml/.idea/misc.xml @@ -0,0 +1,10 @@ + + + + + + + + \ No newline at end of file diff --git a/hls4ml/.idea/modules.xml b/hls4ml/.idea/modules.xml new file mode 100644 index 0000000000..7cbe9d42dd --- /dev/null +++ b/hls4ml/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/hls4ml/.idea/vcs.xml b/hls4ml/.idea/vcs.xml new file mode 100644 index 0000000000..2e3f6920d0 --- /dev/null +++ b/hls4ml/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/hls4ml/converters/keras/qkeras_layers.py b/hls4ml/converters/keras/qkeras_layers.py new file mode 100644 index 0000000000..6a3e3e062e --- /dev/null +++ b/hls4ml/converters/keras/qkeras_layers.py @@ -0,0 +1,143 @@ +from hls4ml.converters.keras_to_hls import parse_default_keras_layer +from hls4ml.converters.keras_to_hls import keras_handler + +from hls4ml.converters.keras.core import parse_dense_layer +from hls4ml.converters.keras.core import parse_batchnorm_layer, parse_layernorm_layer +from hls4ml.converters.keras.convolution import parse_conv1d_layer +from hls4ml.converters.keras.convolution import parse_conv2d_layer +from hls4ml.converters.keras.qkeras import * +from hls4ml.converters.keras.multiheadattention import parse_mutiheadattention_layer + +import tensorflow as tf + + +@keras_handler('QDense') +def parse_qdense_layer(keras_layer, input_names, input_shapes, data_reader, config): + layer, output_shape = parse_dense_layer(keras_layer, input_names, input_shapes, data_reader, config) + + layer['weight_quantizer'] = get_quantizer_from_config(keras_layer, 'kernel') + if keras_layer['config']['bias_quantizer'] is not None: + layer['bias_quantizer'] = get_quantizer_from_config(keras_layer, 'bias') + else: + layer['bias_quantizer'] = None + + return layer, output_shape + + +@keras_handler('QMultiHeadAttention') +def parse_qmultiheadattention_layer(keras_layer, input_names, input_shapes, data_reader, config): + assert('QMultiHeadAttention' in 
keras_layer['class_name']) + assert (input_shapes[0] == keras_layer['config']['query_shape']) + + layer, output_shape = parse_mutiheadattention_layer(keras_layer, input_names, input_shapes, data_reader, config) + + layer['weight_quantizer'] = get_quantizer_from_config(keras_layer, 'kernel') + if keras_layer['config']['bias_quantizer'] is not None: + layer['bias_quantizer'] = get_quantizer_from_config(keras_layer, 'bias') + else: + layer['bias_quantizer'] = None + + return layer, output_shape + +@keras_handler('QConv1D', 'QConv2D') +def parse_qconv_layer(keras_layer, input_names, input_shapes, data_reader, config): + assert ('QConv' in keras_layer['class_name']) + + if '1D' in keras_layer['class_name']: + layer, output_shape = parse_conv1d_layer(keras_layer, input_names, input_shapes, data_reader, config) + elif '2D' in keras_layer['class_name']: + layer, output_shape = parse_conv2d_layer(keras_layer, input_names, input_shapes, data_reader, config) + + layer['weight_quantizer'] = get_quantizer_from_config(keras_layer, 'kernel') + if keras_layer['config']['bias_quantizer'] is not None: + layer['bias_quantizer'] = get_quantizer_from_config(keras_layer, 'bias') + else: + layer['bias_quantizer'] = None + + return layer, output_shape + + +@keras_handler('QActivation') +def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader, config): + assert (keras_layer['class_name'] == 'QActivation') + supported_activations = ['quantized_relu', 'quantized_tanh', 'binary_tanh', 'ternary_tanh', 'quantized_bits', + 'quantized_softmax', 'binary', 'ternary'] + + layer = parse_default_keras_layer(keras_layer, input_names) + + activation_config = keras_layer['config']['activation'] + quantizer_obj = get_quantizer(activation_config) + activation_config = {} + # some activations are classes + if hasattr(quantizer_obj, 'get_config'): + activation_config['class_name'] = quantizer_obj.__class__.__name__ + if activation_config['class_name'] == 'ternary' or activation_config['class_name'] == 'binary': + activation_config['class_name'] += '_tanh' + activation_config['config'] = quantizer_obj.get_config() + # some activation quantizers are just functions with no config + else: + activation_config['config'] = {} + if 'binary' in quantizer_obj.__name__: + activation_config['class_name'] = 'binary_tanh' + activation_config['config']['bits'] = 1 + activation_config['config']['integer'] = 1 + elif 'ternary' in quantizer_obj.__name__: + activation_config['class_name'] = 'ternary_tanh' + activation_config['config']['bits'] = 2 + activation_config['config']['integer'] = 2 + else: + activation_config['class_name'] = 'unknown' + + if activation_config['class_name'] not in supported_activations: + raise Exception('Unsupported QKeras activation: {}'.format(activation_config['class_name'])) + + if activation_config['class_name'] == 'ternary_tanh': + layer['class_name'] = 'TernaryTanh' + layer['threshold'] = activation_config.get('config', {}).get('threshold', 0.33) + if layer['threshold'] is None: + layer['threshold'] = 0.33 # the default ternary tanh threshold for QKeras + else: + layer['class_name'] = 'Activation' + if activation_config['class_name'] == 'quantized_bits': + activation_config['class_name'] = 'linear' + if activation_config['class_name'] == 'quantized_softmax': + # activation_config['class_name'] = 'softmax' + layer['class_name'] = 'Softmax' + layer['axis'] = keras_layer['config'].get('axis', -1) + layer['activation'] = activation_config['class_name'].replace('quantized_', '') + return layer, 
[shape for shape in input_shapes[0]]
+
+
+@keras_handler('QBatchNormalization')
+def parse_qbatchnorm_layer(keras_layer, input_names, input_shapes, data_reader, config):
+    layer, output_shape = parse_batchnorm_layer(keras_layer, input_names, input_shapes, data_reader, config)
+
+    layer['mean_quantizer'] = get_quantizer_from_config(keras_layer, 'mean')
+    layer['variance_quantizer'] = get_quantizer_from_config(keras_layer, 'variance')
+    layer['beta_quantizer'] = get_quantizer_from_config(keras_layer, 'beta')
+    layer['gamma_quantizer'] = get_quantizer_from_config(keras_layer, 'gamma')
+
+    return layer, output_shape
+
+
+@keras_handler('QLayerNormalization')
+def parse_qlayernorm_layer(keras_layer, input_names, input_shapes, data_reader, config):
+    layer, output_shape = parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader, config)
+
+    layer['mean_quantizer'] = get_quantizer_from_config(keras_layer, 'mean')
+    layer['variance_quantizer'] = get_quantizer_from_config(keras_layer, 'variance')
+    layer['beta_quantizer'] = get_quantizer_from_config(keras_layer, 'beta')
+    layer['gamma_quantizer'] = get_quantizer_from_config(keras_layer, 'gamma')
+
+    return layer, output_shape
+
+
+@keras_handler('QConv2DBatchnorm')
+def parse_qconv2dbatchnorm_layer(keras_layer, input_names, input_shapes, data_reader, config):
+    # Parse the convolution first, then feed its output shape to the batchnorm parser
+    intermediate_shape = list()
+    conv_layer, shape_qconv = parse_qconv_layer(keras_layer, input_names, input_shapes, data_reader, config)
+    intermediate_shape.append(shape_qconv)
+    temp_shape = intermediate_shape
+    batch_layer, out_shape = parse_batchnorm_layer(keras_layer, input_names, temp_shape, data_reader, config)
+    return {**conv_layer, **batch_layer}, out_shape
+
diff --git a/hls4ml/converters/tf_to_hls.py b/hls4ml/converters/tf_to_hls.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/hls4ml/model/optimizer/passes/precision_merge.py b/hls4ml/model/optimizer/passes/precision_merge.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/hls4ml/templates/quartus/firmware/defines.h.bak b/hls4ml/templates/quartus/firmware/defines.h.bak
new file mode 100644
index 0000000000..49781dc963
--- /dev/null
+++ b/hls4ml/templates/quartus/firmware/defines.h.bak
@@ -0,0 +1,47 @@
+#ifndef DEFINES_H_
+#define DEFINES_H_
+
+/*
+ * Intel HLS makes use of three streaming interfaces:
+ *     (1) stream_in - used as the main input to a component
+ *     (2) stream_out - used as the main output of a component
+ *     (3) stream - allows both reading and writing; used for inter-component connections
+ * ihc::stream has an implicitly deleted constructor and therefore cannot be used as the output of a function/component
+ * Therefore, variables of type 'stream' are always passed by reference
+ */
+
+#ifndef __INTELFPGA_COMPILER__
+
+#include "ac_fixed.h"
+#include "ac_int.h"
+#define hls_register
+
+#include "stream.h"
+template <typename T> using stream = nnet::stream<T>;
+template <typename T> using stream_in = nnet::stream<T>;
+template <typename T> using stream_out = nnet::stream<T>;
+
+#else
+
+#include "HLS/ac_fixed.h"
+#include "HLS/ac_int.h"
+#include "HLS/hls.h"
+
+template <typename T> using stream = ihc::stream<T>;
+template <typename T> using stream_in = ihc::stream_in<T>;
+template <typename T> using stream_out = ihc::stream_out<T>;
+
+#endif
+
+// Include nnet::array - a custom array-like struct, mainly used with io_stream
+#include "nnet_utils/nnet_types.h"
+
+// hls-fpga-machine-learning insert numbers
+
+// hls-fpga-machine-learning insert layer-precision
+
+#define DIV_ROUNDUP(n, d) ((n + d - 1) / d)
+#define MIN(n, d) (n > d ?
d : n)
+#define MAX(n, d) (n < d ? d : n)
+
+#endif
diff --git a/hls4ml/templates/quartus/firmware/myproject.cpp.bak b/hls4ml/templates/quartus/firmware/myproject.cpp.bak
new file mode 100644
index 0000000000..3f5749d611
--- /dev/null
+++ b/hls4ml/templates/quartus/firmware/myproject.cpp.bak
@@ -0,0 +1,48 @@
+#include "myproject.h"
+#include "parameters.h"
+
+// hls-fpga-machine-learning insert weights
+
+/*
+ * Intel HLS requires that all 'stream' types are:
+ *     (1) Passed by reference to the top-level entity or
+ *     (2) Declared as global variables, outside of the main function
+ * Therefore, layer inputs/outputs (connections between individual layers) are declared here
+ */
+// hls-fpga-machine-learning insert inter-task streams
+
+#ifndef __INTELFPGA_COMPILER__
+/*
+ * The top-level function used during GCC compilation / hls4ml.predict(...) goes here
+ * An important distinction is made between io_stream and io_parallel:
+ *     (1) io_parallel:
+ *         - Top-level function takes a struct containing an array as function argument
+ *         - Returns a struct containing an array - the prediction
+ *     (2) io_stream:
+ *         - Top-level function is 'void' - no return value
+ *         - Instead, both the input and output are passed by reference
+ *         - This is due to the HLS Streaming Interfaces; a stream cannot be copied (implicitly deleted copy constructor)
+ * This distinction is handled in quartus_writer.py
+ */
+// hls-fpga-machine-learning instantiate GCC top-level
+#else
+// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here
+// hls-fpga-machine-learning insert cpragmas
+
+/*
+ * The top-level function used during HLS Synthesis goes here
+ * In a similar manner to GCC, there is a distinction between io_stream & io_parallel
+ */
+// hls-fpga-machine-learning instantiate HLS top-level
+#endif
+// If using io_parallel, the output needs to be initialised and returned at the end of this function
+// If using io_stream, no output is initialised, as it is passed by reference to the top-level function
+// hls-fpga-machine-learning initialize input/output
+
+// ****************************************
+// NETWORK INSTANTIATION
+// ****************************************
+
+// hls-fpga-machine-learning insert layers
+
+// hls-fpga-machine-learning return
diff --git a/hls4ml/templates/quartus/firmware/myproject.h.bak b/hls4ml/templates/quartus/firmware/myproject.h.bak
new file mode 100644
index 0000000000..afb7020671
--- /dev/null
+++ b/hls4ml/templates/quartus/firmware/myproject.h.bak
@@ -0,0 +1,48 @@
+#ifndef MYPROJECT_H_
+#define MYPROJECT_H_
+
+#ifndef __INTELFPGA_COMPILER__
+#include "ac_fixed.h"
+#include "ac_int.h"
+#define hls_register
+#else
+#include "HLS/ac_fixed.h"
+#include "HLS/ac_int.h"
+#include "HLS/hls.h"
+#endif
+
+// Streams are explicitly defined in defines.h, which is included via parameters.h
+// Defining them again in this file will cause compile-time errors
+#include "defines.h"
+
+// If using io_parallel, inputs and outputs need to be initialised before calling the top-level function
+// If using io_stream, no inputs/outputs are initialised, as they are passed by reference to the top-level function
+// hls-fpga-machine-learning insert inputs
+// hls-fpga-machine-learning insert outputs
+
+#ifndef __INTELFPGA_COMPILER__
+/*
+ * The top-level function used during GCC compilation / hls4ml.predict(...) goes here
+ * An important distinction is made between io_stream and io_parallel:
+ *     (1) io_parallel:
+ *         - Top-level function takes a struct containing an array as function argument
+ *         - Returns a struct containing an array - the prediction
+ *     (2) io_stream:
+ *         - Top-level function is 'void' - no return value
+ *         - Instead, both the input and output are passed by reference
+ *         - This is due to the HLS Streaming Interfaces; a stream cannot be copied (implicitly deleted copy constructor)
+ * This distinction is handled in quartus_writer.py
+ */
+// hls-fpga-machine-learning instantiate GCC top-level
+#else
+// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here
+// hls-fpga-machine-learning insert cpragmas
+
+/*
+ * The top-level function used during HLS Synthesis goes here
+ * In a similar manner to GCC, there is a distinction between io_stream & io_parallel
+ */
+// hls-fpga-machine-learning instantiate HLS top-level
+#endif
+
+#endif
diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h.bak
new file mode 100644
index 0000000000..f8c4ae7c64
--- /dev/null
+++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h.bak
@@ -0,0 +1,104 @@
+#ifndef NNET_BATCHNORM_H_
+#define NNET_BATCHNORM_H_
+
+#include "nnet_common.h"
+#include "nnet_helpers.h"
+#include "nnet_mult.h"
+
+namespace nnet {
+
+struct batchnorm_config {
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float scale_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0;
+    // partitioning arrays cyclically to go with roll factors?
+
+    // Default multiplication
+    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
+               const typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
+               const typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) {
+// Calculate result
+Result:
+    //#pragma unroll
+    for (int ires = 0; ires < CONFIG_T::n_in; ires++) {
+        if (CONFIG_T::n_filt == -1) {
+            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[ires]) +
+                        bias[ires];
+        } else {
+            int norm_index = ires % CONFIG_T::n_filt;
+            res[ires] =
+                CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[norm_index]) +
+                bias[norm_index];
+        }
+    }
+}
+
+// ****************************************************
+// Merged Batch Normalization and Quantized Tanh
+// ****************************************************
+struct batchnorm_quantized_tanh_config {
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const unsigned n_zeros = 0;
+};
+
+template <class data_T, typename CONFIG_T>
+void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in],
+                           const data_T threshold[CONFIG_T::n_scale_bias]) {
+    //#pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        ac_int<1, false> cache;
+        data_T datareg = data[ii];
+        int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt;
+        if (datareg >= threshold[norm_index])
+            cache = 1;
+        else
+            cache = 0;
+
+        res[ii] = cache;
+    }
+}
+
+template <class data_T, typename CONFIG_T>
+void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ac_int<2, true> res[CONFIG_T::n_in],
+                            const data_T threshold_hi[CONFIG_T::n_scale_bias],
+                            const data_T threshold_lo[CONFIG_T::n_scale_bias]) {
+    //#pragma unroll
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        ac_int<2, true> cache;
+        data_T datareg = data[ii];
+        int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt;
+        if (datareg > threshold_hi[norm_index])
+            cache = 1;
+        else if (datareg <= threshold_lo[norm_index])
+            cache = -1;
+        else
+            cache = 0;
+        res[ii] = cache;
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h.bak
new file mode 100644
index 0000000000..6973e51a76
--- /dev/null
+++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h.bak
@@ -0,0 +1,71 @@
+#ifndef NNET_COMMON_H_
+#define NNET_COMMON_H_
+
+#ifndef __INTELFPGA_COMPILER__
+#include "ac_fixed.h"
+#include "ac_int.h"
+#include "math.h"
+#else
+#include "HLS/ac_fixed.h"
+#include "HLS/ac_int.h"
+#include "HLS/math.h"
+#endif
+
+#include "nnet_helpers.h"
+
+typedef ac_fixed<16, 6> table_default_t;
+
+namespace nnet {
+
+// Common type definitions
+enum io_type { io_parallel = 0, io_stream };
+
+// Default data types (??) TODO: Deprecate
+typedef ac_fixed<16, 4> weight_t_def;
+typedef ac_fixed<16, 4> bias_t_def;
+typedef ac_fixed<32, 10> accum_t_def;
+
+template <class data_T, int NIN1, int NIN2> void merge(data_T data1[NIN1], data_T data2[NIN2], data_T res[NIN1 + NIN2]) {
+    //#pragma unroll
+    for (int ii = 0; ii < NIN1; ii++) {
+        res[ii] = data1[ii];
+    }
+    //#pragma unroll
+    for (int ii = 0; ii < NIN2; ii++) {
+        res[NIN1 + ii] = data2[ii];
+    }
+}
+
+/* ---
+ * Balanced tree reduce implementation.
+ * For use in scenarios where Quartus cannot automatically balance expressions.
+ * Reduces an array of inputs to a single value using the template binary operator 'Op',
+ * for example summing all elements with Op_add, or finding the maximum with Op_max
+ * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section
+ * before applying and accumulate the result over the rolled dimension.
+ * --- */
+template <class T, int N, class Op> T reduce(const T *x, Op op) {
+    static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0;
+    static constexpr int rightN = N - leftN > 0 ? N - leftN : 0;
+    if (N == 1) {
+        return x[0];
+    }
+    if (N == 2) {
+        return op(x[0], x[1]);
+    }
+    return op(reduce<T, leftN, Op>(x, op), reduce<T, rightN, Op>(x + leftN, op));
+}
+
+template <class T> class Op_add {
+  public:
+    T operator()(T a, T b) { return a + b; }
+};
+
+template <class T> class Op_max {
+  public:
+    T operator()(T a, T b) { return a >= b ? a : b; }
+};
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h.bak
new file mode 100644
index 0000000000..579606519f
--- /dev/null
+++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h.bak
@@ -0,0 +1,64 @@
+#ifndef NNET_CONV1D_H_
+#define NNET_CONV1D_H_
+
+#include "nnet_common.h"
+#include "nnet_conv1d_resource.h"
+
+namespace nnet {
+
+struct conv1d_config {
+    // I/O sizes
+    static const unsigned in_width = 10;
+    static const unsigned out_width = 10;
+
+    // Number of channels, filters
+    static const unsigned n_chan = 1;
+    static const unsigned n_filt = 1;
+
+    // Original filter size
+    static const unsigned filt_width = 1;
+    static const unsigned kernel_size = filt_width;
+
+    // Modified filter size (post-Winograd transformation, if applied)
+    static const unsigned impl_filt_height = 1;
+    static const unsigned impl_filt_width = 1;
+
+    // Padding, stride, dilation
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+    static const unsigned stride_width = 1;
+    static const unsigned dilation = 1;
+
+    // Run-time Configuration
+    static const unsigned n_zeros = 0;
+    static const unsigned reuse_factor = 1;
+    static const unsigned parallelisation_factor = 1;
+
+    // TODO: BRAM Storage on Quartus
+    static const bool store_weights_in_bram = false;
+
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float weight_t;
+    typedef float accum_t;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
+                const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
+                          res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                          const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                          const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+    assert(CONFIG_T::filt_width == 1);
+    pointwise_conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h.bak
new file mode 100644
index 0000000000..99187814ec
--- /dev/null
+++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h.bak
@@ -0,0 +1,169 @@
+#ifndef NNET_DENSE_LARGE_H_
+#define NNET_DENSE_LARGE_H_
+
+#include "nnet_common.h"
+#include "nnet_helpers.h"
+#include "nnet_mult.h"
+
+namespace nnet {
+
+struct dense_config {
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float weight_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_out = 10;
+
+    static const unsigned reuse_factor = 1;
+    static const unsigned block_factor = 1;      // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor);
+    static const unsigned multiplier_limit = 1;  // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor)
+    static const unsigned multiplier_factor = 1; // min n_in, rf
+    static const unsigned multiplier_scale = 1;  // M_LIMIT/CONFIG_T::n_out;
+    static const unsigned reciprocal = 1;        // 2^35 / 25
+    static const unsigned rf_pad = 0;
+    static const unsigned bf_pad = 0;
+    // Resource reuse info
+    static const unsigned
io_type = io_parallel; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? + + // Default multiplication + template using product = nnet::product::mult; +}; + +template +void dense_rf_gt(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); + ////#pragma ii CONFIG_T::reuse_factor + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +Load: + //#pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + hls_register int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + + //#pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + //#pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32 w_index = ir + CONFIG_T::reuse_factor * im; + out_index[ir][im] = (w_index / CONFIG_T::multiplier_factor).to_int(); + d_index[ir][im] = w_index % CONFIG_T::n_in; + } + } +Product1: + //#pragma nofusion + //#pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; + Product2: + //#pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (w_index >= CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded) + continue; + int data_index = d_index[ir][im]; + // Modified this + tmp_acc[im] = + CONFIG_T::template product::product(data[data_index], weights[w_index]); + } + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; + ResetMult: + //#pragma unroll + for (int imult = 0; imult < CONFIG_T::multiplier_limit; imult++) { + mult[imult] = 0; + } + AccumLoop1: + //#pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + int o_index = out_index[ir][im]; + if (o_index >= CONFIG_T::n_out) + continue; // check out of bounds + mult[o_index] += tmp_acc[im]; + } + AccumLoop2: + //#pragma unroll + for (int im = 0; im < CONFIG_T::multiplier_limit; im++) { + acc[im] += mult[im]; + } + } +Store: + //#pragma unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); // acc[jj]; + } +} +template +void dense_rf_lt(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN"); + + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +InitAccum: + //#pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } +ReuseLoop: + //#pragma nofusion + 
//#pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; + MultLoop: + //#pragma unroll + for (int im = 0, in_index = ir; im < CONFIG_T::block_factor; im++) { + uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; + // Modified this + mult[im] = + CONFIG_T::template product::product(data[in_index], weights[w_index]); + in_index += CONFIG_T::reuse_factor; + if (in_index >= CONFIG_T::n_in) + in_index = ir; + } + AccumLoop: + //#pragma unroll + for (int im = 0, out_index = 0, acc_step = 0; im < CONFIG_T::block_factor; im++) { + acc[out_index] += mult[im]; + if (acc_step + 1 >= CONFIG_T::multiplier_scale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } +// Cast to "res_t" type +Result: + //#pragma unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); + } +} +template +void dense_resource( + data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_rf_lt(data, res, weights, biases); + } else { + dense_rf_gt(data, res, weights, biases); + } +} +} // namespace nnet +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h.bak new file mode 100644 index 0000000000..dcda87d316 --- /dev/null +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h.bak @@ -0,0 +1,80 @@ +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + +InitAccum: + //#pragma unroll + for (int i = 0; i < CONFIG_T::n_out; i++) { + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + hls_register data_T inputs[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + + //#pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + //#pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + uint32 w = ir + CONFIG_T::reuse_factor * im; + inputs[ir][im] = data[weights[w].row_index]; + out_index[ir][im] = weights[w].col_index; + } + } +ReuseLoop: + //#pragma nofusion + //#pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; + CompressedMultLoop: + //#pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + uint32 w = ir + CONFIG_T::reuse_factor * im; + // if (w >= CONFIG_T::reuse_factor*CONFIG_T::compressed_block_factor) continue; + typename CONFIG_T::accum_t prod = mult[im] = + CONFIG_T::template product::product(inputs[0][im], weights[w].weight); + //#pragma unroll + for (int is = 0; is < CONFIG_T::reuse_factor - 1; is++) { + inputs[is][im] = inputs[is + 1][im]; + } + } + hls_register 
typename CONFIG_T::accum_t tmp_acc[CONFIG_T::n_out]; + ResetMult: + //#pragma unroll + for (int tacc = 0; tacc < CONFIG_T::n_out; tacc++) { + tmp_acc[tacc] = 0; + } + AccumLoop1: + //#pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + int col = out_index[ir][im]; + tmp_acc[col] += mult[im]; + } + AccumLoop2: + //#pragma unroll + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += tmp_acc[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + //#pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h.bak new file mode 100644 index 0000000000..775303e267 --- /dev/null +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h.bak @@ -0,0 +1,140 @@ +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = dstType(src[i]); + } +} + +template void convert_data_back(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = static_cast(src[i].to_double()); + } +} + +template void convert_data(srcType *src, stream_in &dst) { + for (size_t i = 0; i < SIZE / dstType::size; i++) { + dstType ctype; + for (size_t j = 0; j < dstType::size; j++) { + ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); + } + dst.write(ctype); + } +} + +template void convert_data_back(stream_out &src, dstType *dst) { + for (size_t i = 0; i < SIZE / srcType::size; i++) { + srcType ctype = src.read(); + for (size_t j = 0; j < srcType::size; j++) { + dst[i * srcType::size + j] = dstType(ctype[j].to_double()); + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } + +constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } + +constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = static_cast(data[i].to_double()); + } +} + +template void save_output_array(stream &data, save_T *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = static_cast(ctype[j].to_double()); + } + data.write(ctype); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
<< std::endl;
+        }
+    } else {
+        std::ostringstream filename;
+        filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data
+        std::fstream out;
+        out.open(filename.str(), std::ios::app);
+        assert(out.is_open());
+        for (int i = 0; i < layer_size; i++) {
+            out << data[i] << " "; // We don't care about precision in text files
+        }
+        out << std::endl;
+        out.close();
+    }
+}
+
+template <class data_T> void save_layer_output(stream<data_T> &data, const char *layer_name, size_t layer_size) {
+    if (!trace_enabled)
+        return;
+
+    if (trace_outputs) {
+        if (trace_outputs->count(layer_name) > 0) {
+            if (trace_type_size == 4) {
+                save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size);
+            } else if (trace_type_size == 8) {
+                save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size);
+            } else {
+                std::cout << "Unknown trace type!" << std::endl;
+            }
+        } else {
+            std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl;
+        }
+    } else {
+        std::ostringstream filename;
+        filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data
+        std::fstream out;
+        out.open(filename.str(), std::ios::app);
+        assert(out.is_open());
+        for (size_t i = 0; i < layer_size / data_T::size; i++) {
+            data_T ctype = data.read();
+            for (size_t j = 0; j < data_T::size; j++) {
+                out << ctype[j] << " ";
+            }
+            data.write(ctype);
+        }
+        out << std::endl;
+        out.close();
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h.bak
new file mode 100644
index 0000000000..b24f56dc18
--- /dev/null
+++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h.bak
@@ -0,0 +1,249 @@
+#ifndef NNET_MERGE_H_
+#define NNET_MERGE_H_
+
+#include "nnet_mult.h"
+
+namespace nnet {
+
+struct merge_config {
+    static const unsigned n_elem = 10;
+};
+
+struct dot_config {
+    static const unsigned n_in = 10;
+    static const unsigned n_out = 1;
+
+    static const unsigned reuse_factor = 1;
+
+    typedef float accum_t;
+
+    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+};
+
+struct concat_config {
+    static const unsigned n_elem1_0 = 10;
+    static const unsigned n_elem1_1 = 10;
+    static const unsigned n_elem1_2 = 10;
+    static const unsigned n_elem2_0 = 10;
+    static const unsigned n_elem2_1 = 10;
+    static const unsigned n_elem2_2 = 10;
+
+    static const unsigned axis = -1;
+};
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
+    //#pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem; i++) {
+        res[i] = static_cast<res_T>(data1[i] + data2[i]);
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
+    //#pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem; i++) {
+        res[i] = static_cast<res_T>(data1[i] - data2[i]);
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
+    //#pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem; i++) {
+        res[i] = static_cast<res_T>(data1[i] * data2[i]);
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
+    //#pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem; i++) {
+        res[i] = static_cast<res_T>((data1[i] + data2[i]) / (res_T)2);
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
+    //#pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem; i++) {
+        res[i] = (data1[i] > data2[i]) ? static_cast<res_T>(data1[i]) : static_cast<res_T>(data2[i]);
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
+    //#pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem; i++) {
+        res[i] = (data1[i] < data2[i]) ? static_cast<res_T>(data1[i]) : static_cast<res_T>(data2[i]);
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) {
+    constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor);
+
+    hls_register typename CONFIG_T::accum_t mult[CONFIG_T::n_in];
+Product:
+    //#pragma unroll multiplier_limit
+    for (int i = 0; i < CONFIG_T::n_in; i++) {
+        mult[i] = CONFIG_T::template product<input1_T, input2_T>::product(data1[i], data2[i]);
+    }
+
+    hls_register typename CONFIG_T::accum_t acc = 0;
+Accum:
+    //#pragma unroll
+    for (int i = 0; i < CONFIG_T::n_in; i++) {
+        acc += mult[i];
+    }
+
+    res[0] = static_cast<res_T>(acc);
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0],
+                   res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) {
+    //#pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+        res[i] = static_cast<res_T>(data1[i]);
+    }
+
+    //#pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem2_0; i++) {
+        res[CONFIG_T::n_elem1_0 + i] = static_cast<res_T>(data2[i]);
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1],
+                     input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1],
+                     res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) {
+    //#pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; i++) {
+        res[i] = static_cast<res_T>(data1[i]);
+    }
+
+    //#pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; i++) {
+        res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + i] = static_cast<res_T>(data2[i]);
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1],
+                     input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1],
+                     res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) {
+    for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+        //#pragma unroll
+        for (int j = 0; j < CONFIG_T::n_elem1_1; j++) {
+            res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + j] =
+                static_cast<res_T>(data1[i * CONFIG_T::n_elem1_1 + j]);
+        }
+
+        //#pragma unroll
+        for (int j = 0; j < CONFIG_T::n_elem2_1; j++) {
+            res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + j] =
+                static_cast<res_T>(data2[i * CONFIG_T::n_elem2_1 + j]);
+        }
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1],
+                   input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1],
+                   res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) {
+    if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) {
+        concatenate2d_1<input1_T, input2_T, res_T, CONFIG_T>(data1, data2, res);
+    } else {
+        concatenate2d_0<input1_T, input2_T, res_T, CONFIG_T>(data1, data2, res);
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2],
+                     input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2],
+                     res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 +
+                               CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) {
+    //#pragma unroll
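+    // Illustration: along axis 0, concatenation in flattened row-major order is just the
+    // two inputs laid out back to back. E.g. for inputs of shape (2, 3, 4) and (5, 3, 4),
+    // res[0..23] holds all of data1 and res[24..83] holds all of data2, so the two flat
+    // copy loops below are sufficient. (Shapes here are illustrative, not from a generated config.)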
+    for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; i++) {
+        res[i] = static_cast<res_T>(data1[i]);
+    }
+
+    //#pragma unroll
+    for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; i++) {
+        res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + i] = static_cast<res_T>(data2[i]);
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2],
+                     input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2],
+                     res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 +
+                               CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) {
+    for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+        for (int j = 0; j < CONFIG_T::n_elem1_1; j++) {
+            //#pragma unroll
+            for (int k = 0; k < CONFIG_T::n_elem1_2; k++) {
+                int res_idx =
+                    i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k;
+                int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k;
+                res[res_idx] = static_cast<res_T>(data1[data_idx]);
+            }
+        }
+
+        for (int j = 0; j < CONFIG_T::n_elem2_1; j++) {
+            //#pragma unroll
+            for (int k = 0; k < CONFIG_T::n_elem2_2; k++) {
+                int res_idx = i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 +
+                              (j + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + k;
+                int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k;
+                res[res_idx] = static_cast<res_T>(data2[data_idx]);
+            }
+        }
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2],
+                     input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2],
+                     res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 +
+                               CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) {
+    for (int i = 0; i < CONFIG_T::n_elem1_0; i++) {
+        for (int j = 0; j < CONFIG_T::n_elem1_1; j++) {
+
+            //#pragma unroll
+            for (int k = 0; k < CONFIG_T::n_elem1_2; k++) {
+                int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) +
+                              j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k;
+                int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k;
+                res[res_idx] = static_cast<res_T>(data1[data_idx]);
+            }
+
+            //#pragma unroll
+            for (int k = 0; k < CONFIG_T::n_elem2_2; k++) {
+                int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) +
+                              j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k + CONFIG_T::n_elem1_2;
+                int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k;
+                res[res_idx] = static_cast<res_T>(data2[data_idx]);
+            }
+        }
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2],
+                   input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2],
+                   res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 +
+                             CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) {
+    if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) {
+        concatenate3d_2<input1_T, input2_T, res_T, CONFIG_T>(data1, data2, res);
+    } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) {
+        concatenate3d_1<input1_T, input2_T, res_T, CONFIG_T>(data1, data2, res);
+    } else {
+        concatenate3d_0<input1_T, input2_T, res_T, CONFIG_T>(data1, data2, res);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h.bak
new file mode 100644
index 0000000000..085fabf99f
--- /dev/null
+++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h.bak
@@ -0,0 +1,113 @@
+#ifndef NNET_MULT_H_
+#define NNET_MULT_H_
+
+#include "nnet_common.h"
+#include "nnet_helpers.h"
+#include <math.h>
+
+namespace nnet {
+
+// Different methods to perform the product of input and weight, depending on their types.
+namespace product {
+
+class Product {
+  public:
+    static void limit(unsigned multiplier_limit) {}
+};
+
+template <class x_T, class w_T> class both_binary : public Product {
+  public:
+    inline static x_T product(x_T a, w_T w) {
+        // specialisation for 1-bit weights and incoming data
+        return a == w;
+    }
+};
+
+template <class x_T, class w_T> class weight_binary : public Product {
+  public:
+    inline static auto product(x_T a, w_T w) -> decltype(-a) {
+        // Specialisation for 1-bit weights, arbitrary data
+        if (w == 0)
+            return -a;
+        else
+            return a;
+    }
+};
+
+template <class x_T, class w_T> class data_binary : public Product {
+  public:
+    inline static auto product(x_T a, w_T w) -> decltype(-w) {
+        // Specialisation for 1-bit data, arbitrary weight
+        if (a == 0)
+            return -w;
+        else
+            return w;
+    }
+};
+
+template <class x_T, class w_T> class weight_ternary : public Product {
+  public:
+    inline static auto product(x_T a, w_T w) -> decltype(-a) {
+        // Specialisation for 2-bit weights, arbitrary data
+        if (w == 0)
+            return 0;
+        else if (w == -1)
+            return -a;
+        else
+            return a; // if(w == 1)
+    }
+};
+
+template <class x_T, class w_T> class mult : public Product {
+  public:
+    inline static auto product(x_T a, w_T w) -> decltype(a * w) {
+        // 'Normal' product
+        return a * w;
+    }
+    static void limit(unsigned multiplier_limit) {
+        // TODO: Implement for Quartus
+        // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation -> Vivado-only, replace with Intel HLS
+        // pragma
+    }
+};
+
+template <class x_T, class w_T> class weight_exponential : public Product {
+  public:
+    using r_T =
+        ac_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width), true>;
+    inline static r_T product(x_T a, w_T w) {
+        // Shift product for exponential weights
+        // Shift by the exponent. Negative weights shift right
+        r_T y = static_cast<r_T>(a) << w.weight;
+
+        // Negate or not depending on weight sign
+        return w.sign == 1 ? y : static_cast<r_T>(-y);
+    }
+};
+} // namespace product
+
+// TO-DO: These may need extra variants if ac_int types are used in more places
+template <class data_T, class res_T, typename CONFIG_T>
+inline typename std::enable_if<std::is_same<data_T, ac_int<1, false>>::value &&
+                                   std::is_same<typename CONFIG_T::weight_t, ac_int<1, false>>::value,
+                               ac_int<nnet::ceillog2(CONFIG_T::n_in) + 2, true>>::type
+cast(typename CONFIG_T::accum_t x) {
+    return static_cast<ac_int<nnet::ceillog2(CONFIG_T::n_in) + 2, true>>(((x - CONFIG_T::n_in / 2) * 2).to_ac_int());
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+inline typename std::enable_if<std::is_same<data_T, ac_int<1, false>>::value &&
                                   !std::is_same<typename CONFIG_T::weight_t, ac_int<1, false>>::value,
+                               res_T>::type
+cast(typename CONFIG_T::accum_t x) {
+    return static_cast<res_T>(x);
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+inline typename std::enable_if<(!std::is_same<data_T, ac_int<1, false>>::value), res_T>::type
+cast(typename CONFIG_T::accum_t x) {
+    return static_cast<res_T>(x);
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h.bak
new file mode 100644
index 0000000000..7e3fa9e55a
--- /dev/null
+++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h.bak
@@ -0,0 +1,99 @@
+#ifndef NNET_PADDING_H_
+#define NNET_PADDING_H_
+
+namespace nnet {
+
+struct padding1d_config {
+    static const unsigned in_width = 10;
+    static const unsigned out_width = 10;
+    static const unsigned n_chan = 10;
+
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) {
+    for (int i = 0; i < CONFIG_T::pad_left; i++) {
+        //#pragma unroll
+        for (int j = 0; j < CONFIG_T::n_chan; j++) {
+            *(res++) = 0;
+        }
+    }
+
+    for (int i = 0; i < CONFIG_T::in_width; i++) {
+        //#pragma unroll
+        for (int j = 0; j < CONFIG_T::n_chan; j++) {
+            *(res++) = (res_T) * (data++);
+        }
+    }
+
+    for (int i = 0; i < CONFIG_T::pad_right; i++) {
+        //#pragma unroll
+        for (int j = 0; j < CONFIG_T::n_chan; j++) {
+            *(res++) = 0;
+        }
+    }
+}
+
+struct padding2d_config {
+    static const unsigned in_height = 10;
+    static const unsigned in_width = 10;
+
+    static const unsigned out_height = 10;
+    static const unsigned out_width = 10;
+
+    static const unsigned n_chan = 10;
+
+    static const unsigned pad_top = 0;
+    static const unsigned pad_bottom = 0;
+    static const unsigned pad_left = 0;
+    static const unsigned pad_right = 0;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width],
+                  res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) {
+    for (int i = 0; i < CONFIG_T::pad_top; i++) {
+        for (int j = 0; j < CONFIG_T::out_width; j++) {
+            //#pragma unroll
+            for (int k = 0; k < CONFIG_T::n_chan; k++) {
+                *(res++) = 0;
+            }
+        }
+    }
+
+    for (int i = 0; i < CONFIG_T::in_height; i++) {
+        for (int j = 0; j < CONFIG_T::pad_left; j++) {
+            //#pragma unroll
+            for (int k = 0; k < CONFIG_T::n_chan; k++) {
+                *(res++) = 0;
+            }
+        }
+        for (int j = 0; j < CONFIG_T::in_width; j++) {
+            //#pragma unroll
+            for (int k = 0; k < CONFIG_T::n_chan; k++) {
+                *(res++) = (res_T) * (data++);
+            }
+        }
+        for (int j = 0; j < CONFIG_T::pad_right; j++) {
+            //#pragma unroll
+            for (int k = 0; k < CONFIG_T::n_chan; k++) {
+                *(res++) = 0;
+            }
+        }
+    }
+
+    for (int i = 0; i < CONFIG_T::pad_bottom; i++) {
+        for (int j = 0; j < CONFIG_T::out_width; j++) {
+            //#pragma unroll
+            for (int k = 0; k < CONFIG_T::n_chan; k++) {
+                *(res++) = 0;
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/quartus/myproject_test_parallel.cpp.bak b/hls4ml/templates/quartus/myproject_test_parallel.cpp.bak
new file mode 100644
index 0000000000..4de819eb49
--- /dev/null
+++ b/hls4ml/templates/quartus/myproject_test_parallel.cpp.bak
@@ -0,0 +1,112 @@
+#include <cctype>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "firmware/myproject.h"
+#include "firmware/parameters.h"
+
+// hls-fpga-machine-learning insert bram
+
+#define CHECKPOINT 5000
+
+// This function is written to avoid stringstream, which is
+// not supported in cosim 20.1, and because strtok
+// requires a const_cast or allocation to use with std::strings.
+// This function returns the next float (by argument) at position pos,
+// updating pos. True is returned if a conversion was done, false if the string
+// has ended, and a std::invalid_argument exception is thrown if the string was bad.
+bool nextToken(const std::string &str, std::size_t &pos, float &val) {
+    while (pos < str.size() && std::isspace(static_cast<unsigned char>(str[pos]))) {
+        pos++;
+    }
+    if (pos >= str.size()) {
+        return false;
+    }
+    std::size_t offset = 0;
+    val = std::stof(str.substr(pos), &offset);
+    pos += offset;
+    return true;
+}
+
+int main(int argc, char **argv) {
+    // load input data from text file
+    std::ifstream fin("tb_data/tb_input_features.dat");
+    // load predictions from text file
+    std::ifstream fpr("tb_data/tb_output_predictions.dat");
+
+    std::string RESULTS_LOG = "tb_data/results.log";
+    std::ofstream fout(RESULTS_LOG);
+
+    std::string iline;
+    std::string pline;
+
+    std::vector inputs;
+    std::vector outputs;
+
+    if (fin.is_open() && fpr.is_open()) {
+        std::vector<std::vector<float>> predictions;
+        unsigned int num_iterations = 0;
+        for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) {
+            if (num_iterations % CHECKPOINT == 0) {
+                std::cout << "Processing input " << num_iterations << std::endl;
+            }
+
+            std::vector<float> in;
+            std::vector<float> pr;
+            float current;
+
+            std::size_t pos = 0;
+            while (nextToken(iline, pos, current)) {
+                in.push_back(current);
+            }
+
+            pos = 0;
+            while (nextToken(pline, pos, current)) {
+                pr.push_back(current);
+            }
+
+            // hls-fpga-machine-learning insert data
+            predictions.push_back(std::move(pr));
+        }
+
+        // Do this separately to avoid vector reallocation
+        // hls-fpga-machine-learning insert top-level-function
+
+        // hls-fpga-machine-learning insert run
+
+        for (unsigned int j = 0; j < num_iterations; j++) {
+            // hls-fpga-machine-learning insert tb-output
+            if (j % CHECKPOINT == 0) {
+                std::cout << "Predictions" << std::endl;
+                // hls-fpga-machine-learning insert predictions
+                std::cout << "Quantized predictions" << std::endl;
+                // hls-fpga-machine-learning insert quantized
+            }
+        }
+        fin.close();
+        fpr.close();
+    } else {
+        const unsigned int num_iterations = 10;
+        std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations
+                  << " invocations."
<< std::endl; + // hls-fpga-machine-learning insert zero + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert run + + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output + } + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} diff --git a/hls4ml/templates/vivado/firmware/myproject.cpp.bak b/hls4ml/templates/vivado/firmware/myproject.cpp.bak new file mode 100644 index 0000000000..74b58c5cb1 --- /dev/null +++ b/hls4ml/templates/vivado/firmware/myproject.cpp.bak @@ -0,0 +1,23 @@ +#include + +#include "myproject.h" +#include "parameters.h" + +// hls-fpga-machine-learning insert namespace-start + +void myproject( + // hls-fpga-machine-learning insert header +) { + + // hls-fpga-machine-learning insert IO + + // hls-fpga-machine-learning insert load weights + + // **************************************** + // NETWORK INSTANTIATION + // **************************************** + + // hls-fpga-machine-learning insert layers +} + +// hls-fpga-machine-learning insert namespace-end diff --git a/hls4ml/templates/vivado/firmware/myproject.h.bak b/hls4ml/templates/vivado/firmware/myproject.h.bak new file mode 100644 index 0000000000..a56778976b --- /dev/null +++ b/hls4ml/templates/vivado/firmware/myproject.h.bak @@ -0,0 +1,19 @@ +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#include "ap_fixed.h" +#include "ap_int.h" +#include "hls_stream.h" + +#include "defines.h" + +// hls-fpga-machine-learning insert namespace-start + +// Prototype of top level function for C-synthesis +void myproject( + // hls-fpga-machine-learning insert header +); + +// hls-fpga-machine-learning insert namespace-end + +#endif diff --git a/hls4ml/templates/vivado/myproject_test.cpp.bak b/hls4ml/templates/vivado/myproject_test.cpp.bak new file mode 100644 index 0000000000..29a4c816e5 --- /dev/null +++ b/hls4ml/templates/vivado/myproject_test.cpp.bak @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "firmware/myproject.h" +#include "firmware/nnet_utils/nnet_helpers.h" + +// hls-fpga-machine-learning insert bram + +#define CHECKPOINT 5000 + +namespace nnet { +bool trace_enabled = true; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +int main(int argc, char **argv) { + // hls-fpga-machine-learning insert namespace + + // load input data from text file + std::ifstream fin("tb_data/tb_input_features.dat"); + // load predictions from text file + std::ifstream fpr("tb_data/tb_output_predictions.dat"); + +#ifdef RTL_SIM + std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; +#else + std::string RESULTS_LOG = "tb_data/csim_results.log"; +#endif + std::ofstream fout(RESULTS_LOG); + + std::string iline; + std::string pline; + int e = 0; + + if (fin.is_open() && fpr.is_open()) { + while (std::getline(fin, iline) && std::getline(fpr, pline)) { + if (e % CHECKPOINT == 0) + std::cout << "Processing input " << e << std::endl; + char *cstr = const_cast(iline.c_str()); + char *current; + std::vector in; + current = strtok(cstr, " "); + while (current != NULL) { + in.push_back(atof(current)); + current = strtok(NULL, " "); + } + cstr = const_cast(pline.c_str()); + std::vector pr; + current = strtok(cstr, " "); + while (current != NULL) { + pr.push_back(atof(current)); + current = strtok(NULL, " "); + } + + // hls-fpga-machine-learning 
insert data + + // hls-fpga-machine-learning insert top-level-function + + if (e % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + } + e++; + + // hls-fpga-machine-learning insert tb-output + } + fin.close(); + fpr.close(); + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input." << std::endl; + + // hls-fpga-machine-learning insert zero + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h.bak new file mode 100644 index 0000000000..1dc96e50a0 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h.bak @@ -0,0 +1,795 @@ +#ifndef NNET_ACTIVATION_H_ +#define NNET_ACTIVATION_H_ + +#include "ap_fixed.h" +#include "nnet_common.h" +#include + +namespace nnet { + +struct activ_config { + // IO size + static const unsigned n_in = 10; + + // Internal info + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef ap_fixed<18, 8> table_t; +}; + +// ************************************************* +// LINEAR Activation -- See Issue 53 +// ************************************************* +template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + res[ii] = data[ii]; + } +} + +// ************************************************* +// RELU Activation +// ************************************************* +template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +template +void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg < 0) + res[ii] = 0; + else if (datareg > MAX_INT) + res[ii] = MAX_INT; + else + res[ii] = datareg; + } +} + +template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + relu_max(data, res); +} + +// ************************************************* +// Sigmoid Activation +// ************************************************* +inline float sigmoid_fcn_float(float input) { return 1.0 / (1 + std::exp(-input)); } + +template void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default logistic sigmoid function: + // result = 1/(1+e^(-x)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = sigmoid_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " 
Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_sigmoid_table(sigmoid_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)sigmoid_table[index]; + } +} + +// ************************************************* +// Softmax Activation +// ************************************************* + +enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; + +inline float exp_fcn_float(float input) { return std::exp(input); } + +template inline float softmax_real_val_from_idx(unsigned i) { + // Treat the index as the top N bits + static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + data_T x(0); + x(x.width - 1, x.width - N) = i; + return (float)x; +} + +template inline unsigned softmax_idx_from_real_val(data_T x) { + // Slice the top N bits to get an index into the table + static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table + ap_uint y = x(x.width - 1, x.width - N); // slice the top N bits of input + return (unsigned)y(N - 1, 0); +} + +template +void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_size]) { + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + // Slicing bits for address is going to round towards 0, so take the central value + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); + table_out[i] = exp_x; + } +} + +template +void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]) { + // The template data_T is the data type used to address the table + for (unsigned i = 0; i < CONFIG_T::table_size; i++) { + float x = softmax_real_val_from_idx(i); + typename CONFIG_T::inv_table_t inv_x = 1.0 / x; + table_out[i] = inv_x; + } +} + +template +void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + //#pragma HLS array_partition variable=exp_res complete + typename 
CONFIG_T::exp_table_t exp_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + unsigned x = softmax_idx_from_real_val(data[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. + // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing + Op_add op_add; + exp_sum = + reduce>(exp_res, op_add); + + typename CONFIG_T::inv_table_t inv_exp_sum = + invert_table[softmax_idx_from_real_val(exp_sum)]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + res[i] = exp_res[i] * inv_exp_sum; + } +} + +template +void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS pipeline + // Initialize the lookup tables +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; + static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; + +#endif + if (!initialized) { + // Note we are exponentiating the inputs, which have type data_T + init_exp_table(exp_table); + // Note we are inverting the exponentials, which have type exp_table_t + init_invert_table(invert_table); + initialized = true; + } + + // Find the max and compute all delta(x_i, x_max) + Op_max op_max; + data_T x_max = reduce>(data, op_max); + + // For the diffs, use the same type as the input but force rounding and saturation + ap_fixed d_xi_xmax[CONFIG_T::n_in]; + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + d_xi_xmax[i] = data[i] - x_max; + } + + // Calculate all the e^x's + typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; + //#pragma HLS array_partition variable=exp_res complete + typename CONFIG_T::exp_table_t exp_sum(0); + for (unsigned i = 0; i < CONFIG_T::n_in; i++) { + //#pragma HLS unroll + unsigned x = softmax_idx_from_real_val(d_xi_xmax[i]); + exp_res[i] = exp_table[x]; + } + + // Explicitly sum the results with an adder tree. 
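+    // Illustration: for CONFIG_T::n_in = 4 the reduce call below expands to
+    //     op_add(op_add(exp_res[0], exp_res[1]), op_add(exp_res[2], exp_res[3]))
+    // i.e. a balanced tree of depth ceil(log2(n_in)) rather than a serial accumulation
+    // chain, keeping the adder latency logarithmic once the array is fully unrolled.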
+    // Rounding & saturation modes, which improve accuracy, prevent Vivado from expression balancing
+    Op_add<typename CONFIG_T::exp_table_t> op_add;
+    exp_sum =
+        reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
+
+    typename CONFIG_T::inv_table_t inv_exp_sum =
+        invert_table[softmax_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
+        //#pragma HLS unroll
+        res[i] = exp_res[i] * inv_exp_sum;
+    }
+}
+
+template <typename CONFIG_T, int N_TABLE> void init_exp_table_legacy(typename CONFIG_T::exp_table_t table_out[N_TABLE]) {
+    float exp_range = (float)CONFIG_T::exp_range;
+    for (int ii = 0; ii < N_TABLE; ii++) {
+        // First, convert from table index to X-value (range -exp_range to +exp_range)
+        float in_val = 2 * exp_range * (ii - float(N_TABLE) / 2.0) / float(N_TABLE);
+        // Next, compute lookup table function
+        typename CONFIG_T::exp_table_t real_val = exp_fcn_float(in_val);
+        // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl;
+        table_out[ii] = real_val;
+    }
+}
+
+template <typename CONFIG_T, int N_TABLE> void init_invert_table_legacy(typename CONFIG_T::inv_table_t table_out[N_TABLE]) {
+    float inv_range = (float)CONFIG_T::inv_range;
+    // Inversion function:
+    //   result = 1/x
+    for (int ii = 0; ii < N_TABLE; ii++) {
+        float in_val = inv_range * ii / float(N_TABLE);
+        if (in_val > 0.0)
+            table_out[ii] = 1.0 / in_val;
+        else
+            table_out[ii] = 0.0;
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+    //#pragma HLS pipeline
+    int exp_range = CONFIG_T::exp_range;
+    int inv_range = CONFIG_T::inv_range;
+    // Initialize the lookup table
+#ifdef __HLS_SYN__
+    bool initialized = false;
+    typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size];
+    typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size];
+#else
+    static bool initialized = false;
+    static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size];
+    static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size];
+#endif
+    if (!initialized) {
+        init_exp_table_legacy<CONFIG_T, CONFIG_T::table_size>(exp_table);
+        init_invert_table_legacy<CONFIG_T, CONFIG_T::table_size>(invert_table);
+        initialized = true;
+    }
+
+    // Index into the lookup table based on data for exponentials
+    typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision
+    typename CONFIG_T::exp_table_t exp_diff_res;            // different, independent, fixed point precision
+    typename CONFIG_T::exp_table_t data_cache[CONFIG_T::n_in];
+    int data_round;
+    int index;
+
+    // std::cout << "input to SM: " << std::endl;   /////
+    // nnet::print_result(data, std::cout);         /////
+    // std::cout << " " << std::endl;               /////
+
+    //#pragma HLS array_partition variable=data_cache complete
+
+    typename CONFIG_T::accum_t denominator;
+    typename CONFIG_T::inv_table_t deno_inver;
+
+    denominator = 0;
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        data_round = data[ii] * (CONFIG_T::table_size / (exp_range * 2));
+        // std::cout << " data, round: " << data[ii] << " " << data_round << std::endl;   /////
+        index = data_round + exp_range * (CONFIG_T::table_size / (exp_range * 2));
+        // std::cout << " index: " << index;   /////
+        if (index < 0)
+            index = 0;
+        if (index > CONFIG_T::table_size - 1)
+            index = CONFIG_T::table_size - 1;
+        denominator += exp_table[index];
+        // std::cout << " denominator " << index << std::endl;         /////
+        // std::cout << " denominator " << denominator << std::endl;   /////
+        data_cache[ii] = exp_table[index];
+    }
+    // std::cout << "end " << std::endl;   /////
+
+    // using lookup table for inverse
+    int exp_res_index = denominator * (CONFIG_T::table_size / inv_range);
+
+    // std::cout << " denominator: 
" << denominator << std::endl; ///// + // std::cout << " table_size: " << CONFIG_T::table_size << std::endl; ///// + // std::cout << " inv_range: " << inv_range << std::endl; ///// + // std::cout << " exp_res_index: " << exp_res_index << std::endl; ///// + if (exp_res_index < 0) + exp_res_index = 0; + if (exp_res_index > CONFIG_T::table_size - 1) + exp_res_index = CONFIG_T::table_size - 1; + deno_inver = invert_table[exp_res_index]; + // std::cout << " deno_inver: " << deno_inver << std::endl; ///// + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + res[ii] = (res_T)(data_cache[ii] * deno_inver); + } + + // std::cout << "out SM: " << std::endl; + // nnet::print_result(res, std::cout); + // std::cout << " " << std::endl; +} + +template +void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS inline + switch (CONFIG_T::implementation) { + case softmax_implementation::latency: + softmax_latency(data, res); + break; + case softmax_implementation::stable: + softmax_stable(data, res); + break; + case softmax_implementation::legacy: + softmax_legacy(data, res); + break; + case softmax_implementation::argmax: + softmax_argmax(data, res); + break; + } +} + +// ************************************************* +// TanH Activation +// ************************************************* +template void init_tanh_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Implement tanh lookup + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -4 to +4) + float in_val = 2 * 4.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = tanh(in_val); + // std::cout << "Tanh: Lookup table Index: " << ii<< " In Value: " << in_val << " Result: " << real_val << + // std::endl; + table_out[ii] = real_val; + } +} + +template void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_tanh_table(tanh_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 8; + index = data_round + 4 * CONFIG_T::table_size / 8; + // std::cout << "Input: " << data[ii] << " Round: " << data_round << " Index: " << index << std::endl; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)tanh_table[index]; + } +} + +// ************************************************* +// UnaryLUT Activation +// ************************************************* +template inline unsigned get_index_unary_lut(data_T x) { + // Slice the top N bits to get an index into the table + static constexpr int N = ceillog2(table_size); + return (unsigned)(x(x.width - 1, 0)); +} + +template +void unary_lut(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], + typename CONFIG_T::table_t table[CONFIG_T::table_size]) { + //#pragma HLS function_instantiate variable=table + //#pragma HLS ARRAY_PARTITION variable=table + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + //#pragma HLS UNROLL + unsigned index = get_index_unary_lut(data[ii]); + res[ii] = 
(res_T)table[index]; + } +} + +// ************************************************* +// Hard sigmoid Activation +// ************************************************* +template +void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (datareg > 1) + datareg = 1; + else if (datareg < 0) + datareg = 0; + res[ii] = datareg; + } +} + +template +void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + if (CONFIG_T::io_type == io_parallel) { + //#pragma HLS PIPELINE + } + + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; + if (sigmoid > 1) + sigmoid = 1; + else if (sigmoid < 0) + sigmoid = 0; + res[ii] = 2 * sigmoid - 1; + } +} + +// ************************************************* +// Leaky RELU Activation +// ************************************************* +template +void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha * datareg; + } +} + +// ************************************************* +// Thresholded RELU Activation +// ************************************************* +template +void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > theta) + res[ii] = datareg; + else + res[ii] = 0; + } +} + +// ************************************************* +// Softplus Activation +// ************************************************* +inline float softplus_fcn_float(float input) { return std::log(std::exp(input) + 1.); } + +template void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default softplus function: + // result = log(exp(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softplus_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softplus_table(softplus_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softplus_table[index]; + } +} + +// ************************************************* +// Softsign Activation +// ************************************************* +inline float 
softsign_fcn_float(float input) { return input / (std::abs(input) + 1.); } + +template void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default softsign function: + // result = x / (abs(x) + 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to +8) + float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = softsign_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_softsign_table(softsign_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + // Index into the lookup table based on data + int data_round; + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + data_round = data[ii] * CONFIG_T::table_size / 16; + index = data_round + 8 * CONFIG_T::table_size / 16; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = (res_T)softsign_table[index]; + } +} + +// ************************************************* +// ELU Activation +// ************************************************* +inline float elu_fcn_float(float input) { return std::exp(input) - 1.; } + +template void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { + // Default ELU function: + // result = alpha * (e^(x) - 1) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = elu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template +void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_elu_table(elu_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + data_T datareg; + // Index into the lookup table based on data + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = datareg; + } else { + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = alpha * elu_table[index]; + } + } +} + +template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + elu(data, 1.0, res); +} + +// ************************************************* +// SELU Activation +// ************************************************* +inline float selu_fcn_float(float input) { + return 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (std::exp(input) - 1.)); +} + +template void init_selu_table(typename CONFIG_T::table_t 
table_out[N_TABLE]) { + // Default SELU function: + // result = 1.05 * (1.673 * (e^(x) - 1)) + for (int ii = 0; ii < N_TABLE; ii++) { + // First, convert from table index to X-value (signed 8-bit, range -8 to 0) + float in_val = -8.0 * ii / float(N_TABLE); + // Next, compute lookup table function + typename CONFIG_T::table_t real_val = selu_fcn_float(in_val); + // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; + table_out[ii] = real_val; + } +} + +template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + // Initialize the lookup table +#ifdef __HLS_SYN__ + bool initialized = false; + typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#else + static bool initialized = false; + static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; +#endif + if (!initialized) { + init_selu_table(selu_table); + initialized = true; + } + + //#pragma HLS PIPELINE + + data_T datareg; + // Index into the lookup table based on data + int index; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg >= 0) { + res[ii] = res_T(1.0507009873554804934193349852946) * datareg; + } else { + index = datareg * CONFIG_T::table_size / -8; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; + res[ii] = selu_table[index]; + } + } +} + +// ************************************************* +// PReLU Activation +// ************************************************* +template +void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + res[ii] = datareg; + else + res[ii] = alpha[ii] * datareg; + } +} + +// ************************************************* +// Binary TanH Activation +// ************************************************* +template +void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + if (datareg > 0) + cache = 1; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +// ************************************************* +// Ternary TanH Activation +// ************************************************* +template +void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + //#pragma HLS PIPELINE + + data_T datareg; + res_T cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = 2 * data[ii]; + if (datareg > 1) + cache = 1; + else if (datareg > -1 && datareg <= 1) + cache = 0; + else + cache = -1; + + res[ii] = (res_T)cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_array.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_array.h.bak new file mode 100644 index 0000000000..843f303057 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_array.h.bak @@ -0,0 +1,52 @@ +#ifndef NNET_ARRAY_H_ +#define NNET_ARRAY_H_ + +#include + +namespace nnet { + +struct transpose_config { + static const unsigned height = 10; + static const unsigned width = 10; + static const unsigned depth = 10; + static constexpr unsigned perm[3] = {2, 0, 1}; +}; + +template +void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[CONFIG_T::height * CONFIG_T::width]) { + //#pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::height; i++) { + for (int j = 0; j < CONFIG_T::width; j++) { + 
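+            // Illustration: element (i, j) of the row-major input, at offset i * width + j,
+            // lands at (j, i) in the output, at offset j * height + i. For a 2x3 input,
+            // data[1 * 3 + 2] (row 1, col 2) moves to data_t[2 * 2 + 1] in the 3x2 result.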
data_t[j * CONFIG_T::height + i] = data[i * CONFIG_T::width + j]; + } + } +} + +template +void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], + res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) { + unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; + unsigned dims_t[3]; + dims_t[0] = dims[CONFIG_T::perm[0]]; + dims_t[1] = dims[CONFIG_T::perm[1]]; + dims_t[2] = dims[CONFIG_T::perm[2]]; + + int idx[3] = {0}, idx_t[3] = {0}; + for (idx[0] = 0; idx[0] < dims[0]; idx[0]++) { + for (idx[1] = 0; idx[1] < dims[1]; idx[1]++) { + for (idx[2] = 0; idx[2] < dims[2]; idx[2]++) { + idx_t[0] = idx[CONFIG_T::perm[0]]; + idx_t[1] = idx[CONFIG_T::perm[1]]; + idx_t[2] = idx[CONFIG_T::perm[2]]; + + data_t[idx_t[0] * dims_t[1] * dims_t[2] + idx_t[1] * dims_t[2] + idx_t[2]] = + data[idx[0] * dims[1] * dims[2] + idx[1] * dims[2] + idx[2]]; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h.bak new file mode 100644 index 0000000000..a4e4441311 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h.bak @@ -0,0 +1,124 @@ +#ifndef NNET_BATCHNORM_H_ +#define NNET_BATCHNORM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +struct batchnorm_config { + // Internal data type definitions + typedef float bias_t; + typedef float scale_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? 
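+    // Illustration (hypothetical names and values): a config generated by hls4ml would
+    // override these defaults along the lines of
+    //     struct config2 : nnet::batchnorm_config {
+    //         static const unsigned n_in = 64;
+    //         static const unsigned n_scale_bias = 64;
+    //         typedef ap_fixed<16, 6> scale_t;
+    //         typedef ap_fixed<16, 6> bias_t;
+    //     };
+    //     nnet::normalize<input_t, layer2_t, config2>(layer1_out, layer2_out, s2, b2);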
+    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
+               typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
+               typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) {
+    data_T cache;
+
+    // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
+    //#pragma HLS function_instantiate variable=scale,bias
+
+    // For parallel inputs:
+    //   - completely partition arrays -- target fabric
+    //   - if we have an unroll factor, limit number of multipliers
+    //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+
+    // //#pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes
+    //#pragma HLS ARRAY_PARTITION variable=scale complete
+    //#pragma HLS ARRAY_PARTITION variable=bias complete
+
+    //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit
+
+// Calculate result
+Result:
+    for (int ires = 0; ires < CONFIG_T::n_in; ires++) {
+        if (CONFIG_T::n_filt == -1) {
+            res[ires] =
+                CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[ires]) +
+                bias[ires];
+        } else {
+            int norm_index = ires % CONFIG_T::n_filt;
+            res[ires] =
+                CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[norm_index]) +
+                bias[norm_index];
+        }
+    }
+}
+
+// ****************************************************
+// Merged Batch Normalization and Quantized Tanh
+// ****************************************************
+struct batchnorm_quantized_tanh_config {
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const unsigned n_zeros = 0;
+};
+
+template <class data_T, typename CONFIG_T>
+void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ap_uint<1> res[CONFIG_T::n_in],
+                           data_T threshold[CONFIG_T::n_scale_bias]) {
+    //#pragma HLS PIPELINE
+    //#pragma HLS ARRAY_PARTITION variable=res complete
+
+    data_T datareg;
+    ap_uint<1> cache;
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        datareg = data[ii];
+        int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt;
+        if (datareg >= threshold[norm_index])
+            cache = 1;
+        else
+            cache = 0;
+
+        res[ii] = cache;
+    }
+}
+
+template <class data_T, typename CONFIG_T>
+void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ap_int<2> res[CONFIG_T::n_in],
+                            data_T threshold_hi[CONFIG_T::n_scale_bias], data_T threshold_lo[CONFIG_T::n_scale_bias]) {
+    //#pragma HLS PIPELINE
+    //#pragma HLS ARRAY_PARTITION variable=res complete
+
+    data_T datareg;
+    ap_int<2> cache;
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        datareg = data[ii];
+        int norm_index = CONFIG_T::n_filt == -1 ? 
ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h.bak new file mode 100644 index 0000000000..21514e3c79 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h.bak @@ -0,0 +1,123 @@ +#ifndef NNET_BATCHNORM_STREAM_H_ +#define NNET_BATCHNORM_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include "nnet_types.h" + +namespace nnet { + +// **************************************************** +// Streaming Batch Normalization +// **************************************************** + +template +void normalize(hls::stream &data, hls::stream &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + //#pragma HLS ARRAY_PARTITION variable=scale complete + //#pragma HLS ARRAY_PARTITION variable=bias complete + + constexpr unsigned ii = CONFIG_T::n_in / CONFIG_T::multiplier_limit; + //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +BatchNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE II=ii + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormpack: + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + out_data[j] = CONFIG_T::template product::product( + in_data[j], scale[norm_index]) + + bias[norm_index]; + } + + res.write(out_data); + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +template +void normalize_binary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, + typename data_T::value_type threshold[CONFIG_T::n_scale_bias]) { + //#pragma HLS ARRAY_PARTITION variable=threshold complete + +BinaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + out_data[j] = (in_data[j] >= threshold[norm_index]) ? 
1 : 0; + } + + res.write(out_data); + } +} + +template +void normalize_ternary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, + typename data_T::value_type threshold_hi[CONFIG_T::n_scale_bias], + typename data_T::value_type threshold_lo[CONFIG_T::n_scale_bias]) { + //#pragma HLS ARRAY_PARTITION variable=threshold_hi complete + //#pragma HLS ARRAY_PARTITION variable=threshold_lo complete + +TernaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + //#pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + //#pragma HLS UNROLL + + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + + if (in_data[j] > threshold_hi[norm_index]) { + out_data[j] = 1; + } else if (in_data[j] <= threshold_lo[norm_index]) { + out_data[j] = -1; + } else { + out_data[j] = 0; + } + } + + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_common.h.bak new file mode 100644 index 0000000000..7a65548bed --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h.bak @@ -0,0 +1,75 @@ +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + +#include "ap_fixed.h" + +// This is a substitute for "ceil(n/(float)d)". +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? d : n) +#define MAX(n, d) (n > d ? n : d) + +#define STRINGIFY(x) #x +#define EXPAND_STRING(x) STRINGIFY(x) + +#ifndef __VITIS_HLS__ +#define DATA_PACK_TXT HLS DATA_PACK variable = +#define DATA_PACK_PRAGMA(variable) DATA_PACK_TXT variable +#define PRAGMA_DATA_PACK(variable) _Pragma(EXPAND_STRING(DATA_PACK_PRAGMA(variable))) +#else +#define PRAGMA_DATA_PACK(variable) +#endif + +namespace nnet { + +// Common type definitions +enum io_type { io_parallel = 0, io_stream }; +enum strategy { latency, resource }; + +/* --- + * Balanced tree reduce implementation. + * For use in scenarios where Vivado cannot expression balance + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ +template T reduce(const T *x, Op op) { + static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + if (N == 1) { + return x[0]; + } + if (N == 2) { + return op(x[0], x[1]); + } + return op(reduce(x, op), reduce(x + leftN, op)); +} + +template class Op_add { + public: + T operator()(T a, T b) { return a + b; } +}; + +template class Op_and { + public: + T operator()(T a, T b) { return a && b; } +}; + +template class Op_or { + public: + T operator()(T a, T b) { return a || b; } +}; + +template class Op_max { + public: + T operator()(T a, T b) { return a >= b ? a : b; } +}; + +template class Op_min { + public: + T operator()(T a, T b) { return a <= b ? 
a : b; } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h.bak new file mode 100644 index 0000000000..8ee579ccf2 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h.bak @@ -0,0 +1,66 @@ +#ifndef NNET_CONV1D_H_ +#define NNET_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d_latency.h" +#include "nnet_conv1d_resource.h" +#include + +namespace nnet { + +struct conv1d_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Convolutional parameters + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_width = 10; + static const unsigned n_chan = 0; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + static const unsigned out_width = 10; //(N_IN + PAD_LEFT * PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1 + + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; // not used yet +}; + +template +void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE region + + if (CONFIG_T::strategy == nnet::latency) { + conv_1d_latency_cl(data, res, weights, biases); + } else { + conv_1d_resource_cl(data, res, weights, biases); + } +} + +template +void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + //#pragma HLS INLINE region + + // Nothing special to be done for io_parallel implementation + if (CONFIG_T::strategy == nnet::latency) { + conv_1d_latency_cl(data, res, weights, biases); + } else { + conv_1d_resource_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h.bak new file mode 100644 index 0000000000..3ec7605df1 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h.bak @@ -0,0 +1,89 @@ +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" + +namespace nnet { + +template +void compute_scaled_indices_1d(const unsigned w_idx, ap_uint *pixel_idx) { + unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); + +ComputeIndex: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + //#pragma HLS UNROLL + unsigned sw_idx = + CONFIG_T::template scale_index::scale_index( + wp_idx + p); + pixel_idx[p] = CONFIG_T::pixels[sw_idx]; + } +} + +template +void conv_1d_encoded_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + hls::stream data_window[CONFIG_T::filt_width * 
CONFIG_T::n_chan]; + const int win_depth = CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + //#pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + //#pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; + //#pragma HLS ARRAY_PARTITION variable=pixel_idx complete + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_1d(i_iw, pixel_idx); + compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, + biases, pixel_idx); + } +} + +template +void conv_1d_buffer_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + //#pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_output_buffer_1d(data.read(), res, weights, biases); + } +} + +template +void conv_1d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + conv_1d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + conv_1d_encoded_cl(data, res, weights, biases); + break; + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h.bak new file mode 100644 index 0000000000..5291fad408 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h.bak @@ -0,0 +1,75 @@ +#ifndef NNET_CONV2D_H_ +#define NNET_CONV2D_H_ + +#include "nnet_common.h" +#include "nnet_conv2d_latency.h" +#include "nnet_conv2d_resource.h" +#include + +namespace nnet { + +struct conv2d_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Convolutional parameters + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_chan = 1; + static const unsigned filt_height = 1; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_height * filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_height = 1; + static const unsigned stride_width = 1; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + static const unsigned dilation_height = 1; + static const unsigned dilation_width = 1; + + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; // not used yet +}; + +template +void conv_2d_cl( + data_T 
data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + //#pragma HLS INLINE region + + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_latency_cl(data, res, weights, biases); + } else { + conv_2d_resource_cl(data, res, weights, biases); + } +} + +template +void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + //#pragma HLS INLINE region + + // Nothing special to be done for io_parallel implementation + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_latency_cl(data, res, weights, biases); + } else { + conv_2d_resource_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h.bak new file mode 100644 index 0000000000..b1af08a080 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h.bak @@ -0,0 +1,89 @@ +#ifndef NNET_CONV2D_LATENCY_H_ +#define NNET_CONV2D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void conv_2d_latency_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + //#pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out]; + //#pragma HLS ARRAY_PARTITION variable=mult complete + + typename CONFIG_T::accum_t acc[mult_n_out]; + //#pragma HLS ARRAY_PARTITION variable=acc complete + + //#pragma HLS ARRAY_PARTITION variable=weights complete + //#pragma HLS ARRAY_PARTITION variable=biases complete + + // Limit multipliers to control parallelization + //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +PartitionLoop: + for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + //#pragma HLS UNROLL + + data_T cache; + + // Do the matrix-multiply + Product1: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + //#pragma HLS UNROLL + cache = data_buf[i_pxl][i_in]; + Product2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + //#pragma HLS UNROLL + mult[i_in * mult_n_out + i_out] = + CONFIG_T::mult_config::template product::product( + cache, weights[i_in * mult_n_out + i_out]); + } + } + + // Initialize accumulator with input biases + ResetAccum: + for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { + 
//#pragma HLS UNROLL + acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + + // Accumulate multiplication result + Accum1: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + //#pragma HLS UNROLL + Accum2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + //#pragma HLS UNROLL + acc[i_out] += mult[i_in * mult_n_out + i_out]; + } + } + + // Cast to "res_t" type + Result: + for (int i_res = 0; i_res < mult_n_out; i_res++) { + //#pragma HLS UNROLL + *(res++) = cast(acc[i_res]); + } + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h.bak new file mode 100644 index 0000000000..ee723f74e9 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h.bak @@ -0,0 +1,60 @@ +#ifndef NNET_DENSE_H_ +#define NNET_DENSE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense_latency.h" +#include "nnet_dense_resource.h" +#include "nnet_dense_seq.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned strategy = latency; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? + // Product function to use + template using product = nnet::product::mult; +}; + +template +void dense(data_T data[CONFIG_T::n_in * CONFIG_T::seq_len], res_T res[CONFIG_T::n_out * CONFIG_T::seq_len], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + //#pragma HLS inline + if (CONFIG_T::seq_len > 1) { + dense_seq(data, res, weights, biases); + } else { + if (CONFIG_T::strategy == nnet::latency) { + dense_latency(data, res, weights, biases); + } else { + dense_resource(data, res, weights, biases); + } + } + + // std::cout << "out Dense: " << std::endl; + // for(int i=0; i < CONFIG_T::n_out*CONFIG_T::seq_len; ++i) { + // std::cout << res[i] << " "; + // } + // std::cout << std::endl; +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h.bak new file mode 100644 index 0000000000..02e56e532b --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h.bak @@ -0,0 +1,90 @@ +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +template +void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], + typename CONFIG_T::accum_t weight) { + for (unsigned k = 0; k < CONFIG_T::n_out; k++) { + //#pragma HLS UNROLL + if (k == index) + mult[k] += weight; + } +} + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor); + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=acc 
complete + //#pragma HLS ARRAY_PARTITION variable=biases complete + //#pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit + +#ifdef __VITIS_HLS__ + //#pragma HLS AGGREGATE variable=weights +#else + //#pragma HLS data_pack variable=weights struct_level +#endif + +InitAccum: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + //#pragma HLS UNROLL + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + // Do the compressed matrix-multiply + const int rufactor = CONFIG_T::reuse_factor; +ReuseLoop: + for (unsigned ir = 0; ir < rufactor; ir++) { + //#pragma HLS PIPELINE II=1 rewind + + typename CONFIG_T::accum_t mult[CONFIG_T::n_out]; + //#pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < CONFIG_T::n_out; imult++) { + //#pragma HLS UNROLL + mult[imult] = 0; + } + + CompressedMultLoop: + for (unsigned im = 0; im < multiplier_limit; im++) { + //#pragma HLS UNROLL + unsigned w = im * rufactor + ir; + auto row = weights[w].row_index; + auto col = weights[w].col_index; + auto weight_cache = weights[w].weight; + data_T data_cache = data[row]; + // mult[col] += weight_cache * data_cache; + typename CONFIG_T::accum_t prod = + CONFIG_T::template product::product(data_cache, weight_cache); + fill_mult(col, mult, prod); + } + + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += mult[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + //#pragma HLS UNROLL + // res[i] = (res_T) (acc[i]); + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h.bak new file mode 100644 index 0000000000..81c137e54e --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h.bak @@ -0,0 +1,72 @@ +#ifndef NNET_DENSE_LATENCY_H_ +#define NNET_DENSE_LATENCY_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + data_T cache; + typename CONFIG_T::accum_t mult[CONFIG_T::n_in * CONFIG_T::n_out]; + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + //#pragma HLS function_instantiate variable=weights,biases + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + //#pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + //#pragma HLS ARRAY_PARTITION variable=biases complete + //#pragma HLS ARRAY_PARTITION variable=mult complete + //#pragma HLS ARRAY_PARTITION variable=acc complete + + //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +// Do the matrix-multiply +Product1: + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + cache = data[ii]; + Product2: + for (int jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + mult[index] = CONFIG_T::template product::product(cache, weights[index]); + } + } + +// Initialize accumulator with input biases +ResetAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; 
iacc++) {
+        acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc];
+    }
+
+// Accumulate multiplication result
+Accum1:
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+    Accum2:
+        for (int jj = 0; jj < CONFIG_T::n_out; jj++) {
+            int index = ii * CONFIG_T::n_out + jj;
+            acc[jj] += mult[index];
+        }
+    }
+
+// Cast to "res_t" type
+Result:
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        // res[ires] = (res_T) (acc[ires]);
+        res[ires] = cast<data_T, res_T, CONFIG_T>(acc[ires]);
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h.bak
new file mode 100644
index 0000000000..17ef1930fa
--- /dev/null
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h.bak
@@ -0,0 +1,263 @@
+#ifndef NNET_DENSE_RESOURCE_H_
+#define NNET_DENSE_RESOURCE_H_
+
+#include "hls_stream.h"
+#include "nnet_common.h"
+#include "nnet_mult.h"
+#include <assert.h>
+#include <math.h>
+
+namespace nnet {
+
+template <class data_T, class res_T, typename CONFIG_T>
+void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
+                               typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
+                               typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
+
+    const int rufactor = CONFIG_T::reuse_factor;
+    const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor);
+    const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor);
+    const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);
+    const int multscale = multiplier_limit / CONFIG_T::n_out;
+    const int nin = CONFIG_T::n_in;
+    const int nout = CONFIG_T::n_out;
+
+    assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed");
+    assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN");
+
+    //#pragma HLS function_instantiate variable=weights,biases
+    ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation; HLS seems to choose correctly
+    //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor
+    //#pragma HLS ARRAY_PARTITION variable=biases complete
+
+    typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
+    //#pragma HLS ARRAY_PARTITION variable=acc complete
+
+InitAccum:
+    for (int iacc = 0; iacc < nout; iacc++) {
+        //#pragma HLS UNROLL
+        acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc];
+    }
+
+ReuseLoop:
+    for (int ir = 0; ir < rufactor; ir++) {
+        //#pragma HLS PIPELINE II=1 rewind
+
+        int w_index = ir;
+        int in_index = ir;
+        int out_index = 0;
+        int acc_step = 0;
+
+    MultLoop:
+        for (int im = 0; im < block_factor; im++) {
+            //#pragma HLS UNROLL
+
+            acc[out_index] += static_cast<typename CONFIG_T::accum_t>(
+                CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(data[in_index], weights[w_index]));
+
+            // Increment w_index
+            w_index += rufactor;
+            // Increment in_index
+            in_index += rufactor;
+            if (in_index >= nin) {
+                in_index = ir;
+            }
+            // Increment out_index
+            if (acc_step + 1 >= multscale) {
+                acc_step = 0;
+                out_index++;
+            } else {
+                acc_step++;
+            }
+        }
+    }
+
+// Cast to "res_t" type
+Result:
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        //#pragma HLS UNROLL
+        res[ires] = cast<data_T, res_T, CONFIG_T>(acc[ires]);
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
+                                   typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
+                                   typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
+
+    const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out);
+    const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor);
+    const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor);
+    const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);
+    const int multscale = multiplier_limit / CONFIG_T::n_out;
+    const int nin = CONFIG_T::n_in;
+    const int nout = CONFIG_T::n_out;
+
+    assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed");
+    assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0");
+
+    //#pragma HLS function_instantiate variable=weights,biases
+    ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation; HLS seems to choose correctly
+    //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor
+    //#pragma HLS ARRAY_PARTITION variable=biases complete
+
+    typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
+    //#pragma HLS ARRAY_PARTITION variable=acc complete
+
+InitAccum:
+    for (int iacc = 0; iacc < nout; iacc++) {
+        //#pragma HLS UNROLL
+        acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc];
+    }
+
+    int w_index;
+    int in_index = 0;
+    int out_index;
+    int outstep = 0;
+    const int outscale = rufactor / nin;
+
+    int outidx[rufactor];
+IndexLoop:
+    for (int ir = 0; ir < rufactor; ir++) {
+        outidx[ir] = outstep;
+        if ((ir + 1) % nin == 0) {
+            outstep++;
+        }
+    }
+
+ReuseLoop:
+    for (int ir = 0; ir < rufactor; ir++) {
+        //#pragma HLS PIPELINE II=1 rewind
+
+        w_index = ir;
+        out_index = outidx[ir] /*outstep*/;
+
+    MultLoop:
+        for (int im = 0; im < block_factor; im++) {
+            //#pragma HLS UNROLL
+            acc[out_index] += static_cast<typename CONFIG_T::accum_t>(
+                CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(data[in_index], weights[w_index]));
+
+            w_index += rufactor;
+            if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out)
+                break; // check out of bounds
+            out_index += outscale;
+        }
+
+        in_index++;
+        if (in_index >= nin) {
+            in_index = 0;
+            // outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround.
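+            // Reading the precomputed outidx[] table above keeps this pipelined
+            // loop free of a loop-carried outstep update.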
+        }
+    }
+
+// Cast to "res_t" type
+Result:
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        //#pragma HLS UNROLL
+        res[ires] = cast<data_T, res_T, CONFIG_T>(acc[ires]);
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
+                              typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
+                              typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
+
+    const int rufactor = CONFIG_T::reuse_factor;
+    const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor);
+    const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor);
+    const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor);
+    const int multscale = multiplier_limit / CONFIG_T::n_out;
+    const int nin = CONFIG_T::n_in;
+    const int nout = CONFIG_T::n_out;
+
+    assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed");
+    assert((rufactor > nin) && "This function is correct only for RF > N_IN");
+
+    //#pragma HLS function_instantiate variable=weights,biases
+    ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation; HLS seems to choose correctly
+    //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor
+    //#pragma HLS ARRAY_PARTITION variable=biases complete
+
+    typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
+    //#pragma HLS ARRAY_PARTITION variable=acc complete
+
+InitAccum:
+    for (int iacc = 0; iacc < nout; iacc++) {
+        //#pragma HLS UNROLL
+        acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc];
+    }
+
+ReuseLoop:
+    for (int ir = 0; ir < rufactor; ir++) {
+        //#pragma HLS PIPELINE II=1 rewind
+        typename CONFIG_T::accum_t tmpmult[block_factor];
+        //#pragma HLS ARRAY_PARTITION variable=tmpmult complete
+
+    MultLoop:
+        for (int im = 0; im < block_factor; im++) {
+            //#pragma HLS UNROLL
+            int w_index = ir + rufactor * im;
+            int in_index = w_index % nin;
+            if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out)
+                continue; // check out of bounds
+            tmpmult[im] =
+                CONFIG_T::template product<data_T, typename CONFIG_T::weight_t>::product(data[in_index], weights[w_index]);
+        }
+
+        typename CONFIG_T::accum_t mult[multiplier_limit];
+        //#pragma HLS ARRAY_PARTITION variable=mult complete
+
+    ResetMult:
+        for (int imult = 0; imult < multiplier_limit; imult++) {
+            //#pragma HLS UNROLL
+            mult[imult] = 0;
+        }
+
+    AccumLoop1:
+        for (int im = 0; im < block_factor; im++) {
+            //#pragma HLS UNROLL
+            int w_index = ir + rufactor * im;
+            int out_index = w_index / multfactor;
+            if (out_index >= multiplier_limit)
+                continue; // check out of bounds
+            mult[out_index] += tmpmult[im];
+        }
+
+    AccumLoop2:
+        for (int im = 0; im < multiplier_limit; im++) {
+            //#pragma HLS UNROLL
+            // int out_index = im/multscale; // This is the general case
+            // acc[out_index] += mult[im];
+            acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out
+        }
+    }
+
+// Cast to "res_t" type
+Result:
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        //#pragma HLS UNROLL
+        res[ires] = cast<data_T, res_T, CONFIG_T>(acc[ires]);
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out],
+                    typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out],
+                    typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) {
+
+    //#pragma HLS INLINE recursive
+
+    if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
+        dense_resource_rf_leq_nin<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) {
+        dense_resource_rf_gt_nin_rem0<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+    } else {
+        dense_resource_rf_gt_nin<data_T, res_T, CONFIG_T>(data, res,
weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h.bak new file mode 100644 index 0000000000..53b9ec480b --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h.bak @@ -0,0 +1,44 @@ +#ifndef NNET_DENSE_SEQ_H_ +#define NNET_DENSE_SEQ_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void dense_seq(data_T data[CONFIG_T::n_in * CONFIG_T::seq_len], res_T res[CONFIG_T::n_out * CONFIG_T::seq_len], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + //#pragma HLS inline + + data_T in_val[CONFIG_T::n_in]; + //#pragma HLS ARRAY_PARTITION variable=in_val complete + + if (CONFIG_T::strategy == nnet::latency) { + for (int j = 0; j < CONFIG_T::seq_len; ++j) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + for (int i = 0; i < CONFIG_T::n_in; ++i) { + //#pragma HLS UNROLL + in_val[i] = data[j * CONFIG_T::n_in + i]; + } + dense_latency(in_val, res + (CONFIG_T::n_out * j), weights, biases); + } + } else { + for (int j = 0; j < CONFIG_T::seq_len; ++j) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + for (int i = 0; i < CONFIG_T::n_in; ++i) { + //#pragma HLS UNROLL + in_val[i] = data[j * CONFIG_T::n_in + i]; + } + dense_resource(in_val, res + (CONFIG_T::n_out * j), weights, biases); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h.bak new file mode 100644 index 0000000000..adcbad6afb --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h.bak @@ -0,0 +1,816 @@ +#ifndef NNET_GARNET_H_ +#define NNET_GARNET_H_ + +#include "hls_math.h" +#include "hls_stream.h" +#include "nnet_common.h" + +namespace nnet { +namespace garnet_utils { + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ap_uint index_t; + + unsigned const table_size = (1 << CONFIG_T::distance_width); + + index_t index; + typename CONFIG_T::distance_t distance; + + // edge_weight_t is ap_ufixed with 0 iwidth -> let index 0 be a saturated version of 1 + edge_weights_table[0] = ap_ufixed(1.); + + for (unsigned iw = 1; iw < table_size; ++iw) { + index = iw; + distance.range(CONFIG_T::distance_width - 1, 0) = index.range(CONFIG_T::distance_width - 1, 0); + edge_weights_table[iw] = hls::exp(-distance * distance); + } +} + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. 
/ table_size; + + typename CONFIG_T::distance_t v = -32.; + for (unsigned iw = 0; iw < table_size; ++iw) { + edge_weights_table[iw] = std::exp(-v * v); + v += step; + } +} + +template +inline typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type +get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ap_uint index_t; + + index_t index(distance.range(CONFIG_T::distance_width - 1, 0)); + + return edge_weights_table[index]; +} + +template +inline + typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type + get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. / table_size; + + int index = (distance + 32.) / step; + if (index < 0) + index = 0; + else if (index >= table_size) + index = table_size - 1; + + return edge_weights_table[index]; +} + +template typename CONFIG_T::edge_weight_t compute_edge_weight(typename CONFIG_T::distance_t distance) { + if (CONFIG_T::is_stack) { + //#pragma HLS INLINE OFF + } +#ifdef __SYNTHESIS__ + typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + // unsigned const reshape_factor = CONFIG_T::n_aggregators * CONFIG_T::n_in_features * (CONFIG_T::n_vertices / + // CONFIG_T::reuse_factor); + // //#pragma HLS ARRAY_RESHAPE variable=edge_weights_table cyclic factor=reshape_factor dim=1 + bool initialized = false; +#else + static typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + static bool initialized = false; +#endif + if (not initialized) { + initialize_edge_weights_table(edge_weights_table); + initialized = true; + } + + return get_edge_weight(distance, edge_weights_table); +} + +template +inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + //#pragma HLS INLINE + return dividend >> exponent; +} + +template +inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + //#pragma HLS INLINE + return dividend / std::pow(2., exponent); +} + +template struct Means { + typedef E edge_weight_t; + + edge_weight_t edge_weight_mean[CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t weighted_feature_mean[CONFIG_T::n_aggregators * CONFIG_T::n_in_features]; + + Means() { + //#pragma HLS INLINE + //#pragma HLS ARRAY_PARTITION variable=edge_weight_mean complete + //#pragma HLS ARRAY_PARTITION variable=weighted_feature_mean complete + //#pragma HLS UNROLL region + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] = 0.; + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + weighted_feature_mean[iax] = 0.; + } + } + } + + void set_weight(unsigned, edge_weight_t const &) { + //#pragma HLS INLINE + } + + void add_means_normalized(Means const &local) { + //#pragma HLS INLINE + // Always called within a pipelined region - no UNROLL needed + + unsigned const log2_unroll_factor = CONFIG_T::n_vertices_width - CONFIG_T::log2_reuse_factor; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] += normalize_log2(local.edge_weight_mean[ia], log2_unroll_factor); + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * 
CONFIG_T::n_in_features + ix; + weighted_feature_mean[iax] += normalize_log2(local.weighted_feature_mean[iax], log2_unroll_factor); + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + //#pragma HLS INLINE + //#pragma HLS UNROLL region + + // accum comes divided by unroll factor + typename T::norm_t nvtx_norm = (T::n_vertices / T::reuse_factor) / nvtx; + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + edge_weight_mean[ia] = accum.edge_weight_mean[ia] * nvtx_norm; + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = accum.weighted_feature_mean[iax] * nvtx_norm; + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + //#pragma HLS INLINE + //#pragma HLS UNROLL region + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + + edge_weight_mean[ia] = normalize_log2(accum.edge_weight_mean[ia], T::log2_reuse_factor); + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = normalize_log2(accum.weighted_feature_mean[iax], T::log2_reuse_factor); + } + } + } +}; + +template struct WeightsAndMeans : public Means { + typedef E edge_weight_t; + + edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + + WeightsAndMeans() : Means() { + //#pragma HLS INLINE + unsigned const reshape_factor = CONFIG_T::n_aggregators * (CONFIG_T::n_vertices / CONFIG_T::reuse_factor); + //#pragma HLS ARRAY_PARTITION variable=edge_weights cyclic factor=reshape_factor + } + + void set_weight(unsigned iva, edge_weight_t const &weight) { + //#pragma HLS INLINE + edge_weights[iva] = weight; + } +}; + +template struct OutputBiasNormalizer; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t const (&output_biases)[CONFIG_T::n_out_features]; + + OutputBiasNormalizer(nvtx_T const) : output_biases{CONFIG_T::output_transform_biases} { + //#pragma HLS INLINE + } +}; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t output_biases[CONFIG_T::n_out_features]; + + OutputBiasNormalizer(nvtx_T const nvtx) { + //#pragma HLS ARRAY_PARTITION variable=output_biases complete + //#pragma HLS UNROLL region + + // Cannot add a loop label here due to a Vivado HLS bug, apparently + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t bias = CONFIG_T::output_transform_biases[io]; + bias *= nvtx; + output_biases[io] = normalize_log2(bias, CONFIG_T::n_vertices_width); + } + } +}; + +template struct InputDataGetter { + typedef data_T data_t; + + data_T const *dataref; + + InputDataGetter(data_T const *d) : dataref{d} { + //#pragma HLS INLINE + } + data_T const &get(unsigned iv, unsigned ix) const { + //#pragma HLS INLINE + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + return dataref[ivx]; + } +}; + +template struct SingleVertexDataGetter { + typedef data_T data_t; + + data_T const (&dataref)[CONFIG_T::n_in_features]; + + SingleVertexDataGetter(data_T const (&d)[CONFIG_T::n_in_features]) : dataref{d} { + //#pragma HLS INLINE + } + data_T const &get(unsigned, unsigned ix) const { + //#pragma HLS INLINE + return dataref[ix]; + } +}; + +template struct OutputResSetter 
{ + typedef res_T res_t; + + res_T *resref; + + OutputResSetter(res_T *r) : resref{r} { + //#pragma HLS INLINE + } + void set(unsigned iv, unsigned io, res_T const &acc) { + //#pragma HLS INLINE + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + resref[ivo] = acc; + } +}; + +template struct SingleVertexResSetter { + typedef res_T res_t; + + res_T (&resref)[CONFIG_T::n_out_features]; + + SingleVertexResSetter(res_T (&r)[CONFIG_T::n_out_features]) : resref{r} { + //#pragma HLS INLINE + } + void set(unsigned, unsigned io, res_T const &acc) { + //#pragma HLS INLINE + resref[io] = acc; + } +}; + +template +inline void compute_weights_aggregates(data_getter_T const &data_getter, unsigned iv, arrays_local_T &arrays_local, + arrays_T &arrays) { + //#pragma HLS INLINE + +Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename CONFIG_T::distance_t distance = CONFIG_T::aggregator_distance_biases[ia]; + + InFeatures1: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename CONFIG_T::distance_t incr = data_getter.get(iv, ix) * CONFIG_T::aggregator_distance_weights[iax]; + + distance += incr; + } + + typename CONFIG_T::edge_weight_t edge_weight = + garnet_utils::compute_edge_weight(distance); + + arrays_local.edge_weight_mean[ia] += edge_weight; + + InFeatures2: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename data_getter_T::data_t incr = data_getter.get(iv, ix) * edge_weight; + + arrays_local.weighted_feature_mean[iax] += incr; + } + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + arrays.set_weight(iva, edge_weight); + } +} + +template +inline typename CONFIG_T::aggr_t compute_output_base_core(arrays_T const &arrays, unsigned io, unsigned ia) { + //#pragma HLS INLINE + //#pragma HLS UNROLL region + + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + typename CONFIG_T::aggr_t aggr = arrays.edge_weight_mean[ia] * CONFIG_T::input_transform_biases[ioa]; + +InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ioax = ioa * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + aggr += arrays.weighted_feature_mean[iax] * CONFIG_T::input_transform_weights[ioax]; + } + + return aggr; +} + +template +inline void compute_output_base(arrays_T const &arrays, + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]) { + //#pragma HLS INLINE + //#pragma HLS UNROLL region + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + output_base[ioa] = compute_output_base_core(arrays, io, ia); + } + } +} + +template +inline void +compute_vertex_output(arrays_T const &arrays, unsigned iv, + typename CONFIG_T::aggr_t const output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators], + res_setter_T &res_setter) { + //#pragma HLS INLINE + + typename arrays_T::edge_weight_t edge_weights[CONFIG_T::n_aggregators]; + //#pragma HLS ARRAY_PARTITION variable=edge_weights complete + +Aggregators1: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + edge_weights[ia] = arrays.edge_weights[iva]; + } + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + 
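+        // Each output feature starts from its output-transform bias and then
+        // accumulates the per-aggregator contributions, weighted by this
+        // vertex's edge weights computed in the aggregation pass.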
typename res_setter_T::res_t acc = CONFIG_T::output_transform_biases[io]; + + Aggregators2: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename res_setter_T::res_t incr = edge_weights[ia] * output_base[ioa]; + acc += incr; + } + + res_setter.set(iv, io, acc); + } +} + +template +void aggregate(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx, arrays_T &arrays) { + InputDataGetter data_getter(data); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + //#pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + compute_weights_aggregates(data_getter, iv, means_local, arrays); + } + + means_accum.add_means_normalized(means_local); + } + + arrays.set_means_normalized(nvtx, means_accum); +} + +template +void distribute(nvtx_T const nvtx, arrays_T const &arrays, res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + OutputResSetter res_setter(res); + + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]; + //#pragma HLS ARRAY_PARTITION variable=output_base complete + + compute_output_base(arrays, output_base); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + //#pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + compute_vertex_output(arrays, iv, output_base, res_setter); + } + } +} + +template +void set_output(output_biases_T const &output_transform_biases, arrays_T const &arrays, + res_T res[CONFIG_T::n_out_features]) { + //#pragma HLS PIPELINE + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + res_T acc = output_transform_biases.output_biases[io]; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename CONFIG_T::aggr_t aggr = compute_output_base_core(arrays, io, ia); + + acc += arrays.edge_weight_mean[ia] * aggr; + } + + res[io] = acc; + } +} + +template +void distribute_aggregate(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, current_arrays_T ¤t_arrays) { + typedef typename prev_layer_t::output_t data_T; + + typename prev_layer_t::aggr_t prev_output_base[prev_layer_t::n_out_features * prev_layer_t::n_aggregators]; + //#pragma HLS ARRAY_PARTITION variable=prev_output_base complete + + compute_output_base(prev_arrays, prev_output_base); + + unsigned const unroll_factor = current_layer_t::n_vertices >> current_layer_t::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < current_layer_t::reuse_factor; ++ivv) { + //#pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + data_T data[prev_layer_t::n_out_features]; + //#pragma HLS ARRAY_PARTITION variable=data complete + + SingleVertexResSetter res_setter(data); + + compute_vertex_output(prev_arrays, iv, 
prev_output_base, res_setter); + + SingleVertexDataGetter data_getter(data); + + compute_weights_aggregates(data_getter, iv, means_local, current_arrays); + } + + means_accum.add_means_normalized(means_local); + } + + current_arrays.set_means_normalized(nvtx, means_accum); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + //#pragma HLS INLINE + + distribute_aggregate(nvtx, prev_arrays, last_arrays); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + //#pragma HLS INLINE + + WeightsAndMeans current_arrays; + + distribute_aggregate(nvtx, prev_arrays, current_arrays); + + sublayer(nvtx, current_arrays, last_arrays); +} +} // namespace garnet_utils + +struct garnet_config { + // Layer specs + static const unsigned n_vertices_width = 8; + static const unsigned n_vertices = (1 << n_vertices_width); + static const unsigned n_in_features = 4; + static const unsigned n_propagate = 4; + static const unsigned n_aggregators = 4; + static const unsigned n_out_features = 4; + static const unsigned distance_width = 12; + + // Internal data type definitions + typedef float input_transform_weights_t; + typedef float input_transform_biases_t; + typedef float output_transform_weights_t; + typedef float output_transform_biases_t; + typedef float aggregator_distance_weights_t; + typedef float aggregator_distance_biases_t; + + typedef float norm_t; + typedef float distance_t; + typedef float edge_weight_t; + typedef float edge_weight_aggr_t; + typedef float aggr_t; + typedef float output_t; + + /* static const input_transform_weights_t (&input_transform_weights)[n_out_features * n_aggregators * n_in_features]; */ + /* static const input_transform_biases_t (&input_transform_biases)[n_out_features * n_aggregators]; */ + /* static const aggregator_distance_weights_t (&aggregator_distance_weights)[n_aggregators * n_in_features]; */ + /* static const aggregator_distance_biases_t (&aggregator_distance_biases)[n_aggregators]; */ + /* static const output_transform_biases_t (&output_transform_biases)[n_out_features]; */ + + enum OutputCollapse { no_collapse, collapse_mean, collapse_max }; + + static const unsigned output_collapse = no_collapse; + + static const bool mean_by_nvert = false; + static const bool is_stack = false; + + // Optimization specs + static const unsigned reuse_factor = 64; + static const unsigned log2_reuse_factor = 6; +}; + +// vertices -> vertices +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + //#pragma HLS DATAFLOW + + garnet_utils::WeightsAndMeans arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::distribute(nvtx[0], arrays, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + //#pragma HLS DATAFLOW + + garnet_utils::Means arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays, res); +} + +// vertices -> vertices +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * 
CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + //#pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + typedef typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::distribute(nvtx[0], arrays_last, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + //#pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + typedef typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays_last, res); +} + +/* Reference (dumb) implementation returning (Vertices, Features) */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + typename CONFIG_T::edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t propagated_features[CONFIG_T::n_vertices * CONFIG_T::n_propagate]; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + propagated_features[ivp] = CONFIG_T::input_transform_biases[ip]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const ipx = ip * CONFIG_T::n_in_features + ix; + + propagated_features[ivp] += data[ivx] * CONFIG_T::input_transform_weights[ipx]; + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t distance = CONFIG_T::aggregator_distance_biases[ia]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + distance += data[ivx] * CONFIG_T::aggregator_distance_weights[iax]; + } + + edge_weights[iva] = garnet_utils::compute_edge_weight(distance); + } + } + + typename CONFIG_T::aggr_t aggregated_features[CONFIG_T::n_aggregators * CONFIG_T::n_propagate]; + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] += edge_weights[iva] * propagated_features[ivp]; + } + } + } + + for (unsigned ia = 0; ia < 
CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + if (CONFIG_T::mean_by_nvert) + aggregated_features[iap] /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + aggregated_features[iap] /= CONFIG_T::n_vertices; + } + } + } + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + typename CONFIG_T::aggr_t acc = CONFIG_T::output_transform_biases[io]; + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t aggr = 0.; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + unsigned const ioap = ioa * CONFIG_T::n_propagate + ip; + + aggr += CONFIG_T::output_transform_weights[ioap] * aggregated_features[iap]; + } + + acc += edge_weights[iva] * aggr; + } + + res[ivo] = acc; + } + } +} + +/* Reference (dumb) implementation returning (Features) - output averaged over vertices already */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + typename CONFIG_T::aggr_t vertex_res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]; + + garnet_ref(data, nvtx, vertex_res); + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t acc = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + acc += vertex_res[ivo]; + } + + if (CONFIG_T::mean_by_nvert) + acc /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + acc /= CONFIG_T::n_vertices; + } + + res[io] = acc; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h.bak new file mode 100644 index 0000000000..1a3a3d28b5 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h.bak @@ -0,0 +1,382 @@ +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include "hls_stream.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +#ifndef __SYNTHESIS__ + +#ifndef WEIGHTS_DIR +#define WEIGHTS_DIR "weights" +#endif + +template void load_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + + size_t i = 0; + while (std::getline(iss, token, ',')) { + std::istringstream(token) >> w[i]; + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_compressed_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), 
std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].row_index >> w[i].col_index >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_exponent_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].sign >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = dstType(src[i]); + } +} + +template void convert_data(srcType *src, hls::stream &dst) { + for (size_t i = 0; i < SIZE / dstType::size; i++) { + dstType ctype; + for (size_t j = 0; j < dstType::size; j++) { + ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); + } + dst.write(ctype); + } +} + +template void convert_data(hls::stream &src, dstType *dst) { + for (size_t i = 0; i < SIZE / srcType::size; i++) { + srcType ctype = src.read(); + for (size_t j = 0; j < srcType::size; j++) { + dst[i * srcType::size + j] = dstType(ctype[j]); + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = save_T(data[i]); + } +} + +template void save_output_array(hls::stream &data, save_T *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = save_T(ctype[j]); + } + data.write(ctype); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void 
save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << float(data[i]) << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +template void save_layer_output(hls::stream &data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + out << float(ctype[j]) << " "; // We don't care about precision in text files + } + data.write(ctype); + } + out << std::endl; + out.close(); + } +} + +#endif + +template void copy_data(std::vector src, dst_T dst[SIZE]) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + std::copy(in_begin, in_end, dst); +} + +template +void copy_data(std::vector src, hls::stream &dst) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + + size_t i_pack = 0; + dst_T dst_pack; + for (typename std::vector::const_iterator i = in_begin; i != in_end; ++i) { + dst_pack[i_pack++] = typename dst_T::value_type(*i); + if (i_pack == dst_T::size) { + i_pack = 0; + dst.write(dst_pack); + } + } +} + +template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { + for (auto i = 0; i < SIZE; i++) + if (i == SIZE - 1) { + dst[i].data = src[i]; + dst[i].last = 1; + } else { + dst[i].data = src[i]; + dst[i].last = 0; + } +} + +template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE; i++) { + out << result[i] << " "; + } + out << std::endl; +} + +template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE / res_T::size; i++) { + res_T res_pack = result.read(); + for (int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + if (keep) + result.write(res_pack); + } + out << std::endl; +} + 
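+// A minimal sketch of how the helpers above are typically combined in a
+// testbench (illustrative only; 'run_helpers_example', its template
+// parameters, and the commented-out top-function call are assumptions,
+// not part of this patch):
+template <class in_T, class res_T, int N_IN, int N_OUT>
+void run_helpers_example(std::vector<float> &inputs, hls::stream<in_T> &in_stream, hls::stream<res_T> &out_stream) {
+    copy_data<float, in_T, 0, N_IN>(inputs, in_stream); // pack the flat input vector into the DUT input stream
+    // myproject(in_stream, out_stream);                // the generated top function would be called here
+    print_result<res_T, N_OUT>(out_stream, std::cout);  // print one line of outputs
+}
+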
+template <class data_T, int SIZE> void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); }
+
+template <class data_T, int SIZE> void fill_zero(hls::stream<data_T> &data) {
+    for (int i = 0; i < SIZE / data_T::size; i++) {
+        data_T data_pack;
+        for (int j = 0; j < data_T::size; j++) {
+            data_pack[j] = 0.;
+        }
+        data.write(data_pack);
+    }
+}
+
+template <class dataType, unsigned int nrows> int read_file_1D(const char *filename, dataType data[nrows]) {
+    FILE *fp;
+    fp = fopen(filename, "r");
+    if (fp == 0) {
+        return -1;
+    }
+    // Read data from file
+    float newval;
+    for (int ii = 0; ii < nrows; ii++) {
+        if (fscanf(fp, "%f\n", &newval) == 1) { // fscanf returns the number of items matched, EOF on failure
+            data[ii] = newval;
+        } else {
+            return -2;
+        }
+    }
+    fclose(fp);
+    return 0;
+}
+
+template <class dataType, unsigned int nrows, unsigned int ncols>
+int read_file_2D(const char *filename, dataType data[nrows][ncols]) {
+    FILE *fp;
+    fp = fopen(filename, "r");
+    if (fp == 0) {
+        return -1;
+    }
+    // Read data from file
+    float newval;
+    for (int ii = 0; ii < nrows; ii++) {
+        for (int jj = 0; jj < ncols; jj++) {
+            if (fscanf(fp, "%f\n", &newval) == 1) {
+                data[ii][jj] = newval;
+            } else {
+                return -2;
+            }
+        }
+    }
+    fclose(fp);
+    return 0;
+}
+
+template <class in_T, class out_T, int N_IN> void change_type(hls::stream<in_T> &in, hls::stream<out_T> &out) {
+    for (int ii = 0; ii < N_IN; ii++) {
+        out << (out_T)in.read();
+    }
+}
+
+template <class data_T, int N_IN> void hls_stream_debug(hls::stream<data_T> &data, hls::stream<data_T> &res) {
+    data_T datareg;
+    for (int ii = 0; ii < N_IN; ii++) {
+        datareg = data.read();
+        std::cout << "[" << ii << "]: " << datareg << std::endl;
+        res << datareg;
+    }
+}
+
+constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); }
+
+constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); }
+
+constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); }
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h.bak
new file mode 100644
index 0000000000..ef8172f297
--- /dev/null
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h.bak
@@ -0,0 +1,404 @@
+//
+// rfnoc-hls-neuralnet: Vivado HLS code for neural-net building blocks
+//
+// Copyright (C) 2017 EJ Kreinar
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program. If not, see <https://www.gnu.org/licenses/>.
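+//
+// Usage sketch: hls4ml is expected to generate a model-specific
+// specialization of layernorm_config. The names below, including table_t,
+// table_size and table_range, are assumptions inferred from how this header
+// uses CONFIG_T:
+//
+//   struct norm_config : nnet::layernorm_config {
+//       static const unsigned n_in = 4 * 16;      // seq_len * features
+//       static const unsigned seq_len = 4;
+//       static const unsigned table_size = 1024;
+//       static constexpr double table_range = 1.0;
+//       typedef ap_fixed<18, 8> table_t;
+//   };
+//   nnet::layernormalize<input_t, result_t, norm_config>(data, res, scale, bias);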
+//
+
+#ifndef NNET_LAYERNORM_H_
+#define NNET_LAYERNORM_H_
+
+#include "hls_stream.h"
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include <math.h>
+#include <iostream>
+
+#include "hls_math.h"
+
+namespace nnet {
+
+struct layernorm_config {
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float scale_t;
+    typedef ap_fixed<16, 8> mean_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 20;
+    static const unsigned seq_len = 4;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0;
+
+    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+};
+
+template <typename CONFIG_T, int N_TABLE> void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) {
+    float inv_range = CONFIG_T::table_range;
+    // Inverse square root: result = 1/sqrt(x)
+    for (int ii = 0; ii < N_TABLE; ii++) {
+        // First, convert from table index to x-value (range 0 to table_range)
+        float in_val = inv_range * ii / float(N_TABLE);
+        // Next, compute lookup table function
+        if (in_val > 0.0)
+            table_out[ii] = 1.0 / sqrt(in_val);
+        else
+            table_out[ii] = 0.0;
+    }
+}
+
+template <typename CONFIG_T, int N_TABLE> void init_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) {
+    float inv_range = 0.5; // increase this range if the table is not accurate enough
+    // Square root: result = sqrt(x)
+    for (int ii = 0; ii < N_TABLE; ii++) {
+        // First, convert from table index to x-value (range 0 to inv_range)
+        float in_val = inv_range * ii / float(N_TABLE);
+        // Next, compute lookup table function
+        if (in_val > 0.0)
+            table_out[ii] = sqrt(in_val);
+        else
+            table_out[ii] = 0.0;
+    }
+}
+
+//////////////////////
+// Dennis's version //
+//////////////////////
+template <class data_T, class res_T, typename CONFIG_T>
+void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CONFIG_T::n_in / CONFIG_T::seq_len],
+                  typename CONFIG_T::scale_t scale[CONFIG_T::n_in / CONFIG_T::seq_len],
+                  typename CONFIG_T::bias_t bias[CONFIG_T::n_in / CONFIG_T::seq_len]) {
+    //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+    //#pragma HLS ARRAY_PARTITION variable=data complete
+    //#pragma HLS ARRAY_PARTITION variable=res complete
+    int inv_range_inv = (int)1 / CONFIG_T::table_range;
+    typename CONFIG_T::table_t deno_inver = 0;
+#ifdef __HLS_SYN__
+    bool initialized = false;
+    typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size];
+#else
+    static bool initialized = false;
+    static typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size];
+#endif
+    if (!initialized) {
+        init_invert_sqr_table<CONFIG_T, CONFIG_T::table_size>(invert_sqr_table);
+        initialized = true;
+    }
+
+    static const unsigned dim = CONFIG_T::n_in / CONFIG_T::seq_len;
+    typename CONFIG_T::mean_t sum_cache = 0;
+    typename CONFIG_T::mean_t sum_cache2 = 0;
+    typename CONFIG_T::mean_t var, mean, diff;
+    typename CONFIG_T::mean_t data_diff[dim];
+
+    //#pragma HLS ARRAY_PARTITION variable=data_diff complete
+
+    const typename CONFIG_T::mean_t k_inv = 1.0 / dim;
+    for (int i = 0; i < dim; ++i) {
+        sum_cache += static_cast<typename CONFIG_T::mean_t>(data[i]);
+    }
+    mean = CONFIG_T::template product<typename CONFIG_T::mean_t, typename CONFIG_T::mean_t>::product(sum_cache, k_inv);
+
+    for (int i = 0; i < dim; ++i) {
+        data_diff[i] = static_cast<typename CONFIG_T::mean_t>(data[i]) - mean;
+        diff = data_diff[i] * data_diff[i];
+        sum_cache2 += diff;
+    }
+    var = CONFIG_T::template product<typename CONFIG_T::mean_t, typename CONFIG_T::mean_t>::product(sum_cache2, k_inv);
+
+    int index = var * (CONFIG_T::table_size) * inv_range_inv;
+    if (CONFIG_T::table_range > 1)
+        index = var * (CONFIG_T::table_size) / (int)CONFIG_T::table_range;
+
+    if (index < 0)
+        index = 0;
+    if (index > CONFIG_T::table_size - 1)
+        index = CONFIG_T::table_size - 1;
+    deno_inver = (typename CONFIG_T::table_t)invert_sqr_table[index];
+
+    for (int i = 0; i < dim; ++i) {
+        res[i] = data_diff[i] * deno_inver * scale[i] + bias[i];
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void layernormalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
+                    typename CONFIG_T::scale_t scale[CONFIG_T::n_in / CONFIG_T::seq_len],
+                    typename CONFIG_T::bias_t bias[CONFIG_T::n_in / CONFIG_T::seq_len]) {
+    static const unsigned dim = CONFIG_T::n_in / CONFIG_T::seq_len;
+    data_T in_val[dim];
+    data_T outval[dim];
+    // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
+    //#pragma HLS function_instantiate variable=scale,bias
+    //#pragma HLS ARRAY_PARTITION variable=scale complete
+    //#pragma HLS ARRAY_PARTITION variable=bias complete
+    //#pragma HLS ARRAY_PARTITION variable=in_val complete
+    //#pragma HLS ARRAY_PARTITION variable=outval complete
+
+    for (int j = 0; j < CONFIG_T::seq_len; ++j) {
+        //#pragma HLS PIPELINE
+    load:
+        for (int i = 0; i < dim; ++i) {
+            //#pragma HLS UNROLL
+            in_val[i] = data[j * dim + i];
+        }
+        layernorm_1d<data_T, res_T, CONFIG_T>(in_val, outval, scale, bias);
+    store:
+        for (int i = 0; i < dim; ++i) {
+            //#pragma HLS UNROLL
+            res[j * dim + i] = outval[i];
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h.bak
new file mode 100644
index 0000000000..c6f6dbbf95
--- /dev/null
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h.bak
@@ -0,0 +1,256 @@
+#ifndef NNET_MERGE_H_
+#define NNET_MERGE_H_
+
+#include "hls_stream.h"
+#include "nnet_common.h"
+#include "nnet_mult.h"
+#include <math.h>
+
+namespace nnet {
+
+struct merge_config {
+    static const unsigned n_elem = 10;
+};
+
+struct dot_config {
+    static const unsigned n_in = 10;
+    static const unsigned n_out = 1;
+    static const unsigned reuse_factor = 1;
+    typedef float accum_t;
+    // Product function to use
+    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+};
+
+struct concat_config {
+    static const unsigned n_elem1_0 = 10;
+    static const unsigned n_elem1_1 = 10;
+    static const unsigned n_elem1_2 = 10;
+    static const unsigned n_elem2_0 = 10;
+    static const unsigned n_elem2_1 = 10;
+    static const unsigned n_elem2_2 = 10;
+
+    static const unsigned axis = -1;
+};
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
+    //#pragma HLS PIPELINE
+
+    for (int ii = 0; ii < CONFIG_T::n_elem; ii++) {
+        res[ii] = data1[ii] + data2[ii];
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
+    //#pragma HLS PIPELINE
+
+    for (int ii = 0; ii < CONFIG_T::n_elem; ii++) {
+        res[ii] = data1[ii] - data2[ii];
+    }
+}
+
+template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
+void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
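+    // Usage sketch for the element-wise merge functions in this header
+    // (multiply shown; add/subtract/average/maximum/minimum follow the same
+    // pattern). my_merge_config is an assumed specialization:
+    //
+    //   struct my_merge_config : nnet::merge_config {
+    //       static const unsigned n_elem = 64;
+    //   };
+    //   nnet::multiply<input_t, input_t, result_t, my_merge_config>(a, b, out);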
//#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] * data2[ii]; + } +} + +template +void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] + data2[ii]) / (res_T)2; + } +} + +template +void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] > data2[ii]) ? data1[ii] : data2[ii]; + } +} + +template +void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] < data2[ii]) ? data1[ii] : data2[ii]; + } +} + +template +void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + + typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; + //#pragma HLS ARRAY_PARTITION variable=mult complete + typename CONFIG_T::accum_t acc = 0; + +Product: + for (int i_mult = 0; i_mult < CONFIG_T::n_in; i_mult++) { + //#pragma HLS UNROLL + mult[i_mult] = CONFIG_T::template product::product(data1[i_mult], data2[i_mult]); + } + +Accum: + for (int i_acc = 0; i_acc < CONFIG_T::n_in; i_acc++) { + //#pragma HLS UNROLL + acc += mult[i_acc]; + } + + res[0] = cast(acc); +} + +template +void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], + res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0; ii++) { + res[CONFIG_T::n_elem1_0 + ii] = data2[ii]; + } +} + +template +void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + ii] = data2[ii]; + } +} + +template +void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + jj] = data1[ii * CONFIG_T::n_elem1_1 + jj]; + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + jj] = + data2[ii * CONFIG_T::n_elem2_1 + jj]; + } + } +} + +template +void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + //#pragma HLS INLINE + + if (CONFIG_T::axis == 2 || 
CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + ii] = data2[ii]; + } +} + +template +void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = + ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem2_2; kk++) { + int res_idx = ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + + (jj + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + //#pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk + CONFIG_T::n_elem1_2; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + //#pragma HLS INLINE + + if 
(CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h.bak new file mode 100644 index 0000000000..5cc89659fe --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h.bak @@ -0,0 +1,370 @@ +#ifndef NNET_MERGE_STREAM_H_ +#define NNET_MERGE_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include + +namespace nnet { + +template +void add(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AddLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + AddPack: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + out_data[j] = in_data1[j] + in_data2[j]; + } + + res.write(out_data); + } +} + +template +void subtract(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +SubtractLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SubtractPack: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + out_data[j] = in_data1[j] - in_data2[j]; + } + + res.write(out_data); + } +} + +template +void multiply(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MultiplyLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MultiplyPack: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + out_data[j] = in_data1[j] * in_data2[j]; + } + + res.write(out_data); + } +} + +template +void average(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AverageLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + AveragePack: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + out_data[j] = (in_data1[j] + in_data2[j]) / (typename res_T::value_type)2; + } + + res.write(out_data); + } +} + +template +void maximum(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MaximumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MaximumPack: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + out_data[j] = (in_data1[j] > in_data2[j]) ? 
in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void minimum(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MinimumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MinimumPack: + for (int j = 0; j < res_T::size; j++) { + //#pragma HLS UNROLL + out_data[j] = (in_data1[j] < in_data2[j]) ? in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void concatenate3d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + //#pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + //#pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + //#pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + //#pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_2(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + //#pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + //#pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +template +void concatenate2d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { 
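+    // Sketch of the convention assumed by the streaming concatenate
+    // functions: each stream word is one packed row, so concatenating along
+    // axis 0 simply forwards all n_elem1_0 packs of data1 followed by all
+    // n_elem2_0 packs of data2, e.g.
+    //
+    //   nnet::concatenate2d<in1_t, in2_t, out_t, concat_cfg>(s1, s2, s_out);
+    //
+    // where concat_cfg::axis selects between the axis-0 and axis-1 variants.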
+ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + //#pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + //#pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + //#pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + //#pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + //#pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate1d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + res_T out_data; + PRAGMA_DATA_PACK(out_data) +ConcatLoop1: + for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { + //#pragma HLS PIPELINE + input1_T in_data1 = data1.read(); + ConcatPack1: + for (int j = 0; j < input1_T::size; j++) { + //#pragma HLS UNROLL + out_data[j + (i * input1_T::size)] = in_data1[j]; + } + } +ConcatLoop2: + for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) { + //#pragma HLS PIPELINE + input2_T in_data2 = data2.read(); + ConcatPack2: + for (int j = 0; j < input2_T::size; j++) { + //#pragma HLS UNROLL + out_data[j + (i * input2_T::size) + (CONFIG_T::n_elem1_0)] = in_data2[j]; + } + } + res.write(out_data); +} +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h.bak new file mode 100644 index 0000000000..24979806df --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h.bak @@ -0,0 +1,116 @@ +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include +#include + +namespace nnet { + +namespace product { + +/* --- + * different methods to perform the product of input and weight, depending on the + * types of each. 
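+ *
+ * Worked example for the weight_exponential variant defined below (a
+ * sketch): with w.sign = 1 and w.weight = 3, product(a, w) evaluates
+ * a << 3, i.e. a * 8; with w.sign = 0 it yields -(a << 3). Power-of-two
+ * weights therefore need only a shift and an optional negation instead
+ * of a full multiplier.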
+ * --- */ + +class Product {}; + +template class both_binary : public Product { + public: + static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + //#pragma HLS INLINE + return a == w; + } +}; + +template class weight_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + //#pragma HLS INLINE + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + //#pragma HLS INLINE + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + //#pragma HLS INLINE + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + //#pragma HLS INLINE + return a * w; + } +}; + +template class weight_exponential : public Product { + public: + using r_T = ap_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width)>; + static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + //#pragma HLS INLINE + + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + + // Negate or not depending on weight sign + return w.sign == 1 ? y : static_cast(-y); + } +}; + +} // namespace product + +template +inline typename std::enable_if>::value && + std::is_same>::value, + ap_int>::type +cast(typename CONFIG_T::accum_t x) { + return (ap_int)(x - CONFIG_T::n_in / 2) * 2; +} + +template +inline typename std::enable_if< + std::is_same>::value && !std::is_same>::value, res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h.bak new file mode 100644 index 0000000000..20be3c74d6 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h.bak @@ -0,0 +1,337 @@ +#ifndef NNET_MHT_H_ +#define NNET_MHT_H_ + +#include "hls_stream.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_mult.h" +#include +#include + +namespace nnet { + +struct multiheadattention_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned num_heads = 10; + static const unsigned head_dim_key = 10; + static const unsigned head_dim_value = 10; + static const unsigned feature_dim = 20; + static const unsigned seq_len = 500; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned strategy = latency; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + + template using product = nnet::product::mult; +}; + +template struct datapack { data_T data[PackSize]; }; + +template void read_stream_array(hls::stream data_in[size], data_T out[size]) { + for (int k = 0; k < size; ++k) { + 
//#pragma HLS UNROLL + out[k] = data_in[k].read(); + } +} + +template +void matrixmul_transpose(hls::stream> &Q, + hls::stream> &K, + res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K +{ + const data_T dk = 1.0 / sqrt(CONFIG_T::head_dim_key); + data_T QK_1; + typename CONFIG_T::accum_t QKij; + data_T Qi[CONFIG_T::head_dim_key]; + data_T Product[CONFIG_T::seq_len]; // seq_Q, seq_K + data_T qk_smout[CONFIG_T::seq_len]; + data_T krow[CONFIG_T::seq_len * CONFIG_T::head_dim_key]; + //#pragma HLS ARRAY_PARTITION variable=Qi complete + //#pragma HLS ARRAY_PARTITION variable=Product complete + //#pragma HLS ARRAY_PARTITION variable=qk_smout complete + //#pragma HLS ARRAY_PARTITION variable=QK complete dim=2 + //#pragma HLS ARRAY_PARTITION variable=krow complete + + datapack datak_pack, dataq_pack; + //#pragma HLS DATA_PACK variable=Q + //#pragma HLS DATA_PACK variable=K + //#pragma HLS DATA_PACK variable=datak_pack + //#pragma HLS DATA_PACK variable=dataq_pack + + int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_key) / float(CONFIG_T::reuse_factor)); + CONFIG_T::template product::limit(multiplier_limit); + +prep_k: + for (int i = 0; i < CONFIG_T::seq_len; ++i) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + datak_pack = K.read(); + for (int j = 0; j < CONFIG_T::head_dim_key; ++j) { + //#pragma HLS UNROLL + krow[i * CONFIG_T::head_dim_key + j] = datak_pack.data[j]; + } + } + +// for each row and column of AB +row: + for (int i = 0; i < CONFIG_T::seq_len; ++i) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + dataq_pack = Q.read(); + + q: + for (int q_i = 0; q_i < CONFIG_T::head_dim_key; ++q_i) { + //#pragma HLS UNROLL + Qi[q_i] = dataq_pack.data[q_i]; + } + col: + for (int j = 0; j < CONFIG_T::seq_len; ++j) { + // compute (QK)i,j + QKij = 0; + product: + for (int k = 0; k < CONFIG_T::head_dim_key; ++k) { + QK_1 = CONFIG_T::template product::product(Qi[k], krow[j * CONFIG_T::head_dim_key + k]); + QKij += QK_1; + } + Product[j] = QKij * dk; + } + softmax(Product, qk_smout); + for (int n = 0; n < CONFIG_T::seq_len; ++n) { + //#pragma HLS UNROLL + QK[i][n] = qk_smout[n]; + } + } +} + +///////// +template +void matrixmul(data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], hls::stream> &V, + hls::stream S[CONFIG_T::head_dim_value]) // S: attention score +{ + //#pragma HLS DATA_PACK variable=V + //#pragma HLS ARRAY_PARTITION variable=QK complete dim=2 + //#pragma HLS ARRAY_PARTITION variable=S complete dim=1 + + datapack datav_pack; + //#pragma HLS DATA_PACK variable=datav_pack + + int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_value) / float(CONFIG_T::reuse_factor)); + CONFIG_T::template product::limit(multiplier_limit); + + data_T dataV[CONFIG_T::seq_len * CONFIG_T::head_dim_value]; + //#pragma HLS ARRAY_PARTITION variable = dataV complete dim = 1 + + for (int j = 0; j < CONFIG_T::seq_len; ++j) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + datav_pack = V.read(); + for (int i = 0; i < CONFIG_T::head_dim_value; ++i) { + //#pragma HLS UNROLL + dataV[CONFIG_T::seq_len * i + j] = datav_pack.data[i]; + } + } + + // for each row and column of AB + data_T Sij, S_1; + data_T QKi[CONFIG_T::seq_len]; + //#pragma HLS ARRAY_Partition variable=QKi complete +row: + for (int i = 0; i < CONFIG_T::seq_len; ++i) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + qk: + for (int q_i = 0; q_i < CONFIG_T::seq_len; ++q_i) { + //#pragma HLS UNROLL + QKi[q_i] = QK[i][q_i]; + } + col: + for (int j = 0; j < CONFIG_T::head_dim_value; ++j) { + // 
compute (S)i,j + Sij = 0; + product: + for (int k = 0; k < CONFIG_T::seq_len; ++k) { + S_1 = CONFIG_T::template product::product(QKi[k], dataV[j * CONFIG_T::seq_len + k]); + Sij += S_1; + } + S[j].write(Sij); + } + } +} + +template +void lin_projection(hls::stream data_q[CONFIG_T::feature_dim], hls::stream data_vk[CONFIG_T::feature_dim], + hls::stream> &k_proj, + hls::stream> &q_proj, + hls::stream> &v_proj, + typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], + typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_key], + typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], + typename CONFIG_T::bias_t query_bias[CONFIG_T::head_dim_key], + typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value], + typename CONFIG_T::bias_t value_bias[CONFIG_T::head_dim_value]) + +{ + //#pragma HLS DATA_PACK variable=k_proj + //#pragma HLS DATA_PACK variable=q_proj + //#pragma HLS DATA_PACK variable=v_proj + + //#pragma HLS ARRAY_PARTITION variable=data_q complete dim=1 + //#pragma HLS ARRAY_PARTITION variable=data_vk complete dim=1 + +k_h: + for (int j = 0; j < CONFIG_T::seq_len; ++j) { + //#pragma HLS PIPELINE + + data_T proj_k[CONFIG_T::head_dim_key]; + data_T proj_q[CONFIG_T::head_dim_key]; + data_T proj_v[CONFIG_T::head_dim_value]; + data_T in_q[CONFIG_T::feature_dim]; + data_T in_v[CONFIG_T::feature_dim]; + //#pragma HLS ARRAY_PARTITION variable=proj_k complete dim=1 + //#pragma HLS ARRAY_PARTITION variable=proj_q complete dim=1 + //#pragma HLS ARRAY_PARTITION variable=proj_v complete dim=1 + //#pragma HLS ARRAY_PARTITION variable=in_q complete dim=1 + //#pragma HLS ARRAY_PARTITION variable=in_v complete dim=1 + + datapack proj_k_pack; + datapack proj_q_pack; + datapack proj_v_pack; + //#pragma HLS DATA_PACK variable=proj_k_pack + //#pragma HLS DATA_PACK variable=proj_q_pack + //#pragma HLS DATA_PACK variable=proj_v_pack + + read_stream_array(data_q, in_q); + read_stream_array(data_vk, in_v); + + dense(in_v, proj_k_pack.data, key_weight, key_bias); + dense(in_q, proj_q_pack.data, query_weight, query_bias); + dense(in_v, proj_v_pack.data, value_weight, value_bias); + + k_proj.write(proj_k_pack); + q_proj.write(proj_q_pack); + v_proj.write(proj_v_pack); + } +} + +template +void dense_out(hls::stream data_in[CONFIG_T::num_heads][CONFIG_T::head_dim_value], + res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim], + typename CONFIG_T::weight_t + attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value * CONFIG_T::feature_dim], + typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim]) { + data_T mat_res_con[CONFIG_T::num_heads * CONFIG_T::head_dim_value]; + res_T dense_out[CONFIG_T::feature_dim]; +//#pragma HLS ARRAY_PARTITION variable=mat_res_con complete dim=1 +//#pragma HLS ARRAY_PARTITION variable=dense_out complete dim=1 +output_dense: + for (int k = 0; k < CONFIG_T::seq_len; ++k) { + + //#pragma HLS PIPELINE + for (int i = 0; i < CONFIG_T::num_heads; ++i) { + //#pragma HLS UNROLL + for (int j = 0; j < CONFIG_T::head_dim_value; ++j) { + //#pragma HLS UNROLL + mat_res_con[CONFIG_T::head_dim_value * i + j] = data_in[i][j].read(); + } + } + dense(mat_res_con, dense_out, attention_output_weight, + attention_output_bias); + for (int i = 0; i < CONFIG_T::feature_dim; ++i) { + //#pragma HLS UNROLL + res[CONFIG_T::feature_dim * k + i] = dense_out[i]; + } + } +} + +template +void data_prep(data_T data[CONFIG_T::seq_len * CONFIG_T::feature_dim], hls::stream 
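+// Usage sketch for the top-level multiheadattention() defined below
+// (mha_config is an assumed, generated specialization of
+// multiheadattention_config):
+//
+//   struct mha_config : nnet::multiheadattention_config {
+//       static const unsigned num_heads = 4;
+//       static const unsigned head_dim_key = 8;
+//       static const unsigned head_dim_value = 8;
+//       static const unsigned feature_dim = 32;
+//       static const unsigned seq_len = 20;
+//   };
+//   // Self-attention: the same tensor feeds the query and the key/value path.
+//   nnet::multiheadattention<input_t, result_t, mha_config>(
+//       x, x, out, w_out, b_out, w_key, b_key, w_query, b_query, w_value, b_value);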
d[CONFIG_T::feature_dim]) { + //#pragma HLS ARRAY_PARTITION variable=d complete dim=1 + for (int j = 0; j < CONFIG_T::seq_len; ++j) { + for (int k = 0; k < CONFIG_T::feature_dim; ++k) { + //#pragma HLS UNROLL + d[k].write(data[j * CONFIG_T::feature_dim + k]); + } + } +} + +template +void multiheadattention( + data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], data_T data_vk[CONFIG_T::seq_len * CONFIG_T::feature_dim], + res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim], + typename CONFIG_T::weight_t attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value * + CONFIG_T::feature_dim], // num_heads,head_size_v,dim + typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim], + typename CONFIG_T::weight_t + key_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], // n_head,dim,head_dim + typename CONFIG_T::bias_t key_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key], + typename CONFIG_T::weight_t + query_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], // same shape as key + typename CONFIG_T::bias_t query_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key], + typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_value], + typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { + hls::stream d_value[CONFIG_T::num_heads][CONFIG_T::feature_dim]; + hls::stream d_query[CONFIG_T::num_heads][CONFIG_T::feature_dim]; + hls::stream> q_proj[CONFIG_T::num_heads]; + hls::stream> k_proj[CONFIG_T::num_heads]; + hls::stream> v_proj[CONFIG_T::num_heads]; + data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; + hls::stream matr_out[CONFIG_T::num_heads][CONFIG_T::head_dim_value]; + + //#pragma HLS DATAFLOW + //#pragma HLS ARRAY_PARTITION variable=d_query complete dim=1 + //#pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 + //#pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 + //#pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 + //#pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 + //#pragma HLS ARRAY_PARTITION variable=matr_out complete dim=1 + // std::cout << "input to MHA: " << std::endl; + // nnet::print_result(data_q, std::cout); + // std::cout << " " << std::endl; + +prepq: + for (int i = 0; i < CONFIG_T::num_heads; ++i) { + //#pragma HLS UNROLL + nnet::data_prep(data_q, d_query[i]); + } +prepvk: + for (int i = 0; i < CONFIG_T::num_heads; ++i) { + //#pragma HLS UNROLL + nnet::data_prep(data_vk, d_value[i]); + } + +// linear projection +lin_proj: + for (int i = 0; i < CONFIG_T::num_heads; ++i) { + //#pragma HLS UNROLL + nnet::lin_projection( + d_query[i], d_value[i], k_proj[i], q_proj[i], v_proj[i], + key_weight + (CONFIG_T::head_dim_key * CONFIG_T::feature_dim * i), key_bias + (CONFIG_T::head_dim_key * i), + query_weight + (CONFIG_T::head_dim_key * CONFIG_T::feature_dim * i), query_bias + (CONFIG_T::head_dim_key * i), + value_weight + (CONFIG_T::head_dim_value * CONFIG_T::feature_dim * i), + value_bias + (CONFIG_T::head_dim_value * i)); + } + +maxtrixmul1: + for (int i = 0; i < CONFIG_T::num_heads; ++i) { + //#pragma HLS UNROLL + nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); + } + +maxtrixmul2: + for (int i = 0; i < CONFIG_T::num_heads; ++i) { + //#pragma HLS UNROLL + nnet::matrixmul(qk_mul[i], v_proj[i], matr_out[i]); // stream + } + + nnet::dense_out(matr_out, res, attention_output_weight, attention_output_bias); + // std::cout << "out MHA: " << std::endl; + // 
nnet::print_result(res, std::cout); + // std::cout << " " << std::endl; +} +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_padding.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_padding.h.bak new file mode 100644 index 0000000000..d069cc3f5b --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_padding.h.bak @@ -0,0 +1,145 @@ +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +#include + +namespace nnet { + +struct padding1d_config { + static const unsigned n_chan = 10; + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad1d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], data_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + //#pragma HLS PIPELINE + + for (int j = 0; j < CONFIG_T::n_chan; j++) { + for (int i = 0; i < CONFIG_T::pad_left; i++) { + *(res++) = 0; + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + *(res++) = (res_T) * (data++); + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + *(res++) = 0; + } + } +} + +template +void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + //#pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::pad_left; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = (res_T) * (data++); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned n_chan = 10; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + data_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + //#pragma HLS PIPELINE + + for (int k = 0; k < CONFIG_T::n_chan; k++) { + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + *(res++) = 0; + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + *(res++) = (res_T) * (data++); + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + } +} + +template +void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + //#pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = (res_T) * (data++); + } + } + for 
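+            // Size relation assumed by the zeropad functions: out_width =
+            // pad_left + in_width + pad_right and out_height = pad_top +
+            // in_height + pad_bottom; e.g. a 10x10 image with one pixel of
+            // padding on every side yields a 12x12 output.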
(int j = 0; j < CONFIG_T::pad_right; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h.bak new file mode 100644 index 0000000000..4611175a68 --- /dev/null +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h.bak @@ -0,0 +1,313 @@ +#ifndef NNET_POOLING_H_ +#define NNET_POOLING_H_ + +#include "nnet_helpers.h" +#include + +namespace nnet { + +// Return the maximum value from an array +template accum_t max(T x[N]) { + T y = x[0]; + for (int i = 1; i < N; i++) { + y = x[i] > y ? x[i] : y; + } + return y; +} + +// Return the mean value of an array +template accum_t avg(T (&x)[N], unsigned length) { + accum_t y = 0; + for (int i = 0; i < N; i++) { + y += x[i]; + } + y /= length; + return y; +} + +// Enumeration for pooling operation (max, avg, l2norm pooling) +enum Pool_Op { Max, Average }; // L2Norm }; +template accum_t pool_op(T (&x)[N], unsigned length) { + switch (op) { + case Max: + return max(x); + case Average: + return avg(x, length); + // case L2Norm: return l2norm(x); + } +} + +template accum_t pool_op(T (&x)[N]) { + return pool_op(x, N); +} + +template T pad_val() { + /*--- + *- In Tensorflow, pooling ignores the value in the padded cells + *- For Avg pooling, return 0 (the divisior is modified to the + *- area overlapping the unpadded image. + *- For max pooling, return the most negative value for the type. + *- TODO this is not really generic, it assumes fixed point or integer T + ---*/ + switch (op) { + case Max: { + T x = 0; + x[x.width - 1] = 1; + return x; + break; + } + case Average: + return 0; + } +} + +struct pooling1d_config { + // IO size + static const unsigned n_in = 10; + static const unsigned pool_width = 2; + static const unsigned stride_width = 2; + static const unsigned n_out = (n_in - pool_width) / stride_width + 1; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template constexpr int pool_op_limit_1d() { + return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + //#pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any necessary padding + + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image x in steps of stride + for (int ii = 0; ii < restricted_padded_width; ii += CONFIG_T::stride_width) { + unsigned overlap_pixel = 0; + data_T pool[CONFIG_T::pool_width]; + //#pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + + for (int jj = 0; jj < CONFIG_T::pool_width; jj++) { + if (ii + jj >= CONFIG_T::pad_left && ii + jj < CONFIG_T::n_in + 
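+                    // Worked example of the divisor logic below: average
+                    // pooling with pool_width = 3, stride_width = 3 and
+                    // pad_left = 1 sees only 2 real pixels in its first
+                    // window, so with count_pad = false pool_op divides by
+                    // overlap_pixel = 2, while count_pad = true divides by
+                    // stride_width = 3 (padding counted as zeros).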
CONFIG_T::pad_left) { + pool[jj] = data[(ii + jj - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; + overlap_pixel++; + } else + pool[jj] = pad_val(); + } + + int patch_size = CONFIG_T::count_pad ? CONFIG_T::stride_width : overlap_pixel; + + res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = + pool_op(pool, patch_size); + } + } +} + +template +void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + //#pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + data_T pool[CONFIG_T::n_in]; + //#pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + for (int jj = 0; jj < CONFIG_T::n_in; jj++) { + pool[jj] = data[jj * CONFIG_T::n_filt + ff]; + } + // do the pooling + res[ff] = pool_op(pool); + } +} + +struct pooling2d_config { + // IO size + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_filt = 4; + static const unsigned stride_height = 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - pool_width) / stride_width + 1; + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + // Pooling function + static const Pool_Op pool_op = Max; + // Reuse factor + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef float accum_t; +}; + +template constexpr int pool_op_limit() { + return (CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + //#pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + + // Loop over input image y in steps of stride + for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + //#pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + + unsigned overlap_pixel = 0; + + // 
Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + bool cond1 = ii + kk >= CONFIG_T::pad_top && ii + kk < CONFIG_T::in_height + CONFIG_T::pad_top; + bool cond2 = jj + ll >= CONFIG_T::pad_left && jj + ll < CONFIG_T::in_width + CONFIG_T::pad_left; + if (cond1 && cond2) { + unsigned data_idx = + ((ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + (jj + ll - CONFIG_T::pad_left)) * + CONFIG_T::n_filt + + ff; + pool[kk * CONFIG_T::stride_width + ll] = data[data_idx]; + overlap_pixel++; + } else + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + } + } + + int patch_size = CONFIG_T::count_pad ? CONFIG_T::stride_width * CONFIG_T::stride_height : overlap_pixel; + + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = + pool_op(pool, patch_size); + } + } + } +} + +template +void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + //#pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image y in steps of stride + for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + //#pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (full_padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (full_padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + pool[kk * CONFIG_T::stride_width + ll] = + data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + + ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + ff * CONFIG_T::out_height * CONFIG_T::out_width] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op 
== Average) {
+                    data_T rescale =
+                        static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap;
+                    res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) +
+                        ff * CONFIG_T::out_height * CONFIG_T::out_width] *= rescale;
+                }
+            }
+        }
+    }
+}
+
+template
+void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt],
+                         res_T res[CONFIG_T::n_filt]) {
+    assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0);
+    assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0);
+    assert(CONFIG_T::pool_width == CONFIG_T::stride_width);
+    assert(CONFIG_T::pool_height == CONFIG_T::stride_height);
+
+    //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+
+    const int limit = pool_op_limit();
+    //#pragma HLS ALLOCATION instances=pool_op limit=limit function
+
+FiltLoop:
+    for (int filt = 0; filt < CONFIG_T::n_filt; filt++) {
+        data_T pool[CONFIG_T::in_height * CONFIG_T::in_width];
+
+    InputLoop:
+        for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) {
+            pool[i] = data[i * CONFIG_T::n_filt + filt];
+        }
+
+        res[filt] = static_cast(
+            pool_op(pool));
+    }
+}
+
+} // namespace nnet
+
+#endif
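The `pad_val`/`count_pad` machinery above mirrors TensorFlow's convention that padded cells do not contribute to an average. The same semantics in a few lines of NumPy, reduced to a single (H, W) channel; an illustrative sketch, not code from this patch set:

```
import numpy as np

def avg_pool2d(x, pool=2, stride=2, pad=1, count_pad=False):
    """Average-pool one channel; count_pad=False excludes padding from the divisor."""
    h, w = x.shape
    out = []
    for i in range(-pad, h + pad - pool + 1, stride):
        row = []
        for j in range(-pad, w + pad - pool + 1, stride):
            # Window clipped to the unpadded image (padded cells contribute 0)
            win = x[max(i, 0):min(i + pool, h), max(j, 0):min(j + pool, w)]
            divisor = pool * pool if count_pad else max(win.size, 1)
            row.append(win.sum() / divisor)
        out.append(row)
    return np.array(out)

# e.g. avg_pool2d(np.arange(16.).reshape(4, 4), pad=1)
```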
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h.bak
new file mode 100644
index 0000000000..3e1ebb225d
--- /dev/null
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h.bak
@@ -0,0 +1,56 @@
+#ifndef NNET_RECR_ACTIVATION_H_
+#define NNET_RECR_ACTIVATION_H_
+
+#include "hls_stream.h"
+#include "nnet_activation.h"
+#include "nnet_common.h"
+#include "nnet_helpers.h"
+#include
+
+namespace nnet {
+
+namespace activation {
+
+template class Activation {
+  public:
+    // *************************************************
+    //       Blank Activation
+    // *************************************************
+    static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {} // Nothing to do here
+};
+
+template class relu : public Activation {
+  public:
+    // *************************************************
+    //       Relu Activation
+    // *************************************************
+    static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+        nnet::relu(data, res);
+    }
+};
+
+template class sigmoid : public Activation {
+  public:
+    // *************************************************
+    //       Sigmoid Activation
+    // *************************************************
+    static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+        nnet::sigmoid(data, res);
+    }
+};
+
+template class tanh : public Activation {
+  public:
+    // *************************************************
+    //       TanH Activation
+    // *************************************************
+    static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
+        nnet::tanh(data, res);
+    }
+};
+
+} // namespace activation
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h.bak
new file mode 100644
index 0000000000..5ccf2ee570
--- /dev/null
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h.bak
@@ -0,0 +1,586 @@
+#ifndef NNET_RECURSIVE_H_
+#define NNET_RECURSIVE_H_
+
+#include "hls_stream.h"
+#include "nnet_activation.h"
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include "nnet_recr_activations.h"
+
+namespace nnet {
+
+struct lstm_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 2;
+    static const unsigned n_parts = 20;
+    static const unsigned n_out = 2;
+    static const unsigned n_state = 2;
+    static const unsigned n_4state = 8;
+    static const unsigned table_size = 1024;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const unsigned n_zeros = 0;
+    static const bool store_weights_in_bram = false;
+    static const bool use_static = true;
+
+    template using activation_recr = nnet::activation::relu;
+    template using activation = nnet::activation::relu;
+};
+// Long Short-Term Memory NN (LSTM)
+// Resources:
+// https://github.com/nicodjimenez/lstm/blob/master/lstm.py
+// https://github.com/llSourcell/LSTM_Networks/blob/master/LSTM%20Demo.ipynb
+// https://en.wikipedia.org/wiki/Long_short-term_memory
+// Notes:
+//  - LSTM naming conventions adopted from the above links
+//      - s_newstate = activation(U*input + W*state)
+//      - h_output = activation(U*input + W*state)*activation(s_newstate)
+//  - If softmax is needed on the output, perform it *outside* these operations
+// Originally there was a version that allowed the state in each layer to be saved; this was moved above
+// (it requires a LARGE dense network at the end)
+template
+void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state],
+          res_T s_newstate[CONFIG_T::n_state], typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in],
+          typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state],
+          typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4],
+          typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) {
+    // Initialize the state variable -- will maintain state between function calls
+
+    typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4];
+    typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4];
+    typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3];   // activated i,f,o matrices (keras notation)
+    typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state];         // activated c-matrix (keras notation)
+    typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation)
+    typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state];       // c-matrix (keras notation)
+    typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state];
+
+    //#pragma HLS ARRAY_PARTITION variable=h_newstate complete
+    //#pragma HLS ARRAY_PARTITION variable=s_newstate complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_state complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_c complete
+    //#pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete
+    //#pragma HLS ARRAY_PARTITION variable=inputacc_c complete
+    //#pragma HLS ARRAY_PARTITION variable=s_actstate complete
+
+    nnet::dense(data, tmpres, param, param_b);
+    nnet::dense(h_newstate, tmpres_state, param_r, param_br);
+
+    for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        int index = iacc;
+        if (iacc > 2 * CONFIG_T::n_state - 1)
+            index = iacc + CONFIG_T::n_state;
+        inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index];
+    }
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        int index = iacc + CONFIG_T::n_state * 2;
+        inputacc_c[iacc] = tmpres[index] + tmpres_state[index];
+    }
+
+    CONFIG_T::template activation_recr::activation(
+        inputacc_ifo, tmpres_ifo);
+
+    // Candidate (cell-input) activation
+    CONFIG_T::template activation::activation(
+        inputacc_c, tmpres_c);
+
+    // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues)
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        s_newstate[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_newstate[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)];
+    }
+    // Operation: h=act(s)*o
+    CONFIG_T::template activation::activation(
+        s_newstate, s_actstate);
+
+    for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) {
+        //#pragma HLS UNROLL
+        h_newstate[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc];
+    }
+}
+
+template
+void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state],
+                 res_T s_newstate[CONFIG_T::n_state],
+                 typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in],
+                 typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state],
+                 typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4],
+                 typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) {
+    static res_T h_state[CONFIG_T::n_state];
+    static res_T s_state[CONFIG_T::n_state];
+    // Initialize the state variable -- will maintain state between function calls
+    typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4];
+    typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4];
+    typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3];   // activated i,f,o matrices (keras notation)
+    typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state];         // activated c-matrix (keras notation)
+    typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation)
+    typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state];       // c-matrix (keras notation)
+    typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state];
+
+    //#pragma HLS ARRAY_PARTITION variable=h_newstate complete
+    //#pragma HLS ARRAY_PARTITION variable=s_newstate complete
+    //#pragma HLS ARRAY_PARTITION variable=h_state complete
+    //#pragma HLS ARRAY_PARTITION variable=s_state complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_state complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_c complete
+    //#pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete
+    //#pragma HLS ARRAY_PARTITION variable=inputacc_c complete
+    //#pragma HLS ARRAY_PARTITION variable=s_actstate complete
+
+    if (reset_state) {
+        for (int i_state = 0; i_state < (CONFIG_T::n_state); i_state++) {
+            //#pragma HLS UNROLL
+            s_state[i_state] = 0;
+            h_state[i_state] = 0;
+        }
+    }
+
+    nnet::dense(data, tmpres, param, param_b);
+    nnet::dense(h_state, tmpres_state, param_r,
+                param_br);
+
+    for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        int index = iacc;
+        if (iacc > 2 * CONFIG_T::n_state - 1)
+            index = iacc + CONFIG_T::n_state;
+        inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index];
+    }
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        int index = iacc + CONFIG_T::n_state * 2;
+        inputacc_c[iacc] = tmpres[index] + tmpres_state[index];
+    }
+
+    CONFIG_T::template activation_recr::activation(
+        inputacc_ifo, tmpres_ifo);
+
+    // Candidate (cell-input) activation
+    CONFIG_T::template activation::activation(
+        inputacc_c, tmpres_c);
+
+    // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues)
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
//#pragma HLS UNROLL + s_state[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_state[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; + s_newstate[iacc] = s_state[iacc]; + } + // Operation: h=act(s)*o + CONFIG_T::template activation::activation( + s_state, s_actstate); + + for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { + //#pragma HLS UNROLL + h_state[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + h_newstate[iacc] = h_state[iacc]; + } +} + +template +void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + res_T h_newstate[CONFIG_T::n_state]; + res_T s_newstate[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + //#pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + //#pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::lstm_static(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + else + nnet::lstm(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + //#pragma HLS UNROLL + res[i] = h_newstate[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + //#pragma HLS UNROLL + res[i] = h_newstate[i]; + } +} + +template +void lstm_stack(hls::stream &data_stream, hls::stream &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + typename res_T::value_type s_newstate[CONFIG_T::n_state]; + //#pragma HLS ARRAY_PARTITION variable=h_newstate complete + //#pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + //#pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // //#pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + //#pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::lstm_static( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + else + nnet::lstm( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T 
res_pack;
+            PRAGMA_DATA_PACK(res_pack)
+        ResPack_sequences:
+            for (int i_pack = 0; i_pack < res_T::size; i_pack++) {
+                //#pragma HLS UNROLL
+                res_pack[i_pack] = h_newstate[i_pack];
+            }
+            res_stream.write(res_pack);
+        }
+        reset_state = false;
+    }
+
+    if (CONFIG_T::n_sequence_out == 1) {
+        res_T res_pack;
+        PRAGMA_DATA_PACK(res_pack)
+    ResPack:
+        for (int i_pack = 0; i_pack < res_T::size; i_pack++) {
+            //#pragma HLS UNROLL
+            res_pack[i_pack] = h_newstate[i_pack];
+        }
+        res_stream.write(res_pack);
+    }
+}
+
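For cross-checking the gate arithmetic in `lstm`/`lstm_static`, here is one step of the same recurrence in NumPy, assuming the Keras (i, f, c, o) kernel ordering, a sigmoid recurrent activation, and a tanh state activation (in the templates these come from `activation_recr`/`activation`); the dense weight layout is an assumption for illustration:

```
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(x, h, s, W, U, b, br, n):
    # W: (4n, n_in) input kernel, U: (4n, n) recurrent kernel -- assumed layout
    gates = (W @ x + b) + (U @ h + br)
    i = sigmoid(gates[0:n])          # input gate
    f = sigmoid(gates[n:2 * n])      # forget gate
    g = np.tanh(gates[2 * n:3 * n])  # candidate (cell-input) activation
    o = sigmoid(gates[3 * n:4 * n])  # output gate
    s_new = g * i + s * f            # matches "s = g*i + sold*f" above
    h_new = o * np.tanh(s_new)       # matches "h = act(s)*o" above
    return h_new, s_new
```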
+// Struct for the GRU template
+
+struct gru_config {
+    // Internal data type definitions
+    typedef float weight_t;
+    typedef float bias_t;
+    typedef float accum_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 2;
+    static const unsigned n_out = 2;
+    static const unsigned n_state = 2;
+    static const unsigned n_sequence = 2;
+    static const unsigned n_4state = 8;
+    static const unsigned table_size = 1024;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const bool use_static = true;
+    static const bool pytorch_order = false;
+    static const unsigned n_zeros = 0;
+
+    template using activation_recr = nnet::activation::relu;
+    template using activation = nnet::activation::relu;
+};
+
+template
+void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state],
+         typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], // TODO - Check the layout of the param
+                                                                                    // weights - refer page in copy!!
+         typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state],
+         typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3],
+         typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) {
+    // Initialize the state variable -- will maintain state between function calls
+    typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3];
+    typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3];
+    typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state];
+    typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2];   // activated z,r matrices (keras notation)
+    typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state];        // activated h-matrix (keras notation)
+    typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // z,r matrices (keras notation)
+    typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state];      // h-matrix (keras notation)
+
+    //#pragma HLS ARRAY_PARTITION variable=h_newstate complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_zr complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_h complete
+    //#pragma HLS ARRAY_PARTITION variable=inputacc_zr complete
+    //#pragma HLS ARRAY_PARTITION variable=inputacc_h complete
+
+    nnet::dense(data, tmpres, param, param_b);
+    nnet::dense(h_newstate, tmpres_state_zr, param_zr,
+                param_br);
+
+    // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres
+    // initialized with biases -- DONE
+    for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        int index = iacc;
+        inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index];
+    }
+
+    // Activation function Sub layer -- START
+    CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr);
+
+    // Activation function Sub layer -- END
+
+    // Hadamard product of r(t) = tmpres_zr[n_state:2*n_state] and h(t-1) = h_newstate
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        if (CONFIG_T::pytorch_order)
+            tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)];
+        else
+            tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)];
+    }
+
+    // Assuming reset_after is false
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        int index = iacc + CONFIG_T::n_state * 2;
+        inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc];
+    }
+
+    // Now run the activation on the candidate state
+    CONFIG_T::template activation::activation(inputacc_h, tmpres_h);
+
+    // Mix the state with the previous state
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        if (CONFIG_T::pytorch_order)
+            h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) +
+                                       h_newstate[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]);
+        else
+            h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_newstate[iacc] * tmpres_zr[iacc]);
+    }
+}
+
+template
+void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state],
+                typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in],
+                typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state],
+                typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3],
+                typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) {
+    // Initialize the state variable -- will maintain state between function calls
+
+    static res_T h_state[CONFIG_T::n_state];
+    typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3];
+    typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3];
+    typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state];
+    typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2];   // activated z,r matrices (keras notation)
+    typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state];        // activated h-matrix (keras notation)
+    typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // z,r matrices (keras notation)
+    typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state];      // h-matrix (keras notation)
+
+    //#pragma HLS ARRAY_PARTITION variable=h_state complete
+    //#pragma HLS ARRAY_PARTITION variable=h_newstate complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_zr complete
+    //#pragma HLS ARRAY_PARTITION variable=tmpres_h complete
+    //#pragma HLS ARRAY_PARTITION variable=inputacc_zr complete
+    //#pragma HLS ARRAY_PARTITION variable=inputacc_h complete
+
+    if (reset_state) {
+        for (int i_h_state = 0; i_h_state < (CONFIG_T::n_state); i_h_state++) {
+            //#pragma HLS UNROLL
+            h_state[i_h_state] = 0;
+        }
+    }
+
+    nnet::dense(data, tmpres, param, param_b);
+    nnet::dense(h_state, tmpres_state_zr, param_zr,
+                param_br);
+
+    // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres
+    // initialized with biases -- DONE
+    for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        int index = iacc;
+        inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index];
+    }
+
+    // Activation function Sub layer -- START
+    CONFIG_T::template activation_recr::activation(inputacc_zr,
+                                                   tmpres_zr);
+
+    // Activation function Sub layer -- END
+
+    // Hadamard product of r(t) = tmpres_zr[n_state:2*n_state] and h(t-1) = h_newstate
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        if (CONFIG_T::pytorch_order)
+            tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)];
+        else
+            tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)];
+    }
+
+    // Assuming reset_after is false
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        int index = iacc + CONFIG_T::n_state * 2;
+        inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc];
+    }
+
+    // Now run the activation on the candidate state
+    CONFIG_T::template activation::activation(inputacc_h, tmpres_h);
+
+    // Mix the state with the previous state
+    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
+        //#pragma HLS UNROLL
+        if (CONFIG_T::pytorch_order)
+            h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) +
+                                    h_state[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]);
+        else
+            h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_state[iacc] * tmpres_zr[iacc]);
+        h_newstate[iacc] = h_state[iacc];
+    }
+}
+
+template
+void gru_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state],
+               typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in],
+               typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state],
+               typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3],
+               typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) {
+
+    res_T h_state[CONFIG_T::n_state];
+    data_T data_in[CONFIG_T::n_in];
+    bool reset_state = true;
+
+    //#pragma HLS ARRAY_PARTITION variable=h_state complete
+    //#pragma HLS ARRAY_PARTITION variable=data_in complete
+
+    for (int ii = 0; ii < CONFIG_T::n_state; ii++) {
+        //#pragma HLS UNROLL
+        h_state[ii] = 0;
+    }
+    for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) {
+        for (int j = 0; j < CONFIG_T::n_in; j++) {
+            //#pragma HLS UNROLL
+            data_in[j] = data[j + iloop * CONFIG_T::n_in];
+        }
+        if (CONFIG_T::use_static)
+            nnet::gru_static(reset_state, data_in, h_state, param, param_zr, param_b, param_br);
+        else
+            nnet::gru(reset_state, data_in, h_state, param, param_zr, param_b, param_br);
+        if (CONFIG_T::n_sequence_out > 1)
+            for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) {
+                //#pragma HLS UNROLL
+                res[i] = h_state[j];
+            }
+        reset_state = false;
+    }
+    if (CONFIG_T::n_sequence_out == 1)
+        for (int i = 0; i < (CONFIG_T::n_state); i++) {
+            //#pragma HLS UNROLL
+            res[i] = h_state[i];
+        }
+}
+
+template
+void gru_stack(hls::stream &data_stream, hls::stream &res_stream,
+               typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in],
+               typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state],
+               typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3],
+               typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) {
+
+    typename res_T::value_type h_newstate[CONFIG_T::n_state];
+    //#pragma HLS ARRAY_PARTITION variable=h_newstate complete
+    for (int ii = 0; ii < CONFIG_T::n_state; ii++) {
+        //#pragma HLS UNROLL
+        h_newstate[ii] = 0;
+    }
+
+    typename data_T::value_type data_in[CONFIG_T::n_in];
+    bool reset_state = true;
+
+DataPropagation:
+    for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) {
+        if (CONFIG_T::n_sequence *
CONFIG_T::n_in / data_T::size > 1) {
+            // //#pragma HLS PIPELINE
+        }
+        data_T data_pack = data_stream.read();
+    DataPack:
+        for (int i_pack = 0; i_pack < data_T::size; i_pack++) {
+            //#pragma HLS UNROLL
+            data_in[i_pack] = data_pack[i_pack];
+        }
+        if (CONFIG_T::use_static)
+            nnet::gru_static(
+                reset_state, data_in, h_newstate, param, param_zr, param_b, param_br);
+        else
+            nnet::gru(reset_state, data_in, h_newstate,
+                      param, param_zr, param_b, param_br);
+        if (CONFIG_T::n_sequence_out > 1) {
+            res_T res_pack;
+            PRAGMA_DATA_PACK(res_pack)
+        ResPack_sequences:
+            for (int i_pack = 0; i_pack < res_T::size; i_pack++) {
+                //#pragma HLS UNROLL
+                res_pack[i_pack] = h_newstate[i_pack];
+            }
+            res_stream.write(res_pack);
+        }
+        reset_state = false;
+    }
+
+    if (CONFIG_T::n_sequence_out == 1) {
+        res_T res_pack;
+        PRAGMA_DATA_PACK(res_pack)
+    ResPack:
+        for (int i_pack = 0; i_pack < res_T::size; i_pack++) {
+            //#pragma HLS UNROLL
+            res_pack[i_pack] = h_newstate[i_pack];
+        }
+        res_stream.write(res_pack);
+    }
+}
+
+} // namespace nnet
+
+#endif
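The z/r/candidate bookkeeping in `gru`/`gru_static` reduces to the following NumPy step for the Keras gate order (z, r, h) with `reset_after` false; `pytorch_order` merely swaps which half of the z/r block plays each role. The weight layout is assumed for illustration:

```
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x, h, W, U, b, br, n):
    # W: (3n, n_in) input kernel, U: (3n, n) recurrent kernel -- assumed layout
    tx = W @ x + b   # corresponds to tmpres
    th = U @ h + br  # corresponds to tmpres_state_zr
    zr = sigmoid(tx[:2 * n] + th[:2 * n])
    z, r = zr[:n], zr[n:]                          # update and reset gates
    h_cand = np.tanh(tx[2 * n:] + r * th[2 * n:])  # candidate state
    return h_cand * (1.0 - z) + h * z              # mix with the previous state
```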
diff --git a/test/docker/README.md b/test/docker/README.md
new file mode 100644
index 0000000000..0446228a31
--- /dev/null
+++ b/test/docker/README.md
@@ -0,0 +1,72 @@
+# Docker image for hls4ml with Vivado
+
+Extract the Vivado installation archive here and provide the path to a license server.
+For example:
+
+```
+docker build --network=host -t hls4ml-with-vivado --build-arg LICENSE_SERVER="1234@myserver" .
+```
+
+By default, version 2018.2 of Vivado is used.
+
+## Using other versions of Vivado
+
+To install a specific version of Vivado, first generate the silent installation configuration file from the Vivado installation folder by running:
+
+```
+./xsetup -b configGen
+```
+
+Choose the products/devices that you would like to install by modifying the generated file. Name the file `install_config.txt` and place it in the directory with the `Dockerfile`. Edit the `Dockerfile` to add the folder of your Vivado installation and build the image using the command provided above.
+
+## Using the created image
+
+The image can be used in a standard way, e.g., with the [`docker run`](https://docs.docker.com/engine/reference/commandline/run/) command:
+
+```
+docker run -it --rm hls4ml-with-vivado
+```
+
+Alternatively, a reusable container can be created and later accessed with the [`docker exec`](https://docs.docker.com/engine/reference/commandline/exec/) command:
+
+```
+docker run -dit --name my-hls4ml-container hls4ml-with-vivado
+docker exec -it my-hls4ml-container bash
+```
+
+We recommend using Docker volumes to mount the local filesystem into the container in order to access files on the host from within the image:
+
+```
+docker run -it --rm -v /path/on/host:/home/hls4ml/path/in/container hls4ml-with-vivado
+```
+
+Consult the Docker [documentation](https://docs.docker.com/storage/volumes/) for more information about volumes.
+
+## GUI support
+
+By default, the image is built without the X11 libraries needed to launch the Vivado HLS GUI. To add GUI support, pass `--build-arg GUI_SUPPORT=1` to the build command. For example:
+
+```
+docker build --network=host -t hls4ml-with-vivado --build-arg LICENSE_SERVER="1234@myserver" --build-arg GUI_SUPPORT=1 .
+```
+
+To launch GUI apps in the Docker container, map `/tmp/.X11-unix` and the `DISPLAY` environment variable from the host to the container, e.g.,
+
+```
+docker run -it -e DISPLAY -v /tmp/.X11-unix:/tmp/.X11-unix hls4ml-with-vivado
+```
+
+If your X11 session requires a valid user, the `Xauthority` file must be mapped into the container. This file is either in the user's home directory (`$HOME/.Xauthority`) or its location is specified in the `XAUTHORITY` environment variable. For example:
+
+```
+docker run -it -e DISPLAY -v /tmp/.X11-unix:/tmp/.X11-unix -v $HOME/.Xauthority:/home/hls4ml/.Xauthority hls4ml-with-vivado
+```
+
+## Customizing the default user
+
+The default user (named *hls4ml*) can have its *id* and *group* changed to match a specific user on the host machine with the `USER_ID` and `GROUP_ID` build arguments. This is useful if you want to add a shared volume. For example:
+
+```
+docker build --network=host -t hls4ml-with-vivado --build-arg LICENSE_SERVER="1234@myserver" --build-arg USER_ID=`id -u` --build-arg GROUP_ID=`id -g` .
+```
+
diff --git a/test/docker/install_config-2017.2.txt b/test/docker/install_config-2017.2.txt
new file mode 100644
index 0000000000..58e7efc258
--- /dev/null
+++ b/test/docker/install_config-2017.2.txt
@@ -0,0 +1,28 @@
+#### Vivado HL Design Edition Install Configuration ####
+Edition=Vivado HL Design Edition
+
+# Path where Xilinx software will be installed.
+Destination=/opt/Xilinx
+
+# Choose the Products/Devices that you would like to install.
+Modules=Zynq UltraScale+ MPSoC:0,Software Development Kit (SDK):0,DocNav:0,Kintex UltraScale:1,Engineering Sample Devices:0,Kintex-7:1,Virtex UltraScale+:0,Zynq-7000:0,Kintex UltraScale+ ES:0,Kintex UltraScale+:0,Spartan-7:0,Zynq UltraScale+ RFSoC ES:0,Virtex-7:1,Virtex UltraScale:1,Virtex UltraScale+ ES:0,Zynq UltraScale+ MPSoC ES:0,Artix-7:0
+
+# Choose the post install scripts you'd like to run as part of the finalization step. Please note that some of these scripts may require user interaction during runtime.
+InstallOptions=Acquire or Manage a License Key:0,Enable WebTalk for SDK to send usage statistics to Xilinx:1,Enable WebTalk for Vivado to send usage statistics to Xilinx (Always enabled for WebPACK license):1
+
+## Shortcuts and File associations ##
+# Choose whether Start menu/Application menu shortcuts will be created or not.
+CreateProgramGroupShortcuts=1
+
+# Choose the name of the Start menu/Application menu shortcut. This setting will be ignored if you choose NOT to create shortcuts.
+ProgramGroupFolder=Xilinx Design Tools
+
+# Choose whether shortcuts will be created for All users or just the Current user. Shortcuts can be created for all users only if you run the installer as administrator.
+CreateShortcutsForAllUsers=0
+
+# Choose whether shortcuts will be created on the desktop or not.
+CreateDesktopShortcuts=0
+
+# Choose whether file associations will be created or not.
+CreateFileAssociation=0
+
diff --git a/test/docker/install_config.txt b/test/docker/install_config.txt
new file mode 100644
index 0000000000..43e2e085b6
--- /dev/null
+++ b/test/docker/install_config.txt
@@ -0,0 +1,28 @@
+#### Vivado HL Design Edition Install Configuration ####
+Edition=Vivado HL Design Edition
+
+# Path where Xilinx software will be installed.
+Destination=/opt/Xilinx
+
+# Choose the Products/Devices that you would like to install.
+Modules=Zynq UltraScale+ MPSoC:0,DocNav:0,Kintex-7:1,Virtex UltraScale+:0,Virtex UltraScale+ HBM ES:0,Zynq-7000:0,Kintex UltraScale+:0,Model Composer:0,ARM Cortex-A53:0,Spartan-7:0,Zynq UltraScale+ RFSoC ES:0,Engineering Sample Devices:0,Kintex UltraScale:1,Virtex UltraScale:1,SDK Core Tools:1,Zynq UltraScale+ RFSoC:0,ARM Cortex-A9:0,ARM Cortex R5:0,Virtex-7:1,Virtex UltraScale+ 58G ES:0,Zynq UltraScale+ MPSoC ES:0,MicroBlaze:0,Artix-7:0
+
+# Choose the post install scripts you'd like to run as part of the finalization step. Please note that some of these scripts may require user interaction during runtime.
+InstallOptions=Acquire or Manage a License Key:0,Enable WebTalk for SDK to send usage statistics to Xilinx:1,Enable WebTalk for Vivado to send usage statistics to Xilinx (Always enabled for WebPACK license):1
+
+## Shortcuts and File associations ##
+# Choose whether Start menu/Application menu shortcuts will be created or not.
+CreateProgramGroupShortcuts=1
+
+# Choose the name of the Start menu/Application menu shortcut. This setting will be ignored if you choose NOT to create shortcuts.
+ProgramGroupFolder=Xilinx Design Tools
+
+# Choose whether shortcuts will be created for All users or just the Current user. Shortcuts can be created for all users only if you run the installer as administrator.
+CreateShortcutsForAllUsers=0
+
+# Choose whether shortcuts will be created on the desktop or not.
+CreateDesktopShortcuts=0
+
+# Choose whether file associations will be created or not.
+CreateFileAssociation=0
+
diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py
new file mode 100644
index 0000000000..58fcfeb7f4
--- /dev/null
+++ b/test/pytest/test_cnn_mnist.py
@@ -0,0 +1,93 @@
+from pathlib import Path
+
+import numpy as np
+import pytest
+from sklearn.metrics import accuracy_score
+from tensorflow.keras.datasets import mnist
+from tensorflow.keras.layers import Activation, AveragePooling2D, Conv2D, Dense, Flatten, MaxPooling2D
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.utils import to_categorical
+
+import hls4ml
+
+test_root_path = Path(__file__).parent
+
+
+@pytest.fixture(scope='module')
+def mnist_data():
+    (x_train, y_train), (x_test, y_test) = mnist.load_data()
+    x_train = x_train.astype("float32") / 255.0
+    x_test = x_test.astype("float32") / 255.0
+    x_train = np.expand_dims(x_train, -1)
+    x_test = np.expand_dims(x_test, -1)
+    y_train = to_categorical(y_train, 10)
+    y_test = to_categorical(y_test, 10)
+    x_test, y_test = x_test[:1000], y_test[:1000]
+    return x_train, y_train, x_test, y_test
+
+
+@pytest.fixture(scope='module')
+def keras_model(mnist_data):
+    # The aim of this model is to test different CNN parameters, including:
+    # - the common filter sizes, 3x3 and 5x5
+    # - a non-power-of-2 number of filters
+    # - both average and max pooling
+    # - both same and valid padding
+    x_train, y_train, x_test, y_test = mnist_data
+    keras_model = Sequential()
+    keras_model.add(Conv2D(4, (3, 3), input_shape=(28, 28, 1), padding='same'))
+    keras_model.add(Activation('relu'))
+    keras_model.add(MaxPooling2D())
+    keras_model.add(Conv2D(6, (5, 5), padding='valid'))
+    keras_model.add(Activation('relu'))
+    keras_model.add(AveragePooling2D())
+    keras_model.add(Flatten())
+    keras_model.add(Dense(10, kernel_initializer='lecun_uniform'))
+    keras_model.add(Activation('softmax', name='softmax'))
+    keras_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
+    keras_model.fit(x_train, y_train, batch_size=32, epochs=5, verbose=0)
+    return keras_model
+
+
+@pytest.mark.parametrize(
+    'backend,io_type,strategy',
+    [
+        ('Quartus', 'io_parallel', 'resource'),
+        ('Quartus', 'io_stream', 'resource'),
+        ('Vivado', 'io_parallel', 'resource'),
+        ('Vivado', 'io_parallel', 'latency'),
+        ('Vivado', 'io_stream', 'latency'),
+        ('Vivado', 'io_stream', 'resource'),
+        ('Vitis', 'io_parallel', 'resource'),
+        ('Vitis', 'io_parallel', 'latency'),
+        ('Vitis', 'io_stream', 'latency'),
+        ('Vitis', 'io_stream', 'resource'),
+    ],
+)
+def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy):
+    x_train, y_train, x_test, y_test = mnist_data
+
+    hls_config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name', backend=backend)
+    hls_config['Model']['Strategy'] = strategy
+    hls_config['LayerName']['softmax']['Implementation'] = 'stable'
+    output_dir = str(test_root_path / f'hls4mlprj_cnn_mnist_{backend}_{io_type}_{strategy}')
+
+    hls_model = hls4ml.converters.convert_from_keras_model(
+        keras_model, hls_config=hls_config, output_dir=output_dir, backend=backend, io_type=io_type
+    )
+    hls_model.compile()
+
+    # Model-under-test predictions and accuracy
+    y_keras = keras_model.predict(x_test)
+    y_hls4ml = hls_model.predict(x_test)
+
+    acc_keras = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_keras, axis=1))
+    acc_hls4ml = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_hls4ml, axis=1))
+    rel_diff = abs(acc_keras - acc_hls4ml) / acc_keras
+
+    print(f'Accuracy keras:  {acc_keras}')
+    print(f'Accuracy hls4ml: {acc_hls4ml}')
+    print(f'Relative difference: {rel_diff}')
+
+    assert acc_keras > 0.95 and rel_diff < 0.03
+
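The accuracy check above distills to a small idiom that is also handy when debugging a conversion by hand; a standalone sketch with illustrative names:

```
import numpy as np

def relative_accuracy_gap(y_ref, y_test, labels):
    """Relative accuracy difference between two sets of class scores."""
    acc_ref = np.mean(np.argmax(y_ref, axis=1) == labels)
    acc_test = np.mean(np.argmax(y_test, axis=1) == labels)
    return abs(acc_ref - acc_test) / acc_ref

# e.g. assert relative_accuracy_gap(y_keras, y_hls4ml, np.argmax(y_test, axis=1)) < 0.03
```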
From de79bb91fff589188c22bb11e34dd5703e2688b4 Mon Sep 17 00:00:00 2001
From: LostEcho365
Date: Sun, 12 Nov 2023 12:27:57 -0800
Subject: [PATCH 32/55] updated on hls4ml transformer

---
 hls4ml/converters/keras/qkeras_layers.py      |  2 +-
 .../nnet_utils/nnet_multiheadattention.h      | 36 ++++++++++++++++---
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/hls4ml/converters/keras/qkeras_layers.py b/hls4ml/converters/keras/qkeras_layers.py
index 6a3e3e062e..cd08f6d1bb 100644
--- a/hls4ml/converters/keras/qkeras_layers.py
+++ b/hls4ml/converters/keras/qkeras_layers.py
@@ -101,7 +101,7 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader,
         if activation_config['class_name'] == 'quantized_bits':
             activation_config['class_name'] = 'linear'
         if activation_config['class_name'] == 'quantized_softmax':
-            # activation_config['class_name'] = 'softmax'
+            # activation_config['class_name'] = 'Softmax'
             layer['class_name'] = 'Softmax'
             layer['axis'] = keras_layer['config'].get('axis', -1)
         layer['activation'] = activation_config['class_name'].replace('quantized_', '')
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
index 4c42c69b67..72d90dfd0a 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
@@ -16,6 +16,7 @@ struct multiheadattention_config {
     typedef float bias_t;
     typedef float weight_t;
     typedef float accum_t;
+    typedef ap_fixed<16, 8> multi_t;

     // Layer Sizes
     static const unsigned num_heads = 10;
@@ -33,7 +34,12 @@ struct multiheadattention_config {
     template using product = nnet::product::mult;
 };
+
 template struct datapack { data_T data[PackSize]; };
+template
+struct datapack {
+    data_T data[PackSize];
+};

 template void read_stream_array(hls::stream data_in[size], data_T out[size]) {
     for (int k = 0; k < size; ++k) {
@@ -42,10 +48,32 @@ template void read_stream_array(hls::stream dat
         }
 }

-template
-void matrixmul_transpose(hls::stream> &Q,
-                         hls::stream> &K,
-                         res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K
+//////////////////
+//Dennis version//
+//////////////////
+//template
+//struct datapack {
+//    typename CONFIG_T::multi_t data[PackSize];
+//};
+//
+//template
+//void read_stream_array(
+//    hls::stream data_in[size],
+//    typename CONFIG_T::multi_t out[size]
+//)
+//{
+//    for (int k=0; k
+template
+void matrixmul_transpose(
+    hls::stream> &Q,
+    hls::stream> &K,
+    res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K
 {
     const data_T dk = 1.0 / sqrt(CONFIG_T::head_dim_key);
     data_T QK_1;
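`matrixmul_transpose`, whose signature is being reshaped in this hunk, accumulates the scaled dot-product attention scores QK^T / sqrt(d_k) (the `dk` factor above). The same quantity in NumPy, with (seq_len, head_dim) shapes assumed for illustration:

```
import numpy as np

def attention_scores(Q, K):
    # QK^T scaled by 1/sqrt(d_k), as in the dk factor above
    return (Q @ K.T) / np.sqrt(K.shape[-1])
```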
From 6c23326b43a425900f3dec020dd26145f277609b Mon Sep 17 00:00:00 2001
From: Jan-Frederik Schulte
Date: Fri, 13 Sep 2024 16:16:11 -0400
Subject: [PATCH 33/55] trying to clean the diff

---
 .idea/hls4ml.iml                              |  14 -
 .idea/misc.xml                                |   7 -
 .idea/workspace.xml                           | 164 ----
 contrib/garnet.py                             |   2 +-
 docs/status.rst                               |  44 +-
 hls4ml/.idea/.gitignore                       |   3 -
 hls4ml/.idea/hls4ml.iml                       |  12 -
 .../inspectionProfiles/profiles_settings.xml  |   6 -
 hls4ml/.idea/misc.xml                         |  10 -
 hls4ml/.idea/modules.xml                      |   8 -
 hls4ml/.idea/vcs.xml                          |   6 -
 hls4ml/converters/keras/qkeras_layers.py      | 143 ---
 hls4ml/converters/tf_to_hls.py                |   0
 .../model/optimizer/passes/precision_merge.py |   0
 .../templates/quartus/firmware/defines.h.bak  |  47 -
 .../quartus/firmware/myproject.cpp.bak        |  48 --
 .../quartus/firmware/myproject.h.bak          |  48 --
 .../firmware/nnet_utils/nnet_batchnorm.h.bak  | 104 ---
 .../firmware/nnet_utils/nnet_common.h.bak     |  71 --
 .../firmware/nnet_utils/nnet_conv1d.h.bak     |  64 --
 .../firmware/nnet_utils/nnet_dense.h.bak      | 169 ----
 .../nnet_utils/nnet_dense_compressed.h.bak    |  80 --
 .../firmware/nnet_utils/nnet_helpers.h.bak    | 140 ---
 .../firmware/nnet_utils/nnet_merge.h.bak      | 249 ------
 .../firmware/nnet_utils/nnet_mult.h.bak       | 113 ---
 .../firmware/nnet_utils/nnet_padding.h.bak    |  99 ---
 .../quartus/myproject_test_parallel.cpp.bak   | 112 ---
 .../vivado/firmware/myproject.cpp.bak         |  23 -
 .../templates/vivado/firmware/myproject.h.bak |  19 -
 .../templates/vivado/myproject_test.cpp.bak   |  94 --
 .../vivado/nnet_utils/nnet_activation.h.bak   | 795 -----------------
 .../vivado/nnet_utils/nnet_array.h.bak        |  52 --
 .../vivado/nnet_utils/nnet_batchnorm.h.bak    | 124 ---
 .../nnet_utils/nnet_batchnorm_stream.h.bak    | 123 ---
 .../vivado/nnet_utils/nnet_common.h.bak       |  75 --
 .../vivado/nnet_utils/nnet_conv1d.h.bak       |  66 --
 .../nnet_utils/nnet_conv1d_stream.h.bak       |  89 --
 .../vivado/nnet_utils/nnet_conv2d.h.bak       |  75 --
 .../nnet_utils/nnet_conv2d_latency.h.bak      |  89 --
 .../vivado/nnet_utils/nnet_dense.h.bak        |  60 --
 .../nnet_utils/nnet_dense_compressed.h.bak    |  90 --
 .../nnet_utils/nnet_dense_latency.h.bak       |  72 --
 .../nnet_utils/nnet_dense_resource.h.bak      | 263 ------
 .../vivado/nnet_utils/nnet_dense_seq.h.bak    |  44 -
 .../vivado/nnet_utils/nnet_garnet.h.bak       | 816 ------------------
 .../vivado/nnet_utils/nnet_helpers.h.bak      | 382 --------
 .../vivado/nnet_utils/nnet_layernorm.h.bak    | 404 ---------
 .../vivado/nnet_utils/nnet_merge.h.bak        | 256 ------
 .../vivado/nnet_utils/nnet_merge_stream.h.bak | 370 --------
 .../vivado/nnet_utils/nnet_mult.h.bak         | 116 ---
 .../nnet_utils/nnet_multiheadattention.h.bak  | 337 --------
 .../vivado/nnet_utils/nnet_padding.h.bak      | 145 ----
 .../vivado/nnet_utils/nnet_pooling.h.bak      | 313 -------
 .../nnet_utils/nnet_recr_activations.h.bak    |  56 --
 .../vivado/nnet_utils/nnet_recurrent.h.bak    | 586 ------------
 test/docker/README.md                         |  72 -
 test/docker/install_config-2017.2.txt         |  28 -
 test/docker/install_config.txt                |  28 -
 test/pytest/test_cnn_mnist.py                 |  93 --
 59 files changed, 2 insertions(+), 7916 deletions(-)
 delete mode 100644 .idea/hls4ml.iml
 delete mode 100644 .idea/misc.xml
 delete mode 100644 .idea/workspace.xml
 delete mode 100644 hls4ml/.idea/.gitignore
 delete mode 100644 hls4ml/.idea/hls4ml.iml
 delete mode 100644 hls4ml/.idea/inspectionProfiles/profiles_settings.xml
 delete mode 100644 hls4ml/.idea/misc.xml
 delete mode 100644 hls4ml/.idea/modules.xml
 delete mode 100644 hls4ml/.idea/vcs.xml
 delete mode 100644 hls4ml/converters/keras/qkeras_layers.py
 delete mode 100644 hls4ml/converters/tf_to_hls.py
 delete mode 100644 hls4ml/model/optimizer/passes/precision_merge.py
 delete mode 100644 hls4ml/templates/quartus/firmware/defines.h.bak
 delete mode 100644 hls4ml/templates/quartus/firmware/myproject.cpp.bak
 delete mode 100644 hls4ml/templates/quartus/firmware/myproject.h.bak
 delete mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h.bak
 delete mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h.bak
 delete mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h.bak
 delete mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h.bak
 delete mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h.bak
 delete mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h.bak
 delete mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h.bak
 delete mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h.bak
 delete mode 100644 hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h.bak
 delete mode 100644 hls4ml/templates/quartus/myproject_test_parallel.cpp.bak
 delete mode 100644 hls4ml/templates/vivado/firmware/myproject.cpp.bak
 delete mode 100644 hls4ml/templates/vivado/firmware/myproject.h.bak
 delete mode 100644 hls4ml/templates/vivado/myproject_test.cpp.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_activation.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_array.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_common.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_dense.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_garnet.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_helpers.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_merge.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h.bak
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_mult.h.bak
 delete mode 100644 
hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h.bak delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_padding.h.bak delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_pooling.h.bak delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h.bak delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h.bak delete mode 100644 test/docker/README.md delete mode 100644 test/docker/install_config-2017.2.txt delete mode 100644 test/docker/install_config.txt delete mode 100644 test/pytest/test_cnn_mnist.py diff --git a/.idea/hls4ml.iml b/.idea/hls4ml.iml deleted file mode 100644 index 57be99f6ff..0000000000 --- a/.idea/hls4ml.iml +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 3295bcdab3..0000000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/workspace.xml b/.idea/workspace.xml deleted file mode 100644 index e7025ec2d0..0000000000 --- a/.idea/workspace.xml +++ /dev/null @@ -1,164 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1684281161699 - - - - \ No newline at end of file diff --git a/contrib/garnet.py b/contrib/garnet.py index 4d8b9096c9..075819e9df 100644 --- a/contrib/garnet.py +++ b/contrib/garnet.py @@ -322,7 +322,7 @@ def _setup_transforms(self, n_aggregators, n_filters, n_propagate): else: input_feature_transform = NamedDense(p, name=('FLR%d' % it)) output_feature_transform = NamedDense(f, name=('Fout%d' % it)) - # output_activation_transform = keras.layers.Activation(self._output_activation) + output_activation_transform = keras.layers.Activation(self._output_activation) aggregator_distance = NamedDense(a, name=('S%d' % it)) diff --git a/docs/status.rst b/docs/status.rst index 44881c2fb3..dc3a6d8f18 100644 --- a/docs/status.rst +++ b/docs/status.rst @@ -4,28 +4,19 @@ Status and Features Status ====== -====== The latest version (built from ``main``) is |version|. The stable version (released on PyPI) is |release|. See the :ref:`Release Notes` section for a changelog. -The latest version (built from ``main``) is |version|. -The stable version (released on PyPI) is |release|. -See the :ref:`Release Notes` section for a changelog. Features ======== -A list of supported ML frameworks, HLS backends, and neural network architectures, including a summary table is below. Dependencies are given in the :doc:`Setup ` page. A list of supported ML frameworks, HLS backends, and neural network architectures, including a summary table is below. Dependencies are given in the :doc:`Setup ` page. -ML framework support: ML framework support: -* (Q)Keras -* PyTorch (limited) -* (Q)ONNX (in development) * (Q)Keras * PyTorch (limited) * (Q)ONNX (in development) @@ -39,16 +30,6 @@ Neural network architectures: HLS backends: -* Vivado HLS -* Intel HLS -* Vitis HLS (experimental) -* Fully connected NN (multilayer perceptron, MLP) -* Convolutional NN -* Recurrent NN (LSTM) -* Graph NN (GarNet) - -HLS backends: - * Vivado HLS * Intel HLS * Vitis HLS (experimental) @@ -58,8 +39,6 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below. .. 
list-table:: :header-rows: 1 - * - ML framework/HLS backend - - (Q)Keras * - ML framework/HLS backend - (Q)Keras - PyTorch @@ -67,25 +46,16 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below. - Vivado HLS - Intel HLS - Vitis HLS - - (Q)ONNX - - Vivado HLS - - Intel HLS - - Vitis HLS * - MLP - ``supported`` - ``limited`` - ``in development`` - - ``limited`` - - ``in development`` - ``supported`` - ``supported`` - ``experimental`` - * - CNN - - ``experimental`` * - CNN - ``supported`` - ``limited`` - - ``limited`` - ``in development`` - ``supported`` - ``supported`` @@ -110,16 +80,6 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below. - ``N/A`` - ``N/A`` - ``N/A`` - - ``supported`` - - ``supported`` - - ``N/A`` - * - GNN (GarNet) - - ``supported`` - - ``N/A`` - - ``N/A`` - - ``N/A`` - - ``N/A`` - - ``N/A`` Other feature notes: @@ -127,7 +87,7 @@ Other feature notes: * ``hls4ml`` is tested on Linux, and supports * Vivado HLS versions 2018.2 to 2020.1 * Intel HLS versions 20.1 to 21.4 - * Vitis HLS versions 2020.2 to 2022.2 (experimentally) + * Vitis HLS versions 2022.2 to 2024.1 * Windows and macOS are not supported * BDT support has moved to the `Conifer `__ package @@ -136,5 +96,3 @@ Example Models We also provide and document several example ``hls4ml`` models in `this GitHub repository `_, which is included as a submodule. You can check it out by doing ``git submodule update --init --recursive`` from the top level directory of ``hls4ml``. -We also provide and document several example ``hls4ml`` models in `this GitHub repository `_, which is included as a submodule. -You can check it out by doing ``git submodule update --init --recursive`` from the top level directory of ``hls4ml``. diff --git a/hls4ml/.idea/.gitignore b/hls4ml/.idea/.gitignore deleted file mode 100644 index eaf91e2ac6..0000000000 --- a/hls4ml/.idea/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml diff --git a/hls4ml/.idea/hls4ml.iml b/hls4ml/.idea/hls4ml.iml deleted file mode 100644 index 435d23406d..0000000000 --- a/hls4ml/.idea/hls4ml.iml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - \ No newline at end of file diff --git a/hls4ml/.idea/inspectionProfiles/profiles_settings.xml b/hls4ml/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2da2d..0000000000 --- a/hls4ml/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/hls4ml/.idea/misc.xml b/hls4ml/.idea/misc.xml deleted file mode 100644 index c6af3c0bae..0000000000 --- a/hls4ml/.idea/misc.xml +++ /dev/null @@ -1,10 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/hls4ml/.idea/modules.xml b/hls4ml/.idea/modules.xml deleted file mode 100644 index 7cbe9d42dd..0000000000 --- a/hls4ml/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/hls4ml/.idea/vcs.xml b/hls4ml/.idea/vcs.xml deleted file mode 100644 index 2e3f6920d0..0000000000 --- a/hls4ml/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/hls4ml/converters/keras/qkeras_layers.py b/hls4ml/converters/keras/qkeras_layers.py deleted file mode 100644 index cd08f6d1bb..0000000000 --- a/hls4ml/converters/keras/qkeras_layers.py +++ /dev/null @@ -1,143 +0,0 @@ -from hls4ml.converters.keras_to_hls import parse_default_keras_layer -from hls4ml.converters.keras_to_hls import keras_handler - -from 
hls4ml.converters.keras.core import parse_dense_layer -from hls4ml.converters.keras.core import parse_batchnorm_layer, parse_layernorm_layer -from hls4ml.converters.keras.convolution import parse_conv1d_layer -from hls4ml.converters.keras.convolution import parse_conv2d_layer -from hls4ml.converters.keras.qkeras import * -from hls4ml.converters.keras.multiheadattention import parse_mutiheadattention_layer - -import tensorflow as tf - - -@keras_handler('QDense') -def parse_qdense_layer(keras_layer, input_names, input_shapes, data_reader, config): - layer, output_shape = parse_dense_layer(keras_layer, input_names, input_shapes, data_reader, config) - - layer['weight_quantizer'] = get_quantizer_from_config(keras_layer, 'kernel') - if keras_layer['config']['bias_quantizer'] is not None: - layer['bias_quantizer'] = get_quantizer_from_config(keras_layer, 'bias') - else: - layer['bias_quantizer'] = None - - return layer, output_shape - - -@keras_handler('QMultiHeadAttention') -def parse_qmultiheadattention_layer(keras_layer, input_names, input_shapes, data_reader, config): - assert('QMultiHeadAttention' in keras_layer['class_name']) - assert (input_shapes[0] == keras_layer['config']['query_shape']) - - layer, output_shape = parse_mutiheadattention_layer(keras_layer, input_names, input_shapes, data_reader, config) - - layer['weight_quantizer'] = get_quantizer_from_config(keras_layer, 'kernel') - if keras_layer['config']['bias_quantizer'] is not None: - layer['bias_quantizer'] = get_quantizer_from_config(keras_layer, 'bias') - else: - layer['bias_quantizer'] = None - - return layer, output_shape - -@keras_handler('QConv1D', 'QConv2D') -def parse_qconv_layer(keras_layer, input_names, input_shapes, data_reader, config): - assert ('QConv' in keras_layer['class_name']) - - if '1D' in keras_layer['class_name']: - layer, output_shape = parse_conv1d_layer(keras_layer, input_names, input_shapes, data_reader, config) - elif '2D' in keras_layer['class_name']: - layer, output_shape = parse_conv2d_layer(keras_layer, input_names, input_shapes, data_reader, config) - - layer['weight_quantizer'] = get_quantizer_from_config(keras_layer, 'kernel') - if keras_layer['config']['bias_quantizer'] is not None: - layer['bias_quantizer'] = get_quantizer_from_config(keras_layer, 'bias') - else: - layer['bias_quantizer'] = None - - return layer, output_shape - - -@keras_handler('QActivation') -def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader, config): - assert (keras_layer['class_name'] == 'QActivation') - supported_activations = ['quantized_relu', 'quantized_tanh', 'binary_tanh', 'ternary_tanh', 'quantized_bits', - 'quantized_softmax', 'binary', 'ternary'] - - layer = parse_default_keras_layer(keras_layer, input_names) - - activation_config = keras_layer['config']['activation'] - quantizer_obj = get_quantizer(activation_config) - activation_config = {} - # some activations are classes - if hasattr(quantizer_obj, 'get_config'): - activation_config['class_name'] = quantizer_obj.__class__.__name__ - if activation_config['class_name'] == 'ternary' or activation_config['class_name'] == 'binary': - activation_config['class_name'] += '_tanh' - activation_config['config'] = quantizer_obj.get_config() - # some activation quantizers are just functions with no config - else: - activation_config['config'] = {} - if 'binary' in quantizer_obj.__name__: - activation_config['class_name'] = 'binary_tanh' - activation_config['config']['bits'] = 1 - activation_config['config']['integer'] = 1 - elif 'ternary' in 
quantizer_obj.__name__: - activation_config['class_name'] = 'ternary_tanh' - activation_config['config']['bits'] = 2 - activation_config['config']['integer'] = 2 - else: - activation_config['class_name'] = 'unknown' - - if activation_config['class_name'] not in supported_activations: - raise Exception('Unsupported QKeras activation: {}'.format(activation_config['class_name'])) - - if activation_config['class_name'] == 'ternary_tanh': - layer['class_name'] = 'TernaryTanh' - layer['threshold'] = activation_config.get('config', {}).get('threshold', 0.33) - if layer['threshold'] is None: - layer['threshold'] = 0.33 # the default ternary tanh threshold for QKeras - else: - layer['class_name'] = 'Activation' - if activation_config['class_name'] == 'quantized_bits': - activation_config['class_name'] = 'linear' - if activation_config['class_name'] == 'quantized_softmax': - # activation_config['class_name'] = 'Softmax' - layer['class_name'] = 'Softmax' - layer['axis'] = keras_layer['config'].get('axis', -1) - layer['activation'] = activation_config['class_name'].replace('quantized_', '') - return layer, [shape for shape in input_shapes[0]] - - -@keras_handler('QBatchNormalization') -def parse_qbatchnorm_layer(keras_layer, input_names, input_shapes, data_reader, config): - layer, output_shape = parse_batchnorm_layer(keras_layer, input_names, input_shapes, data_reader, config) - - layer['mean_quantizer'] = get_quantizer_from_config(keras_layer, 'mean') - layer['variance_quantizer'] = get_quantizer_from_config(keras_layer, 'variance') - layer['beta_quantizer'] = get_quantizer_from_config(keras_layer, 'beta') - layer['gamma_quantizer'] = get_quantizer_from_config(keras_layer, 'gamma') - - return layer, output_shape - - -@keras_handler('QLayerNormalization') -def parse_qlayernorm_layer(keras_layer, input_names, input_shapes, data_reader, config): - layer, output_shape = parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader, config) - - layer['mean_quantizer'] = get_quantizer_from_config(keras_layer, 'mean') - layer['variance_quantizer'] = get_quantizer_from_config(keras_layer, 'variance') - layer['beta_quantizer'] = get_quantizer_from_config(keras_layer, 'beta') - layer['gamma_quantizer'] = get_quantizer_from_config(keras_layer, 'gamma') - - return layer, output_shape - - -@keras_handler('QConv2DBatchnorm') -def parse_qconv2dbatchnorm_layer(keras_layer, input_names, input_shapes, data_reader, config): - intermediate_shape = list() - conv_layer, shape_qconv = parse_qconv_layer(keras_layer, input_names, input_shapes, data_reader, config) - intermediate_shape.append(shape_qconv) - temp_shape = intermediate_shape - batch_layer, out_shape = parse_batchnorm_layer(keras_layer, input_names, temp_shape, data_reader, config) - return {**conv_layer, **batch_layer}, out_shape - diff --git a/hls4ml/converters/tf_to_hls.py b/hls4ml/converters/tf_to_hls.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/hls4ml/model/optimizer/passes/precision_merge.py b/hls4ml/model/optimizer/passes/precision_merge.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/hls4ml/templates/quartus/firmware/defines.h.bak b/hls4ml/templates/quartus/firmware/defines.h.bak deleted file mode 100644 index 49781dc963..0000000000 --- a/hls4ml/templates/quartus/firmware/defines.h.bak +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef DEFINES_H_ -#define DEFINES_H_ - -/* - * Intel HLS makes use of three streaming interfaces: - * (1) stream_in - used as the main input to a component - * (2) stream_out - 
used as the main output of a component
- * (3) stream - allows both reading and writing; used for inter-component connections
- * ihc::stream has an implicitly deleted constructor and therefore, cannot be used as the output of a function/component
- * Therefore, variables of type 'stream' are always passed by reference
- */
-
-#ifndef __INTELFPGA_COMPILER__
-
-#include "ac_fixed.h"
-#include "ac_int.h"
-#define hls_register
-
-#include "stream.h"
-template <typename T> using stream = nnet::stream<T>;
-template <typename T> using stream_in = nnet::stream<T>;
-template <typename T> using stream_out = nnet::stream<T>;
-
-#else
-
-#include "HLS/ac_fixed.h"
-#include "HLS/ac_int.h"
-#include "HLS/hls.h"
-
-template <typename T> using stream = ihc::stream<T>;
-template <typename T> using stream_in = ihc::stream_in<T>;
-template <typename T> using stream_out = ihc::stream_out<T>;
-
-#endif
-
-// Include nnet::array - a custom array-like struct, mainly used with io_stream
-#include "nnet_utils/nnet_types.h"
-
-// hls-fpga-machine-learning insert numbers
-
-// hls-fpga-machine-learning insert layer-precision
-
-#define DIV_ROUNDUP(n, d) ((n + d - 1) / d)
-#define MIN(n, d) (n > d ? d : n)
-#define MAX(n, d) (n < d ? d : n)
-
-#endif
diff --git a/hls4ml/templates/quartus/firmware/myproject.cpp.bak b/hls4ml/templates/quartus/firmware/myproject.cpp.bak
deleted file mode 100644
index 3f5749d611..0000000000
--- a/hls4ml/templates/quartus/firmware/myproject.cpp.bak
+++ /dev/null
@@ -1,48 +0,0 @@
-#include "myproject.h"
-#include "parameters.h"
-
-// hls-fpga-machine-learning insert weights
-
-/*
- * Intel HLS requires that all 'stream' types are:
- * (1) Passed by reference to the top-level entity or
- * (2) Declared as global variables, outside of the main function
- * Therefore, layer inputs/outputs (connections between individual layers) are declared here
- */
-// hls-fpga-machine-learning insert inter-task streams
-
-#ifndef __INTELFPGA_COMPILER__
-/*
-* The top-level function used during GCC compilation / hls4ml.predict(...) goes here
-* An important distinction is made between io_stream and io_parallel:
-* (1) io_parallel:
-    - Top-level function takes a struct containing an array as function argument
-    - Returns a struct containing an array - the prediction
-  (2) io_stream:
-    - Top-level function is 'void' - no return value
-    - Instead, both the input and output are passed by reference
-    - This is due to the HLS streaming interfaces; a stream cannot be copied (implicitly deleted copy constructor)
-* This distinction is handled in quartus_writer.py
-*/
-// hls-fpga-machine-learning instantiate GCC top-level
-#else
-// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here
-// hls-fpga-machine-learning insert cpragmas
-
-/*
- * The top-level function used during HLS Synthesis goes here
- * In a similar manner to GCC, there is a distinction between io_stream & io_parallel
- */
-// hls-fpga-machine-learning instantiate HLS top-level
-#endif
-// If using io_parallel, the output needs to be initialised and returned at the end of this function
-// If using io_stream, no output is initialised, as it is passed by reference to the top-level function
-// hls-fpga-machine-learning initialize input/output
-
-// ****************************************
-// NETWORK INSTANTIATION
-// ****************************************
-
-// hls-fpga-machine-learning insert layers
-
-// hls-fpga-machine-learning return
diff --git a/hls4ml/templates/quartus/firmware/myproject.h.bak b/hls4ml/templates/quartus/firmware/myproject.h.bak
deleted file mode 100644
index afb7020671..0000000000
--- a/hls4ml/templates/quartus/firmware/myproject.h.bak
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef MYPROJECT_H_
-#define MYPROJECT_H_
-
-#ifndef __INTELFPGA_COMPILER__
-#include "ac_fixed.h"
-#include "ac_int.h"
-#define hls_register
-#else
-#include "HLS/ac_fixed.h"
-#include "HLS/ac_int.h"
-#include "HLS/hls.h"
-#endif
-
-// Streams are explicitly defined in defines.h, which is included via parameters.h
-// Defining them again in this file will cause compile-time errors
-#include "defines.h"
-
-// If using io_parallel, inputs and outputs need to be initialised before calling the top-level function
-// If using io_stream, no inputs/outputs are initialised, as they are passed by reference to the top-level function
-// hls-fpga-machine-learning insert inputs
-// hls-fpga-machine-learning insert outputs
-
-#ifndef __INTELFPGA_COMPILER__
-/*
-* The top-level function used during GCC compilation / hls4ml.predict(...) goes here
-* An important distinction is made between io_stream and io_parallel:
-* (1) io_parallel:
-    - Top-level function takes a struct containing an array as function argument
-    - Returns a struct containing an array - the prediction
-  (2) io_stream:
-    - Top-level function is 'void' - no return value
-    - Instead, both the input and output are passed by reference
-    - This is due to the HLS streaming interfaces; a stream cannot be copied (implicitly deleted copy constructor)
-* This distinction is handled in quartus_writer.py
-*/
-// hls-fpga-machine-learning instantiate GCC top-level
-#else
-// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here
-// hls-fpga-machine-learning insert cpragmas
-
-/*
- * The top-level function used during HLS Synthesis goes here
- * In a similar manner to GCC, there is a distinction between io_stream & io_parallel
- */
-// hls-fpga-machine-learning instantiate HLS top-level
-#endif
-
-#endif
diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h.bak
deleted file mode 100644
index f8c4ae7c64..0000000000
--- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h.bak
+++ /dev/null
@@ -1,104 +0,0 @@
-#ifndef NNET_BATCHNORM_H_
-#define NNET_BATCHNORM_H_
-
-#include "nnet_common.h"
-#include "nnet_helpers.h"
-#include "nnet_mult.h"
-
-namespace nnet {
-
-struct batchnorm_config {
-    // Internal data type definitions
-    typedef float bias_t;
-    typedef float scale_t;
-
-    // Layer Sizes
-    static const unsigned n_in = 10;
-    static const unsigned n_filt = -1;
-    static const unsigned n_scale_bias = 10;
-
-    // Resource reuse info
-    static const unsigned io_type = io_parallel;
-    static const unsigned reuse_factor = 1;
-    static const bool store_weights_in_bram = false;
-    static const unsigned n_zeros = 0;
-    // partitioning arrays cyclically to go with roll factors?
-
-    // Default multiplication
-    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
-};
-
-template <class data_T, class res_T, typename CONFIG_T>
-void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
-               const typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
-               const typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) {
-// Calculate result
-Result:
-    //#pragma unroll
-    for (int ires = 0; ires < CONFIG_T::n_in; ires++) {
-        if (CONFIG_T::n_filt == -1) {
-            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[ires]) +
-                        bias[ires];
-        } else {
-            int norm_index = ires % CONFIG_T::n_filt;
-            res[ires] =
-                CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[norm_index]) +
-                bias[norm_index];
-        }
-    }
-}
-
-// ****************************************************
-// Merged Batch Normalization and Quantized Tanh
-// ****************************************************
-struct batchnorm_quantized_tanh_config {
-    // Layer Sizes
-    static const unsigned n_in = 10;
-    static const unsigned n_filt = -1;
-    static const unsigned n_scale_bias = 10;
-
-    // Resource reuse info
-    static const unsigned io_type = io_parallel;
-    static const unsigned reuse_factor = 1;
-    static const unsigned n_zeros = 0;
-};
-
-template <class data_T, typename CONFIG_T>
-void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in],
-                           const data_T threshold[CONFIG_T::n_scale_bias]) {
-    //#pragma unroll
-    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
-        ac_int<1, false> cache;
-        data_T datareg = data[ii];
-        int norm_index = CONFIG_T::n_filt == -1 ?
ii : ii % CONFIG_T::n_filt; - if (datareg >= threshold[norm_index]) - cache = 1; - else - cache = 0; - - res[ii] = cache; - } -} - -template -void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ac_int<2, true> res[CONFIG_T::n_in], - const data_T threshold_hi[CONFIG_T::n_scale_bias], - const data_T threshold_lo[CONFIG_T::n_scale_bias]) { - //#pragma unroll - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - ac_int<2, true> cache; - data_T datareg = data[ii]; - int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; - if (datareg > threshold_hi[norm_index]) - cache = 1; - else if (datareg <= threshold_lo[norm_index]) - cache = -1; - else - cache = 0; - res[ii] = cache; - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h.bak deleted file mode 100644 index 6973e51a76..0000000000 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h.bak +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef NNET_COMMON_H_ -#define NNET_COMMON_H_ - -#ifndef __INTELFPGA_COMPILER__ -#include "ac_fixed.h" -#include "ac_int.h" -#include "math.h" -#else -#include "HLS/ac_fixed.h" -#include "HLS/ac_int.h" -#include "HLS/math.h" -#endif - -#include "nnet_helpers.h" - -typedef ac_fixed<16, 6> table_default_t; - -namespace nnet { - -// Common type definitions -enum io_type { io_parallel = 0, io_stream }; - -// Default data types (??) TODO: Deprecate -typedef ac_fixed<16, 4> weight_t_def; -typedef ac_fixed<16, 4> bias_t_def; -typedef ac_fixed<32, 10> accum_t_def; - -template void merge(data_T data1[NIN1], data_T data2[NIN2], data_T res[NIN1 + NIN2]) { - //#pragma unroll - for (int ii = 0; ii < NIN1; ii++) { - res[ii] = data1[ii]; - } - //#pragma unroll - for (int ii = 0; ii < NIN2; ii++) { - res[NIN1 + ii] = data2[ii]; - } -} - -/* --- - * Balanced tree reduce implementation. - * For use in scenarios where Quartus cannot expression balance - * Reduces an array of inputs to a single value using the template binary operator 'Op', - * for example summing all elements with Op_add, or finding the maximum with Op_max - * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section - * before applying and accumulate the result over the rolled dimension. - * --- */ -template T reduce(const T *x, Op op) { - static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; - static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; - if (N == 1) { - return x[0]; - } - if (N == 2) { - return op(x[0], x[1]); - } - return op(reduce(x, op), reduce(x + leftN, op)); -} - -template class Op_add { - public: - T operator()(T a, T b) { return a + b; } -}; - -template class Op_max { - public: - T operator()(T a, T b) { return a >= b ? 
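
[editor's note: the nnet::reduce comment above is terse, so here is a small Python model of the same split rule; this helper is illustrative only and not part of the patch.]

import math

def reduce_balanced(xs, op):
    # mirrors nnet::reduce: recurse on [0, left_n) and [left_n, n), where
    # left_n = 2**floor(log2(n - 1)) is the largest power of two strictly below n
    n = len(xs)
    if n == 1:
        return xs[0]
    if n == 2:
        return op(xs[0], xs[1])
    left_n = 2 ** int(math.floor(math.log2(n - 1)))
    return op(reduce_balanced(xs[:left_n], op), reduce_balanced(xs[left_n:], op))

# reduce_balanced(list(range(1, 6)), lambda a, b: a + b) == 15
# reduce_balanced([3, 1, 4, 1, 5], max) == 5
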
a : b; } -}; - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h.bak deleted file mode 100644 index 579606519f..0000000000 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h.bak +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef NNET_CONV1D_H_ -#define NNET_CONV1D_H_ - -#include "nnet_common.h" -#include "nnet_conv1d_resource.h" - -namespace nnet { - -struct conv1d_config { - // I/O sizes - static const unsigned in_width = 10; - static const unsigned out_width = 10; - - // Number of channels, filters - static const unsigned n_chan = 1; - static const unsigned n_filt = 1; - - // Original filter size - static const unsigned filt_width = 1; - static const unsigned kernel_size = filt_width; - - // Modified filter size (post-Wionograd transformation, if applied) - static const unsigned impl_filt_height = 1; - static const unsigned impl_filt_width = 1; - - // Padding, stride, dilation - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const unsigned stride_width = 1; - static const unsigned dilation = 1; - - // Run-time Configuration - static const unsigned n_zeros = 0; - static const unsigned reuse_factor = 1; - static const unsigned parallelisation_factor = 1; - - // TODO: BRAM Storage on Quartus - static const bool store_weights_in_bram = false; - - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; -}; - -template -void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - conv_1d_resource_cl(data, res, weights, biases); -} - -template -void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - pointwise_conv_1d_resource_cl(data, res, weights, biases); -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h.bak deleted file mode 100644 index 99187814ec..0000000000 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h.bak +++ /dev/null @@ -1,169 +0,0 @@ -#ifndef NNET_DENSE_LARGE_H_ -#define NNET_DENSE_LARGE_H_ - -#include "nnet_common.h" -#include "nnet_helpers.h" -#include "nnet_mult.h" - -namespace nnet { - -struct dense_config { - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; - - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_out = 10; - - static const unsigned reuse_factor = 1; - static const unsigned block_factor = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); - static const unsigned multiplier_limit = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor) - static const unsigned multiplier_factor = 1; // min n_in, rf - static const unsigned multiplier_scale = 1; // M_LIMIT/CONFIG_T::n_out; - static const unsigned reciprocal = 1; // 2^35 / 25 - static const unsigned rf_pad = 0; - static const unsigned bf_pad = 0; - // Resource reuse info - static const 
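
[editor's note: the block_factor / multiplier_limit comments in dense_config are easiest to read with concrete numbers; a quick Python check follows. The sizes are made up for illustration and do not come from the patch.]

def div_roundup(n, d):
    # same arithmetic as the DIV_ROUNDUP macro in defines.h
    return (n + d - 1) // d

# hypothetical layer: 16 inputs, 8 outputs, reuse factor 4
n_in, n_out, reuse_factor = 16, 8, 4
block_factor = div_roundup(n_in * n_out, reuse_factor)           # 32 products per reuse-loop iteration
multiplier_factor = min(n_in, reuse_factor)                      # 4  ("min n_in, rf")
multiplier_limit = div_roundup(n_in * n_out, multiplier_factor)  # 32 physical multipliers
multiplier_scale = multiplier_limit // n_out                     # 4 partial sums folded per output
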
unsigned io_type = io_parallel; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; - // partitioning arrays cyclically to go with roll factors? - - // Default multiplication - template using product = nnet::product::mult; -}; - -template -void dense_rf_gt(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && - "The current Reuse Factor is not allowed"); - assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); - ////#pragma ii CONFIG_T::reuse_factor - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; -Load: - //#pragma unroll - for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; - hls_register int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; - - //#pragma unroll - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - //#pragma unroll - for (int im = 0; im < CONFIG_T::block_factor; im++) { - uint32 w_index = ir + CONFIG_T::reuse_factor * im; - out_index[ir][im] = (w_index / CONFIG_T::multiplier_factor).to_int(); - d_index[ir][im] = w_index % CONFIG_T::n_in; - } - } -Product1: - //#pragma nofusion - //#pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; - Product2: - //#pragma unroll - for (int im = 0; im < CONFIG_T::block_factor; im++) { - uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; - if (w_index >= CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded) - continue; - int data_index = d_index[ir][im]; - // Modified this - tmp_acc[im] = - CONFIG_T::template product::product(data[data_index], weights[w_index]); - } - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; - ResetMult: - //#pragma unroll - for (int imult = 0; imult < CONFIG_T::multiplier_limit; imult++) { - mult[imult] = 0; - } - AccumLoop1: - //#pragma unroll - for (int im = 0; im < CONFIG_T::block_factor; im++) { - int o_index = out_index[ir][im]; - if (o_index >= CONFIG_T::n_out) - continue; // check out of bounds - mult[o_index] += tmp_acc[im]; - } - AccumLoop2: - //#pragma unroll - for (int im = 0; im < CONFIG_T::multiplier_limit; im++) { - acc[im] += mult[im]; - } - } -Store: - //#pragma unroll - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - res[ires] = cast(acc[ires]); // acc[jj]; - } -} -template -void dense_rf_lt(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && - "The current Reuse Factor is not allowed"); - assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN"); - - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; -InitAccum: - //#pragma unroll - for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } -ReuseLoop: - //#pragma nofusion 
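
[editor's note: the out_index / d_index precomputation in dense_rf_gt above flattens the weight matrix out-major; here is the same index math in Python, assuming RF > n_in so that multiplier_factor == n_in, as the function's assert requires. Sketch only, not part of the patch.]

def rf_gt_index_tables(n_in, n_out, reuse_factor):
    # weight w lives at w_index = ir + RF * im; it multiplies input
    # d_index = w_index % n_in into output out_index = w_index // n_in
    block_factor = (n_in * n_out + reuse_factor - 1) // reuse_factor
    out_index = [[(ir + reuse_factor * im) // n_in for im in range(block_factor)]
                 for ir in range(reuse_factor)]
    d_index = [[(ir + reuse_factor * im) % n_in for im in range(block_factor)]
               for ir in range(reuse_factor)]
    return out_index, d_index
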
- //#pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; - MultLoop: - //#pragma unroll - for (int im = 0, in_index = ir; im < CONFIG_T::block_factor; im++) { - uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; - if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in * CONFIG_T::n_out) - continue; - // Modified this - mult[im] = - CONFIG_T::template product::product(data[in_index], weights[w_index]); - in_index += CONFIG_T::reuse_factor; - if (in_index >= CONFIG_T::n_in) - in_index = ir; - } - AccumLoop: - //#pragma unroll - for (int im = 0, out_index = 0, acc_step = 0; im < CONFIG_T::block_factor; im++) { - acc[out_index] += mult[im]; - if (acc_step + 1 >= CONFIG_T::multiplier_scale) { - acc_step = 0; - out_index++; - } else { - acc_step++; - } - } - } -// Cast to "res_t" type -Result: - //#pragma unroll - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - res[ires] = cast(acc[ires]); - } -} -template -void dense_resource( - data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { - dense_rf_lt(data, res, weights, biases); - } else { - dense_rf_gt(data, res, weights, biases); - } -} -} // namespace nnet -#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h.bak deleted file mode 100644 index dcda87d316..0000000000 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h.bak +++ /dev/null @@ -1,80 +0,0 @@ -#ifndef NNET_COMPRESSED_LAYER_H_ -#define NNET_COMPRESSED_LAYER_H_ - -#include "nnet_common.h" -#include "nnet_dense.h" - -namespace nnet { - -template -void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - const typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - -InitAccum: - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_out; i++) { - acc[i] = (typename CONFIG_T::accum_t)(biases[i]); - } - - hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; - hls_register data_T inputs[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; - - //#pragma unroll - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - //#pragma unroll - for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { - uint32 w = ir + CONFIG_T::reuse_factor * im; - inputs[ir][im] = data[weights[w].row_index]; - out_index[ir][im] = weights[w].col_index; - } - } -ReuseLoop: - //#pragma nofusion - //#pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; - CompressedMultLoop: - //#pragma unroll - for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { - uint32 w = ir + CONFIG_T::reuse_factor * im; - // if (w >= CONFIG_T::reuse_factor*CONFIG_T::compressed_block_factor) continue; - typename CONFIG_T::accum_t prod = mult[im] = - CONFIG_T::template product::product(inputs[0][im], weights[w].weight); - //#pragma unroll - for (int is = 0; is < CONFIG_T::reuse_factor - 1; is++) { - inputs[is][im] = inputs[is + 1][im]; - } - } - hls_register 
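
[editor's note: functionally, dense_rf_lt and dense_rf_gt compute the same dense layer; a plain-Python reference they should both match, assuming the out-major weight flattening implied by the index math above. Illustrative only.]

def dense_reference(data, weights, biases, n_in, n_out):
    # weights is the flattened [n_out * n_in] array, out-major:
    # weights[o * n_in + i] multiplies data[i] into output o
    return [biases[o] + sum(weights[o * n_in + i] * data[i] for i in range(n_in))
            for o in range(n_out)]
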
typename CONFIG_T::accum_t tmp_acc[CONFIG_T::n_out]; - ResetMult: - //#pragma unroll - for (int tacc = 0; tacc < CONFIG_T::n_out; tacc++) { - tmp_acc[tacc] = 0; - } - AccumLoop1: - //#pragma unroll - for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { - int col = out_index[ir][im]; - tmp_acc[col] += mult[im]; - } - AccumLoop2: - //#pragma unroll - for (int im = 0; im < CONFIG_T::n_out; im++) { - acc[im] += tmp_acc[im]; - } - } - -// Cast to "res_t" type -ResultLoop: - //#pragma unroll - for (unsigned i = 0; i < CONFIG_T::n_out; i++) { - res[i] = cast(acc[i]); - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h.bak deleted file mode 100644 index 775303e267..0000000000 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h.bak +++ /dev/null @@ -1,140 +0,0 @@ -#ifndef NNET_HELPERS_H -#define NNET_HELPERS_H - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace nnet { - -template void convert_data(srcType *src, dstType *dst) { - for (size_t i = 0; i < SIZE; i++) { - dst[i] = dstType(src[i]); - } -} - -template void convert_data_back(srcType *src, dstType *dst) { - for (size_t i = 0; i < SIZE; i++) { - dst[i] = static_cast(src[i].to_double()); - } -} - -template void convert_data(srcType *src, stream_in &dst) { - for (size_t i = 0; i < SIZE / dstType::size; i++) { - dstType ctype; - for (size_t j = 0; j < dstType::size; j++) { - ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); - } - dst.write(ctype); - } -} - -template void convert_data_back(stream_out &src, dstType *dst) { - for (size_t i = 0; i < SIZE / srcType::size; i++) { - srcType ctype = src.read(); - for (size_t j = 0; j < srcType::size; j++) { - dst[i * srcType::size + j] = dstType(ctype[j].to_double()); - } - } -} - -extern bool trace_enabled; -extern std::map *trace_outputs; -extern size_t trace_type_size; - -constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } - -constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } - -constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } - -template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { - for (int i = 0; i < layer_size; i++) { - ptr[i] = static_cast(data[i].to_double()); - } -} - -template void save_output_array(stream &data, save_T *ptr, size_t layer_size) { - for (size_t i = 0; i < layer_size / data_T::size; i++) { - data_T ctype = data.read(); - for (size_t j = 0; j < data_T::size; j++) { - ptr[i * data_T::size + j] = static_cast(ctype[j].to_double()); - } - data.write(ctype); - } -} - -// We don't want to include save_T in this function because it will be inserted into myproject.cpp -// so a workaround with element size is used -template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { - if (!trace_enabled) - return; - - if (trace_outputs) { - if (trace_outputs->count(layer_name) > 0) { - if (trace_type_size == 4) { - save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); - } else if (trace_type_size == 8) { - save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); - } else { - std::cout << "Unknown trace type!" << std::endl; - } - } else { - std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
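
[editor's note: when trace_outputs is not set, nnet::save_layer_output appends one whitespace-separated line per invocation to ./tb_data/<layer_name>_output.log; a small Python helper to read those dumps back. The helper name is mine, not part of the patch.]

def read_layer_log(layer_name, tb_dir="./tb_data"):
    # one line per model invocation, whitespace-separated values,
    # exactly as written by nnet::save_layer_output
    with open(f"{tb_dir}/{layer_name}_output.log") as f:
        return [[float(v) for v in line.split()] for line in f]
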
<< std::endl; - } - } else { - std::ostringstream filename; - filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data - std::fstream out; - out.open(filename.str(), std::ios::app); - assert(out.is_open()); - for (int i = 0; i < layer_size; i++) { - out << data[i] << " "; // We don't care about precision in text files - } - out << std::endl; - out.close(); - } -} - -template void save_layer_output(stream &data, const char *layer_name, size_t layer_size) { - if (!trace_enabled) - return; - - if (trace_outputs) { - if (trace_outputs->count(layer_name) > 0) { - if (trace_type_size == 4) { - save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); - } else if (trace_type_size == 8) { - save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); - } else { - std::cout << "Unknown trace type!" << std::endl; - } - } else { - std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; - } - } else { - std::ostringstream filename; - filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data - std::fstream out; - out.open(filename.str(), std::ios::app); - assert(out.is_open()); - for (size_t i = 0; i < layer_size / data_T::size; i++) { - data_T ctype = data.read(); - for (size_t j = 0; j < data_T::size; j++) { - out << ctype[j] << " "; - } - data.write(ctype); - } - out << std::endl; - out.close(); - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h.bak deleted file mode 100644 index b24f56dc18..0000000000 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h.bak +++ /dev/null @@ -1,249 +0,0 @@ -#ifndef NNET_MERGE_H_ -#define NNET_MERGE_H_ - -#include "nnet_mult.h" - -namespace nnet { - -struct merge_config { - static const unsigned n_elem = 10; -}; - -struct dot_config { - static const unsigned n_in = 10; - static const unsigned n_out = 1; - - static const unsigned reuse_factor = 1; - - typedef float accum_t; - - template using product = nnet::product::mult; -}; - -struct concat_config { - static const unsigned n_elem1_0 = 10; - static const unsigned n_elem1_1 = 10; - static const unsigned n_elem1_2 = 10; - static const unsigned n_elem2_0 = 10; - static const unsigned n_elem2_1 = 10; - static const unsigned n_elem2_2 = 10; - - static const unsigned axis = -1; -}; - -template -void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = static_cast(data1[i] + data2[i]); - } -} - -template -void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = static_cast(data1[i] - data2[i]); - } -} - -template -void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = static_cast(data1[i] * data2[i]); - } -} - -template -void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = static_cast((data1[i] + data2[i]) / (res_T)2); - } -} - -template -void maximum(input1_T data1[CONFIG_T::n_elem], 
input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = (data1[i] > data2[i]) ? static_cast(data1[i]) : static_cast(data2[i]); - } -} - -template -void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = (data1[i] < data2[i]) ? static_cast(data1[i]) : static_cast(data2[i]); - } -} - -template -void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); - - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; -Product: - //#pragma unroll multiplier_limit - for (int i = 0; i < CONFIG_T::n_in; i++) { - mult[i] = CONFIG_T::template product::product(data1[i], data2[i]); - } - - hls_register typename CONFIG_T::accum_t acc = 0; -Accum: - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_in; i++) { - acc += mult[i]; - } - - res[0] = static_cast(acc); -} - -template -void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], - res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - res[i] = static_cast(data1[i]); - } - - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { - res[CONFIG_T::n_elem1_0 + i] = static_cast(data2[i]); - } -} - -template -void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; i++) { - res[i] = static_cast(data1[i]); - } - - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; i++) { - res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + i] = static_cast(data2[i]); - } -} - -template -void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - //#pragma unroll - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + j] = - static_cast(data1[i * CONFIG_T::n_elem1_1 + j]); - } - - //#pragma unroll - for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { - res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + j] = - static_cast(data2[i * CONFIG_T::n_elem2_1 + j]); - } - } -} - -template -void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { - concatenate2d_1(data1, data2, res); - } else { - concatenate2d_0(data1, data2, res); - } -} - -template -void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - 
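
[editor's note: concatenate2d_1 above interleaves rows of the two inputs; the flat index math is easier to see in Python. Reference model only, not from the patch.]

def concatenate2d_axis1(d1, d2, n0, n1a, n1b):
    # mirrors concatenate2d_1: output row i is row i of d1 followed by
    # row i of d2, with all arrays flattened row-major
    out = []
    for i in range(n0):
        out += d1[i * n1a:(i + 1) * n1a]
        out += d2[i * n1b:(i + 1) * n1b]
    return out

# concatenate2d_axis1([1, 2, 3, 4], [5, 6], n0=2, n1a=2, n1b=1) == [1, 2, 5, 3, 4, 6]
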
//#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; i++) { - res[i] = static_cast(data1[i]); - } - - //#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; i++) { - res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + i] = static_cast(data2[i]); - } -} - -template -void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - //#pragma unroll - for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { - int res_idx = - i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; - int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; - res[res_idx] = static_cast(data1[data_idx]); - } - } - - for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { - //#pragma unroll - for (int k = 0; k < CONFIG_T::n_elem2_2; k++) { - int res_idx = i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + - (j + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + k; - int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; - res[res_idx] = static_cast(data2[data_idx]); - } - } - } -} - -template -void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - - //#pragma unroll - for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { - int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + - j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k; - int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; - res[res_idx] = static_cast(data1[data_idx]); - } - - //#pragma unroll - for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { - int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + - j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k + CONFIG_T::n_elem1_2; - int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; - res[res_idx] = static_cast(data2[data_idx]); - } - } - } -} - -template -void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { - concatenate3d_2(data1, data2, res); - } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { - concatenate3d_1(data1, data2, res); - } else { - concatenate3d_0(data1, data2, res); - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h.bak deleted file mode 100644 
index 085fabf99f..0000000000 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h.bak +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef NNET_MULT_H_ -#define NNET_MULT_H_ - -#include "nnet_common.h" -#include "nnet_helpers.h" -#include - -namespace nnet { - -// Different methods to perform the product of input and weight, depending on their types. -namespace product { - -class Product { - public: - static void limit(unsigned multiplier_limit) {} -}; - -template class both_binary : public Product { - public: - inline static x_T product(x_T a, w_T w) { - // specialisation for 1-bit weights and incoming data - return a == w; - } -}; - -template class weight_binary : public Product { - public: - inline static auto product(x_T a, w_T w) -> decltype(-a) { - // Specialisation for 1-bit weights, arbitrary data - if (w == 0) - return -a; - else - return a; - } -}; - -template class data_binary : public Product { - public: - inline static auto product(x_T a, w_T w) -> decltype(-w) { - // Specialisation for 1-bit data, arbitrary weight - if (a == 0) - return -w; - else - return w; - } -}; - -template class weight_ternary : public Product { - public: - inline static auto product(x_T a, w_T w) -> decltype(-a) { - // Specialisation for 2-bit weights, arbitrary data - if (w == 0) - return 0; - else if (w == -1) - return -a; - else - return a; // if(w == 1) - } -}; - -template class mult : public Product { - public: - inline static auto product(x_T a, w_T w) -> decltype(a * w) { - // 'Normal' product - return a * w; - } - static void limit(unsigned multiplier_limit) { - // TODO: Implement for Quartus - // //#pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation > Vivado-only, replace with Intel HLS - // pragma - } -}; - -template class weight_exponential : public Product { - public: - using r_T = ac_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width), true>; - inline static r_T product(x_T a, w_T w) { - // Shift product for exponential weights - // Shift by the exponent. Negative weights shift right - r_T y = static_cast(a) << w.weight; - - // Negate or not depending on weight sign - return w.sign == 1 ? 
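
[editor's note: the product specializations below replace multiplies with cheaper operations for low-precision weights; Python equivalents of two of them, with the packed sign/exponent fields unpacked as plain ints for illustration.]

def product_weight_ternary(a, w):
    # weight_ternary: w in {-1, 0, +1} selects -a, 0 or +a -- no multiplier needed
    return 0 if w == 0 else (-a if w == -1 else a)

def product_weight_exponential(a, sign, exponent):
    # weight_exponential: a power-of-two weight becomes a shift
    # (negative exponent shifts right), then the sign is applied
    y = a * 2 ** exponent
    return y if sign == 1 else -y
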
y : static_cast(-y); - } -}; -} // namespace product - -// TO-DO: These may need extra variants if ac_int types are used in more places -template -inline typename std::enable_if>::value && - std::is_same>::value, - ac_int>::type -cast(typename CONFIG_T::accum_t x) { - return static_cast>(((x - CONFIG_T::n_in / 2) * 2).to_ac_int()); -} - -template -inline typename std::enable_if>::value && - !std::is_same>::value, - res_T>::type -cast(typename CONFIG_T::accum_t x) { - return static_cast(x); -} - -template -inline typename std::enable_if<(!std::is_same>::value), res_T>::type -cast(typename CONFIG_T::accum_t x) { - return static_cast(x); -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h.bak b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h.bak deleted file mode 100644 index 7e3fa9e55a..0000000000 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h.bak +++ /dev/null @@ -1,99 +0,0 @@ -#ifndef NNET_PADDING_H_ -#define NNET_PADDING_H_ - -namespace nnet { - -struct padding1d_config { - static const unsigned in_width = 10; - static const unsigned out_width = 10; - static const unsigned n_chan = 10; - - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; -}; - -template -void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { - for (int i = 0; i < CONFIG_T::pad_left; i++) { - //#pragma unroll - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = 0; - } - } - - for (int i = 0; i < CONFIG_T::in_width; i++) { - //#pragma unroll - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = (res_T) * (data++); - } - } - - for (int i = 0; i < CONFIG_T::pad_right; i++) { - //#pragma unroll - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = 0; - } - } -} - -struct padding2d_config { - static const unsigned in_height = 10; - static const unsigned in_width = 10; - - static const unsigned out_height = 10; - static const unsigned out_width = 10; - - static const unsigned n_chan = 10; - - static const unsigned pad_top = 0; - static const unsigned pad_bottom = 0; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; -}; - -template -void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], - res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { - for (int i = 0; i < CONFIG_T::pad_top; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - //#pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } - - for (int i = 0; i < CONFIG_T::in_height; i++) { - for (int j = 0; j < CONFIG_T::pad_left; j++) { - //#pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - for (int j = 0; j < CONFIG_T::in_width; j++) { - //#pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = (res_T) * (data++); - } - } - for (int j = 0; j < CONFIG_T::pad_right; j++) { - //#pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } - - for (int i = 0; i < CONFIG_T::pad_bottom; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - //#pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/quartus/myproject_test_parallel.cpp.bak b/hls4ml/templates/quartus/myproject_test_parallel.cpp.bak deleted file mode 100644 index 4de819eb49..0000000000 --- 
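
[editor's note: zeropad1d_cl above walks the flattened channels-last buffer with pointer increments; the same layout spelled out in Python, as a reference only.]

def zeropad1d(data, in_width, n_chan, pad_left, pad_right):
    # channels-last, flattened: element (w, c) lives at data[w * n_chan + c]
    zeros = [0.0] * n_chan
    out = zeros * pad_left
    for w in range(in_width):
        out += data[w * n_chan:(w + 1) * n_chan]
    return out + zeros * pad_right
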
a/hls4ml/templates/quartus/myproject_test_parallel.cpp.bak +++ /dev/null @@ -1,112 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "firmware/myproject.h" -#include "firmware/parameters.h" - -// hls-fpga-machine-learning insert bram - -#define CHECKPOINT 5000 - -// This function is written to avoid stringstream, which is -// not supported in cosim 20.1, and because strtok -// requires a const_cast or allocation to use with std::strings. -// This function returns the next float (by argument) at position pos, -// updating pos. True is returned if conversion done, false if the string -// has ended, and std::invalid_argument exception if the sting was bad. -bool nextToken(const std::string &str, std::size_t &pos, float &val) { - while (pos < str.size() && std::isspace(static_cast(str[pos]))) { - pos++; - } - if (pos >= str.size()) { - return false; - } - std::size_t offset = 0; - val = std::stof(str.substr(pos), &offset); - pos += offset; - return true; -} - -int main(int argc, char **argv) { - // load input data from text file - std::ifstream fin("tb_data/tb_input_features.dat"); - // load predictions from text file - std::ifstream fpr("tb_data/tb_output_predictions.dat"); - - std::string RESULTS_LOG = "tb_data/results.log"; - std::ofstream fout(RESULTS_LOG); - - std::string iline; - std::string pline; - - std::vector inputs; - std::vector outputs; - - if (fin.is_open() && fpr.is_open()) { - std::vector> predictions; - unsigned int num_iterations = 0; - for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) { - if (num_iterations % CHECKPOINT == 0) { - std::cout << "Processing input " << num_iterations << std::endl; - } - - std::vector in; - std::vector pr; - float current; - - std::size_t pos = 0; - while (nextToken(iline, pos, current)) { - in.push_back(current); - } - - pos = 0; - while (nextToken(pline, pos, current)) { - pr.push_back(current); - } - - // hls-fpga-machine-learning insert data - predictions.push_back(std::move(pr)); - } - - // Do this separately to avoid vector reallocation - // hls-fpga-machine-learning insert top-level-function - - // hls-fpga-machine-learning insert run - - for (int j = 0; j < num_iterations; j++) { - // hls-fpga-machine-learning insert tb-output - if (j % CHECKPOINT == 0) { - std::cout << "Predictions" << std::endl; - // hls-fpga-machine-learning insert predictions - std::cout << "Quantized predictions" << std::endl; - // hls-fpga-machine-learning insert quantized - } - } - fin.close(); - fpr.close(); - } else { - const unsigned int num_iterations = 10; - std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations - << " invocations." 
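
[editor's note: nextToken exists because cosim 20.1 lacks stringstream; a simplified Python mirror of its contract, returning None at end of string and raising ValueError on a bad token where the C++ throws std::invalid_argument. It tokenizes on whitespace rather than using stof's prefix parsing.]

def next_token(s, pos):
    # skip whitespace, parse one float, return (value, new_pos); None when exhausted
    while pos < len(s) and s[pos].isspace():
        pos += 1
    if pos >= len(s):
        return None
    end = pos
    while end < len(s) and not s[end].isspace():
        end += 1
    return float(s[pos:end]), end

# pos, vals = 0, []
# while (tok := next_token("1.0 2.5 3", pos)) is not None:
#     val, pos = tok
#     vals.append(val)   # -> [1.0, 2.5, 3.0]
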
<< std::endl; - // hls-fpga-machine-learning insert zero - - // hls-fpga-machine-learning insert top-level-function - - // hls-fpga-machine-learning insert run - - for (int j = 0; j < num_iterations; j++) { - // hls-fpga-machine-learning insert output - - // hls-fpga-machine-learning insert tb-output - } - } - - fout.close(); - std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; - - return 0; -} diff --git a/hls4ml/templates/vivado/firmware/myproject.cpp.bak b/hls4ml/templates/vivado/firmware/myproject.cpp.bak deleted file mode 100644 index 74b58c5cb1..0000000000 --- a/hls4ml/templates/vivado/firmware/myproject.cpp.bak +++ /dev/null @@ -1,23 +0,0 @@ -#include - -#include "myproject.h" -#include "parameters.h" - -// hls-fpga-machine-learning insert namespace-start - -void myproject( - // hls-fpga-machine-learning insert header -) { - - // hls-fpga-machine-learning insert IO - - // hls-fpga-machine-learning insert load weights - - // **************************************** - // NETWORK INSTANTIATION - // **************************************** - - // hls-fpga-machine-learning insert layers -} - -// hls-fpga-machine-learning insert namespace-end diff --git a/hls4ml/templates/vivado/firmware/myproject.h.bak b/hls4ml/templates/vivado/firmware/myproject.h.bak deleted file mode 100644 index a56778976b..0000000000 --- a/hls4ml/templates/vivado/firmware/myproject.h.bak +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef MYPROJECT_H_ -#define MYPROJECT_H_ - -#include "ap_fixed.h" -#include "ap_int.h" -#include "hls_stream.h" - -#include "defines.h" - -// hls-fpga-machine-learning insert namespace-start - -// Prototype of top level function for C-synthesis -void myproject( - // hls-fpga-machine-learning insert header -); - -// hls-fpga-machine-learning insert namespace-end - -#endif diff --git a/hls4ml/templates/vivado/myproject_test.cpp.bak b/hls4ml/templates/vivado/myproject_test.cpp.bak deleted file mode 100644 index 29a4c816e5..0000000000 --- a/hls4ml/templates/vivado/myproject_test.cpp.bak +++ /dev/null @@ -1,94 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "firmware/myproject.h" -#include "firmware/nnet_utils/nnet_helpers.h" - -// hls-fpga-machine-learning insert bram - -#define CHECKPOINT 5000 - -namespace nnet { -bool trace_enabled = true; -std::map *trace_outputs = NULL; -size_t trace_type_size = sizeof(double); -} // namespace nnet - -int main(int argc, char **argv) { - // hls-fpga-machine-learning insert namespace - - // load input data from text file - std::ifstream fin("tb_data/tb_input_features.dat"); - // load predictions from text file - std::ifstream fpr("tb_data/tb_output_predictions.dat"); - -#ifdef RTL_SIM - std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; -#else - std::string RESULTS_LOG = "tb_data/csim_results.log"; -#endif - std::ofstream fout(RESULTS_LOG); - - std::string iline; - std::string pline; - int e = 0; - - if (fin.is_open() && fpr.is_open()) { - while (std::getline(fin, iline) && std::getline(fpr, pline)) { - if (e % CHECKPOINT == 0) - std::cout << "Processing input " << e << std::endl; - char *cstr = const_cast(iline.c_str()); - char *current; - std::vector in; - current = strtok(cstr, " "); - while (current != NULL) { - in.push_back(atof(current)); - current = strtok(NULL, " "); - } - cstr = const_cast(pline.c_str()); - std::vector pr; - current = strtok(cstr, " "); - while (current != NULL) { - pr.push_back(atof(current)); - current = strtok(NULL, " "); - } - - // 
hls-fpga-machine-learning insert data - - // hls-fpga-machine-learning insert top-level-function - - if (e % CHECKPOINT == 0) { - std::cout << "Predictions" << std::endl; - // hls-fpga-machine-learning insert predictions - std::cout << "Quantized predictions" << std::endl; - // hls-fpga-machine-learning insert quantized - } - e++; - - // hls-fpga-machine-learning insert tb-output - } - fin.close(); - fpr.close(); - } else { - std::cout << "INFO: Unable to open input/predictions file, using default input." << std::endl; - - // hls-fpga-machine-learning insert zero - - // hls-fpga-machine-learning insert top-level-function - - // hls-fpga-machine-learning insert output - - // hls-fpga-machine-learning insert tb-output - } - - fout.close(); - std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; - - return 0; -} diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h.bak deleted file mode 100644 index 1dc96e50a0..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h.bak +++ /dev/null @@ -1,795 +0,0 @@ -#ifndef NNET_ACTIVATION_H_ -#define NNET_ACTIVATION_H_ - -#include "ap_fixed.h" -#include "nnet_common.h" -#include - -namespace nnet { - -struct activ_config { - // IO size - static const unsigned n_in = 10; - - // Internal info - static const unsigned table_size = 1024; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - - // Internal data type definitions - typedef ap_fixed<18, 8> table_t; -}; - -// ************************************************* -// LINEAR Activation -- See Issue 53 -// ************************************************* -template void linear(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - res[ii] = data[ii]; - } -} - -// ************************************************* -// RELU Activation -// ************************************************* -template void relu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - //#pragma HLS PIPELINE - - data_T datareg; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - if (datareg > 0) - res[ii] = datareg; - else - res[ii] = 0; - } -} - -template -void relu_max(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - //#pragma HLS PIPELINE - - data_T datareg; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - if (datareg < 0) - res[ii] = 0; - else if (datareg > MAX_INT) - res[ii] = MAX_INT; - else - res[ii] = datareg; - } -} - -template void relu6(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - relu_max(data, res); -} - -template void relu1(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - relu_max(data, res); -} - -// ************************************************* -// Sigmoid Activation -// ************************************************* -inline float sigmoid_fcn_float(float input) { return 1.0 / (1 + std::exp(-input)); } - -template void init_sigmoid_table(typename CONFIG_T::table_t table_out[N_TABLE]) { - // Default logistic sigmoid function: - // result = 1/(1+e^(-x)) - for (int ii = 0; ii < N_TABLE; ii++) { - // First, convert from table index to X-value (signed 8-bit, range -8 to +8) - float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); - // Next, compute lookup table function - typename CONFIG_T::table_t real_val = sigmoid_fcn_float(in_val); - // std::cout << "Lookup 
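
[editor's note: init_sigmoid_table below maps table index ii to an input in [-8, 8) before evaluating the sigmoid; the same mapping in Python, with the 1024-entry table size from activ_config. Illustrative sketch only.]

import math

def sigmoid_table(n_table=1024):
    # index ii -> x = 2 * 8.0 * (ii - N/2) / N, i.e. N equal steps across [-8, 8)
    return [1.0 / (1.0 + math.exp(-(2 * 8.0 * (ii - n_table / 2.0) / n_table)))
            for ii in range(n_table)]
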
table In Value: " << in_val << " Result: " << real_val << std::endl; - table_out[ii] = real_val; - } -} - -template -void sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - // Initialize the lookup table -#ifdef __HLS_SYN__ - bool initialized = false; - typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; -#else - static bool initialized = false; - static typename CONFIG_T::table_t sigmoid_table[CONFIG_T::table_size]; -#endif - if (!initialized) { - init_sigmoid_table(sigmoid_table); - initialized = true; - } - - //#pragma HLS PIPELINE - - // Index into the lookup table based on data - int data_round; - int index; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_round = data[ii] * CONFIG_T::table_size / 16; - index = data_round + 8 * CONFIG_T::table_size / 16; - if (index < 0) - index = 0; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - res[ii] = (res_T)sigmoid_table[index]; - } -} - -// ************************************************* -// Softmax Activation -// ************************************************* - -enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 }; - -inline float exp_fcn_float(float input) { return std::exp(input); } - -template inline float softmax_real_val_from_idx(unsigned i) { - // Treat the index as the top N bits - static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table - data_T x(0); - x(x.width - 1, x.width - N) = i; - return (float)x; -} - -template inline unsigned softmax_idx_from_real_val(data_T x) { - // Slice the top N bits to get an index into the table - static constexpr int N = ceillog2(CONFIG_T::table_size); // number of address bits for table - ap_uint y = x(x.width - 1, x.width - N); // slice the top N bits of input - return (unsigned)y(N - 1, 0); -} - -template -void init_exp_table(typename CONFIG_T::exp_table_t table_out[CONFIG_T::table_size]) { - // The template data_T is the data type used to address the table - for (unsigned i = 0; i < CONFIG_T::table_size; i++) { - // Slicing bits for address is going to round towards 0, so take the central value - float x = softmax_real_val_from_idx(i); - typename CONFIG_T::exp_table_t exp_x = exp_fcn_float(x); - table_out[i] = exp_x; - } -} - -template -void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size]) { - // The template data_T is the data type used to address the table - for (unsigned i = 0; i < CONFIG_T::table_size; i++) { - float x = softmax_real_val_from_idx(i); - typename CONFIG_T::inv_table_t inv_x = 1.0 / x; - table_out[i] = inv_x; - } -} - -template -void softmax_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - //#pragma HLS pipeline - // Initialize the lookup tables -#ifdef __HLS_SYN__ - bool initialized = false; - typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; - typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; -#else - static bool initialized = false; - static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; - static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; - -#endif - if (!initialized) { - // Note we are exponentiating the inputs, which have type data_T - init_exp_table(exp_table); - // Note we are inverting the exponentials, which have type exp_table_t - init_invert_table(invert_table); - initialized = true; - } - - // Calculate all the e^x's - typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; - //#pragma HLS array_partition 
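
[editor's note: softmax_idx_from_real_val above slices the top N address bits out of the fixed-point word; with the raw bit pattern as a plain integer, the Python version is one shift. Assumes a power-of-two table_size, as the bit-slicing itself does.]

def softmax_table_index(raw_bits, word_width, table_size):
    # keep the top N bits of the fixed-point bit pattern as the table address
    n = (table_size - 1).bit_length()   # ceil(log2(table_size)) for powers of two
    return (raw_bits >> (word_width - n)) & ((1 << n) - 1)

# a 16-bit word and a 1024-entry table use the top 10 bits:
# softmax_table_index(0b1011001110001111, 16, 1024) == 0b1011001110
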
variable=exp_res complete
-    typename CONFIG_T::exp_table_t exp_sum(0);
-    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        //#pragma HLS unroll
-        unsigned x = softmax_idx_from_real_val<data_T, CONFIG_T>(data[i]);
-        exp_res[i] = exp_table[x];
-    }
-
-    // Explicitly sum the results with an adder tree.
-    // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing
-    Op_add<typename CONFIG_T::exp_table_t> op_add;
-    exp_sum =
-        reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
-
-    typename CONFIG_T::inv_table_t inv_exp_sum =
-        invert_table[softmax_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
-    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        //#pragma HLS unroll
-        res[i] = exp_res[i] * inv_exp_sum;
-    }
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void softmax_stable(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
-    //#pragma HLS pipeline
-    // Initialize the lookup tables
-#ifdef __HLS_SYN__
-    bool initialized = false;
-    typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size];
-    typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size];
-#else
-    static bool initialized = false;
-    static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size];
-    static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size];
-
-#endif
-    if (!initialized) {
-        // Note we are exponentiating the inputs, which have type data_T
-        init_exp_table<data_T, CONFIG_T>(exp_table);
-        // Note we are inverting the exponentials, which have type exp_table_t
-        init_invert_table<typename CONFIG_T::exp_table_t, CONFIG_T>(invert_table);
-        initialized = true;
-    }
-
-    // Find the max and compute all delta(x_i, x_max)
-    Op_max<data_T> op_max;
-    data_T x_max = reduce<data_T, CONFIG_T::n_in, Op_max<data_T>>(data, op_max);
-
-    // For the diffs, use the same type as the input but force rounding and saturation
-    ap_fixed<data_T::width, data_T::iwidth, AP_RND, AP_SAT> d_xi_xmax[CONFIG_T::n_in];
-    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        //#pragma HLS unroll
-        d_xi_xmax[i] = data[i] - x_max;
-    }
-
-    // Calculate all the e^x's
-    typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
-    //#pragma HLS array_partition variable=exp_res complete
-    typename CONFIG_T::exp_table_t exp_sum(0);
-    for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        //#pragma HLS unroll
-        unsigned x = softmax_idx_from_real_val<data_T, CONFIG_T>(d_xi_xmax[i]);
-        exp_res[i] = exp_table[x];
-    }
-
-    // Explicitly sum the results with an adder tree.
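(A note on the summation pattern used here: `reduce<T, N, Op>` from nnet_common.h builds a
balanced binary tree of `Op` applications at compile time, so summing N partial
exponentials costs about log2(N) adder levels instead of a serial chain of N-1 adds.
A minimal sketch of the same call, with the plain float type and the array size of 8
chosen purely for illustration:

    #include "nnet_common.h"

    float partials[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    nnet::Op_add<float> op_add;
    // Splits the 8 elements into two halves of 4 and recurses until only
    // pairs remain; returns 36 for this input.
    float total = nnet::reduce<float, 8, nnet::Op_add<float>>(partials, op_add);

The rounded, saturating exp_table_t accumulation is what stops Vivado from re-balancing
the expression on its own, which is why the tree is built explicitly.)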
- // Rounding & Saturation mode, which improve accuracy, prevent Vivado from expression balancing - Op_add op_add; - exp_sum = - reduce>(exp_res, op_add); - - typename CONFIG_T::inv_table_t inv_exp_sum = - invert_table[softmax_idx_from_real_val(exp_sum)]; - for (unsigned i = 0; i < CONFIG_T::n_in; i++) { - //#pragma HLS unroll - res[i] = exp_res[i] * inv_exp_sum; - } -} - -template void init_exp_table_legacy(typename CONFIG_T::exp_table_t table_out[N_TABLE]) { - float exp_range = (float)CONFIG_T::exp_range; - for (int ii = 0; ii < N_TABLE; ii++) { - // First, convert from table index to X-value (signed 8-bit, range -8 to +8) - float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); - // Next, compute lookup table function - typename CONFIG_T::exp_table_t real_val = exp_fcn_float(in_val); - // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; - table_out[ii] = real_val; - } -} - -template void init_invert_table_legacy(typename CONFIG_T::inv_table_t table_out[N_TABLE]) { - float inv_range = (float)CONFIG_T::inv_range; - // Inversion function: - // result = 1/x - for (int ii = 0; ii < N_TABLE; ii++) { - float in_val = inv_range * ii / float(N_TABLE); - if (in_val > 0.0) - table_out[ii] = 1.0 / in_val; - else - table_out[ii] = 0.0; - } -} - -template -void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - //#pragma HLS pipeline - int exp_range = CONFIG_T::exp_range; - int inv_range = CONFIG_T::inv_range; - // Initialize the lookup table -#ifdef __HLS_SYN__ - bool initialized = false; - typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; - typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; -#else - static bool initialized = false; - static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size]; - static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size]; -#endif - if (!initialized) { - init_exp_table_legacy(exp_table); - init_invert_table_legacy(invert_table); - initialized = true; - } - - // Index into the lookup table based on data for exponentials - typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision - typename CONFIG_T::exp_table_t exp_diff_res; // different, independent, fixed point precision - typename CONFIG_T::exp_table_t data_cache[CONFIG_T::n_in]; - int data_round; - int index; - - // std::cout << "input to SM: " << std::endl; ///// - // nnet::print_result(data, std::cout); ///// - // std::cout << " " << std::endl; ///// - - //#pragma HLS array_partition variable=data_cache complete - - typename CONFIG_T::accum_t denominator; - typename CONFIG_T::inv_table_t deno_inver; - - denominator = 0; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_round = data[ii] * (CONFIG_T::table_size / (exp_range * 2)); - // std::cout << " data, round: " << data[ii] << " " << data_round << std::endl; ///// - index = data_round + exp_range * (CONFIG_T::table_size / (exp_range * 2)); - // std::cout << " index: " << index; ///// - if (index < 0) - index = 0; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - denominator += exp_table[index]; - // std::cout << " denominator " << index << std::endl; ///// - // std::cout << " denominator " << denominator << std::endl; ///// - data_cache[ii] = exp_table[index]; - } - // std::cout << "end " << std::endl; ///// - - // using lookup table for inverse - int exp_res_index = denominator * (CONFIG_T::table_size / inv_range); - - // std::cout << " denominator: 
" << denominator << std::endl; ///// - // std::cout << " table_size: " << CONFIG_T::table_size << std::endl; ///// - // std::cout << " inv_range: " << inv_range << std::endl; ///// - // std::cout << " exp_res_index: " << exp_res_index << std::endl; ///// - if (exp_res_index < 0) - exp_res_index = 0; - if (exp_res_index > CONFIG_T::table_size - 1) - exp_res_index = CONFIG_T::table_size - 1; - deno_inver = invert_table[exp_res_index]; - // std::cout << " deno_inver: " << deno_inver << std::endl; ///// - - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - res[ii] = (res_T)(data_cache[ii] * deno_inver); - } - - // std::cout << "out SM: " << std::endl; - // nnet::print_result(res, std::cout); - // std::cout << " " << std::endl; -} - -template -void softmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - //#pragma HLS inline - switch (CONFIG_T::implementation) { - case softmax_implementation::latency: - softmax_latency(data, res); - break; - case softmax_implementation::stable: - softmax_stable(data, res); - break; - case softmax_implementation::legacy: - softmax_legacy(data, res); - break; - case softmax_implementation::argmax: - softmax_argmax(data, res); - break; - } -} - -// ************************************************* -// TanH Activation -// ************************************************* -template void init_tanh_table(typename CONFIG_T::table_t table_out[N_TABLE]) { - // Implement tanh lookup - for (int ii = 0; ii < N_TABLE; ii++) { - // First, convert from table index to X-value (signed 8-bit, range -4 to +4) - float in_val = 2 * 4.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); - // Next, compute lookup table function - typename CONFIG_T::table_t real_val = tanh(in_val); - // std::cout << "Tanh: Lookup table Index: " << ii<< " In Value: " << in_val << " Result: " << real_val << - // std::endl; - table_out[ii] = real_val; - } -} - -template void tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - // Initialize the lookup table -#ifdef __HLS_SYN__ - bool initialized = false; - typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; -#else - static bool initialized = false; - static typename CONFIG_T::table_t tanh_table[CONFIG_T::table_size]; -#endif - if (!initialized) { - init_tanh_table(tanh_table); - initialized = true; - } - - //#pragma HLS PIPELINE - - // Index into the lookup table based on data - int data_round; - int index; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_round = data[ii] * CONFIG_T::table_size / 8; - index = data_round + 4 * CONFIG_T::table_size / 8; - // std::cout << "Input: " << data[ii] << " Round: " << data_round << " Index: " << index << std::endl; - if (index < 0) - index = 0; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - res[ii] = (res_T)tanh_table[index]; - } -} - -// ************************************************* -// UnaryLUT Activation -// ************************************************* -template inline unsigned get_index_unary_lut(data_T x) { - // Slice the top N bits to get an index into the table - static constexpr int N = ceillog2(table_size); - return (unsigned)(x(x.width - 1, 0)); -} - -template -void unary_lut(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], - typename CONFIG_T::table_t table[CONFIG_T::table_size]) { - //#pragma HLS function_instantiate variable=table - //#pragma HLS ARRAY_PARTITION variable=table - - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - //#pragma HLS UNROLL - unsigned index = get_index_unary_lut(data[ii]); - res[ii] = 
(res_T)table[index]; - } -} - -// ************************************************* -// Hard sigmoid Activation -// ************************************************* -template -void hard_sigmoid(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - auto datareg = CONFIG_T::slope * data[ii] + CONFIG_T::shift; - if (datareg > 1) - datareg = 1; - else if (datareg < 0) - datareg = 0; - res[ii] = datareg; - } -} - -template -void hard_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - if (CONFIG_T::io_type == io_parallel) { - //#pragma HLS PIPELINE - } - - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - auto sigmoid = CONFIG_T::slope * data[ii] + CONFIG_T::shift; - if (sigmoid > 1) - sigmoid = 1; - else if (sigmoid < 0) - sigmoid = 0; - res[ii] = 2 * sigmoid - 1; - } -} - -// ************************************************* -// Leaky RELU Activation -// ************************************************* -template -void leaky_relu(data_T data[CONFIG_T::n_in], data_T alpha, res_T res[CONFIG_T::n_in]) { - //#pragma HLS PIPELINE - - data_T datareg; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - if (datareg > 0) - res[ii] = datareg; - else - res[ii] = alpha * datareg; - } -} - -// ************************************************* -// Thresholded RELU Activation -// ************************************************* -template -void thresholded_relu(data_T data[CONFIG_T::n_in], data_T theta, res_T res[CONFIG_T::n_in]) { - //#pragma HLS PIPELINE - - data_T datareg; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - if (datareg > theta) - res[ii] = datareg; - else - res[ii] = 0; - } -} - -// ************************************************* -// Softplus Activation -// ************************************************* -inline float softplus_fcn_float(float input) { return std::log(std::exp(input) + 1.); } - -template void init_softplus_table(typename CONFIG_T::table_t table_out[N_TABLE]) { - // Default softplus function: - // result = log(exp(x) + 1) - for (int ii = 0; ii < N_TABLE; ii++) { - // First, convert from table index to X-value (signed 8-bit, range -8 to +8) - float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); - // Next, compute lookup table function - typename CONFIG_T::table_t real_val = softplus_fcn_float(in_val); - // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; - table_out[ii] = real_val; - } -} - -template -void softplus(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - // Initialize the lookup table -#ifdef __HLS_SYN__ - bool initialized = false; - typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; -#else - static bool initialized = false; - static typename CONFIG_T::table_t softplus_table[CONFIG_T::table_size]; -#endif - if (!initialized) { - init_softplus_table(softplus_table); - initialized = true; - } - - //#pragma HLS PIPELINE - - // Index into the lookup table based on data - int data_round; - int index; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_round = data[ii] * CONFIG_T::table_size / 16; - index = data_round + 8 * CONFIG_T::table_size / 16; - if (index < 0) - index = 0; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - res[ii] = (res_T)softplus_table[index]; - } -} - -// ************************************************* -// Softsign Activation -// ************************************************* -inline float 
softsign_fcn_float(float input) { return input / (std::abs(input) + 1.); } - -template void init_softsign_table(typename CONFIG_T::table_t table_out[N_TABLE]) { - // Default softsign function: - // result = x / (abs(x) + 1) - for (int ii = 0; ii < N_TABLE; ii++) { - // First, convert from table index to X-value (signed 8-bit, range -8 to +8) - float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE); - // Next, compute lookup table function - typename CONFIG_T::table_t real_val = softsign_fcn_float(in_val); - // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; - table_out[ii] = real_val; - } -} - -template -void softsign(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - // Initialize the lookup table -#ifdef __HLS_SYN__ - bool initialized = false; - typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; -#else - static bool initialized = false; - static typename CONFIG_T::table_t softsign_table[CONFIG_T::table_size]; -#endif - if (!initialized) { - init_softsign_table(softsign_table); - initialized = true; - } - - //#pragma HLS PIPELINE - - // Index into the lookup table based on data - int data_round; - int index; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - data_round = data[ii] * CONFIG_T::table_size / 16; - index = data_round + 8 * CONFIG_T::table_size / 16; - if (index < 0) - index = 0; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - res[ii] = (res_T)softsign_table[index]; - } -} - -// ************************************************* -// ELU Activation -// ************************************************* -inline float elu_fcn_float(float input) { return std::exp(input) - 1.; } - -template void init_elu_table(typename CONFIG_T::table_t table_out[N_TABLE]) { - // Default ELU function: - // result = alpha * (e^(x) - 1) - for (int ii = 0; ii < N_TABLE; ii++) { - // First, convert from table index to X-value (signed 8-bit, range -8 to 0) - float in_val = -8.0 * ii / float(N_TABLE); - // Next, compute lookup table function - typename CONFIG_T::table_t real_val = elu_fcn_float(in_val); - // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; - table_out[ii] = real_val; - } -} - -template -void elu(data_T data[CONFIG_T::n_in], const res_T alpha, res_T res[CONFIG_T::n_in]) { - // Initialize the lookup table -#ifdef __HLS_SYN__ - bool initialized = false; - typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; -#else - static bool initialized = false; - static typename CONFIG_T::table_t elu_table[CONFIG_T::table_size]; -#endif - if (!initialized) { - init_elu_table(elu_table); - initialized = true; - } - - //#pragma HLS PIPELINE - - data_T datareg; - // Index into the lookup table based on data - int index; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - if (datareg >= 0) { - res[ii] = datareg; - } else { - index = datareg * CONFIG_T::table_size / -8; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - res[ii] = alpha * elu_table[index]; - } - } -} - -template void elu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - elu(data, 1.0, res); -} - -// ************************************************* -// SELU Activation -// ************************************************* -inline float selu_fcn_float(float input) { - return 1.0507009873554804934193349852946 * (1.6732632423543772848170429916717 * (std::exp(input) - 1.)); -} - -template void init_selu_table(typename CONFIG_T::table_t 
table_out[N_TABLE]) { - // Default SELU function: - // result = 1.05 * (1.673 * (e^(x) - 1)) - for (int ii = 0; ii < N_TABLE; ii++) { - // First, convert from table index to X-value (signed 8-bit, range -8 to 0) - float in_val = -8.0 * ii / float(N_TABLE); - // Next, compute lookup table function - typename CONFIG_T::table_t real_val = selu_fcn_float(in_val); - // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl; - table_out[ii] = real_val; - } -} - -template void selu(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - // Initialize the lookup table -#ifdef __HLS_SYN__ - bool initialized = false; - typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; -#else - static bool initialized = false; - static typename CONFIG_T::table_t selu_table[CONFIG_T::table_size]; -#endif - if (!initialized) { - init_selu_table(selu_table); - initialized = true; - } - - //#pragma HLS PIPELINE - - data_T datareg; - // Index into the lookup table based on data - int index; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - if (datareg >= 0) { - res[ii] = res_T(1.0507009873554804934193349852946) * datareg; - } else { - index = datareg * CONFIG_T::table_size / -8; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - res[ii] = selu_table[index]; - } - } -} - -// ************************************************* -// PReLU Activation -// ************************************************* -template -void prelu(data_T data[CONFIG_T::n_in], data_T alpha[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - //#pragma HLS PIPELINE - - data_T datareg; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - if (datareg > 0) - res[ii] = datareg; - else - res[ii] = alpha[ii] * datareg; - } -} - -// ************************************************* -// Binary TanH Activation -// ************************************************* -template -void binary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - //#pragma HLS PIPELINE - - data_T datareg; - res_T cache; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - if (datareg > 0) - cache = 1; - else - cache = -1; - - res[ii] = (res_T)cache; - } -} - -// ************************************************* -// Ternary TanH Activation -// ************************************************* -template -void ternary_tanh(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - //#pragma HLS PIPELINE - - data_T datareg; - res_T cache; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = 2 * data[ii]; - if (datareg > 1) - cache = 1; - else if (datareg > -1 && datareg <= 1) - cache = 0; - else - cache = -1; - - res[ii] = (res_T)cache; - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_array.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_array.h.bak deleted file mode 100644 index 843f303057..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_array.h.bak +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef NNET_ARRAY_H_ -#define NNET_ARRAY_H_ - -#include - -namespace nnet { - -struct transpose_config { - static const unsigned height = 10; - static const unsigned width = 10; - static const unsigned depth = 10; - static constexpr unsigned perm[3] = {2, 0, 1}; -}; - -template -void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[CONFIG_T::height * CONFIG_T::width]) { - //#pragma HLS PIPELINE - - for (int i = 0; i < CONFIG_T::height; i++) { - for (int j = 0; j < CONFIG_T::width; j++) { 
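(The single assignment that follows is the entire transpose: in row-major layout,
input element (i, j) lives at data[i * CONFIG_T::width + j] and is written to output
position (j, i), i.e. data_t[j * CONFIG_T::height + i]. A worked example for
height = 2, width = 3: the input {a, b, c, d, e, f}, read as [[a, b, c], [d, e, f]],
comes out as {a, d, b, e, c, f}, i.e. [[a, d], [b, e], [c, f]].)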
- data_t[j * CONFIG_T::height + i] = data[i * CONFIG_T::width + j]; - } - } -} - -template -void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width], - res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) { - unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width}; - unsigned dims_t[3]; - dims_t[0] = dims[CONFIG_T::perm[0]]; - dims_t[1] = dims[CONFIG_T::perm[1]]; - dims_t[2] = dims[CONFIG_T::perm[2]]; - - int idx[3] = {0}, idx_t[3] = {0}; - for (idx[0] = 0; idx[0] < dims[0]; idx[0]++) { - for (idx[1] = 0; idx[1] < dims[1]; idx[1]++) { - for (idx[2] = 0; idx[2] < dims[2]; idx[2]++) { - idx_t[0] = idx[CONFIG_T::perm[0]]; - idx_t[1] = idx[CONFIG_T::perm[1]]; - idx_t[2] = idx[CONFIG_T::perm[2]]; - - data_t[idx_t[0] * dims_t[1] * dims_t[2] + idx_t[1] * dims_t[2] + idx_t[2]] = - data[idx[0] * dims[1] * dims[2] + idx[1] * dims[2] + idx[2]]; - } - } - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h.bak deleted file mode 100644 index a4e4441311..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h.bak +++ /dev/null @@ -1,124 +0,0 @@ -#ifndef NNET_BATCHNORM_H_ -#define NNET_BATCHNORM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_dense.h" -#include - -namespace nnet { - -struct batchnorm_config { - // Internal data type definitions - typedef float bias_t; - typedef float scale_t; - - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_filt = -1; - static const unsigned n_scale_bias = 10; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; - // partitioning arrays cyclically to go with roll factors? 
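(Context for the product/normalize machinery below: at inference time batch
normalization reduces to a per-channel affine map, res[i] = scale[i] * data[i] + bias[i],
with the trained statistics folded into the scale and bias arrays ahead of time.
A minimal sketch of that folding in plain C++; gamma, beta, mean, var and eps are the
usual Keras BatchNormalization quantities, and the helper names are assumptions for
illustration only:

    #include <cmath>

    // y = gamma * (x - mean) / sqrt(var + eps)  + beta  ==  scale * x + bias
    float fold_scale(float gamma, float var, float eps) {
        return gamma / std::sqrt(var + eps);
    }
    float fold_bias(float beta, float gamma, float mean, float var, float eps) {
        return beta - mean * fold_scale(gamma, var, eps);
    }
)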
- template using product = nnet::product::mult; -}; - -template -void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], - typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], - typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { - data_T cache; - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - //#pragma HLS function_instantiate variable=scale,bias - - // For parallel inputs: - // - completely partition arrays -- target fabric - // - if we have an unroll factor, limit number of multipliers - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - // //#pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes - //#pragma HLS ARRAY_PARTITION variable=scale complete - //#pragma HLS ARRAY_PARTITION variable=bias complete - - //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - -// Calcuate result -Result: - for (int ires = 0; ires < CONFIG_T::n_in; ires++) { - if (CONFIG_T::n_filt == -1) { - res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + - bias[ires]; - } else { - int norm_index = ires % CONFIG_T::n_filt; - res[ires] = - CONFIG_T::template product::product(data[ires], scale[norm_index]) + - bias[norm_index]; - } - } -} - -// **************************************************** -// Merged Batch Normalization and Quantized Tanh -// **************************************************** -struct batchnorm_quantized_tanh_config { - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_filt = -1; - static const unsigned n_scale_bias = 10; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const unsigned n_zeros = 0; -}; - -template -void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ap_uint<1> res[CONFIG_T::n_in], - data_T threshold[CONFIG_T::n_scale_bias]) { - //#pragma HLS PIPELINE - //#pragma HLS ARRAY_PARTITION variable=res complete - - data_T datareg; - ap_uint<1> cache; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; - if (datareg >= threshold[norm_index]) - cache = 1; - else - cache = 0; - - res[ii] = cache; - } -} - -template -void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ap_int<2> res[CONFIG_T::n_in], - data_T threshold_hi[CONFIG_T::n_scale_bias], data_T threshold_lo[CONFIG_T::n_scale_bias]) { - //#pragma HLS PIPELINE - //#pragma HLS ARRAY_PARTITION variable=res complete - - data_T datareg; - ap_int<2> cache; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - int norm_index = CONFIG_T::n_filt == -1 ? 
ii : ii % CONFIG_T::n_filt; - if (datareg > threshold_hi[norm_index]) - cache = 1; - else if (datareg <= threshold_lo[norm_index]) - cache = -1; - else - cache = 0; - - res[ii] = cache; - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h.bak deleted file mode 100644 index 21514e3c79..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h.bak +++ /dev/null @@ -1,123 +0,0 @@ -#ifndef NNET_BATCHNORM_STREAM_H_ -#define NNET_BATCHNORM_STREAM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_mult.h" -#include "nnet_types.h" - -namespace nnet { - -// **************************************************** -// Streaming Batch Normalization -// **************************************************** - -template -void normalize(hls::stream &data, hls::stream &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], - typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { - //#pragma HLS ARRAY_PARTITION variable=scale complete - //#pragma HLS ARRAY_PARTITION variable=bias complete - - constexpr unsigned ii = CONFIG_T::n_in / CONFIG_T::multiplier_limit; - //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - -BatchNormLoop: - for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { - //#pragma HLS PIPELINE II=ii - - data_T in_data = data.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - BatchNormpack: - for (int j = 0; j < data_T::size; j++) { - //#pragma HLS UNROLL - int norm_index; - if (CONFIG_T::n_filt == -1) { - norm_index = i * data_T::size + j; - } else { - norm_index = j % CONFIG_T::n_filt; - } - out_data[j] = CONFIG_T::template product::product( - in_data[j], scale[norm_index]) + - bias[norm_index]; - } - - res.write(out_data); - } -} - -// **************************************************** -// Merged Batch Normalization and Quantized Tanh -// **************************************************** -template -void normalize_binary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, - typename data_T::value_type threshold[CONFIG_T::n_scale_bias]) { - //#pragma HLS ARRAY_PARTITION variable=threshold complete - -BinaryNormLoop: - for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { - //#pragma HLS PIPELINE - - data_T in_data = data.read(); - nnet::array, CONFIG_T::n_scale_bias> out_data; - PRAGMA_DATA_PACK(out_data) - - BatchNormPack: - for (int j = 0; j < data_T::size; j++) { - //#pragma HLS UNROLL - int norm_index; - if (CONFIG_T::n_filt == -1) { - norm_index = i * data_T::size + j; - } else { - norm_index = j % CONFIG_T::n_filt; - } - out_data[j] = (in_data[j] >= threshold[norm_index]) ? 
1 : 0;
-        }
-
-        res.write(out_data);
-    }
-}
-
-template <class data_T, typename CONFIG_T>
-void normalize_ternary_tanh(hls::stream<data_T> &data, hls::stream<nnet::array<ap_int<2>, CONFIG_T::n_scale_bias>> &res,
-                            typename data_T::value_type threshold_hi[CONFIG_T::n_scale_bias],
-                            typename data_T::value_type threshold_lo[CONFIG_T::n_scale_bias]) {
-    //#pragma HLS ARRAY_PARTITION variable=threshold_hi complete
-    //#pragma HLS ARRAY_PARTITION variable=threshold_lo complete
-
-TernaryNormLoop:
-    for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) {
-        //#pragma HLS PIPELINE
-
-        data_T in_data = data.read();
-        nnet::array<ap_int<2>, CONFIG_T::n_scale_bias> out_data;
-        PRAGMA_DATA_PACK(out_data)
-
-    BatchNormPack:
-        for (int j = 0; j < data_T::size; j++) {
-            //#pragma HLS UNROLL
-
-            int norm_index;
-            if (CONFIG_T::n_filt == -1) {
-                norm_index = i * data_T::size + j;
-            } else {
-                norm_index = j % CONFIG_T::n_filt;
-            }
-
-            if (in_data[j] > threshold_hi[norm_index]) {
-                out_data[j] = 1;
-            } else if (in_data[j] <= threshold_lo[norm_index]) {
-                out_data[j] = -1;
-            } else {
-                out_data[j] = 0;
-            }
-        }
-
-        res.write(out_data);
-    }
-}
-
-} // namespace nnet
-
-#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_common.h.bak
deleted file mode 100644
index 7a65548bed..0000000000
--- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h.bak
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef NNET_COMMON_H_
-#define NNET_COMMON_H_
-
-#include "ap_fixed.h"
-
-// This is a substitute for "ceil(n/(float)d)".
-#define DIV_ROUNDUP(n, d) ((n + d - 1) / d)
-#define MIN(n, d) (n > d ? d : n)
-#define MAX(n, d) (n > d ? n : d)
-
-#define STRINGIFY(x) #x
-#define EXPAND_STRING(x) STRINGIFY(x)
-
-#ifndef __VITIS_HLS__
-#define DATA_PACK_TXT HLS DATA_PACK variable =
-#define DATA_PACK_PRAGMA(variable) DATA_PACK_TXT variable
-#define PRAGMA_DATA_PACK(variable) _Pragma(EXPAND_STRING(DATA_PACK_PRAGMA(variable)))
-#else
-#define PRAGMA_DATA_PACK(variable)
-#endif
-
-namespace nnet {
-
-// Common type definitions
-enum io_type { io_parallel = 0, io_stream };
-enum strategy { latency, resource };
-
-/* ---
- * Balanced tree reduce implementation.
- * For use in scenarios where Vivado cannot expression balance
- * Reduces an array of inputs to a single value using the template binary operator 'Op',
- * for example summing all elements with Op_add, or finding the maximum with Op_max
- * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section
- * before applying and accumulate the result over the rolled dimension.
- * --- */
-template <class T, int N, class Op> T reduce(const T *x, Op op) {
-    static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0;
-    static constexpr int rightN = N - leftN > 0 ? N - leftN : 0;
-    if (N == 1) {
-        return x[0];
-    }
-    if (N == 2) {
-        return op(x[0], x[1]);
-    }
-    return op(reduce<T, leftN, Op>(x, op), reduce<T, rightN, Op>(x + leftN, op));
-}
-
-template <class T> class Op_add {
-  public:
-    T operator()(T a, T b) { return a + b; }
-};
-
-template <class T> class Op_and {
-  public:
-    T operator()(T a, T b) { return a && b; }
-};
-
-template <class T> class Op_or {
-  public:
-    T operator()(T a, T b) { return a || b; }
-};
-
-template <class T> class Op_max {
-  public:
-    T operator()(T a, T b) { return a >= b ? a : b; }
-};
-
-template <class T> class Op_min {
-  public:
-    T operator()(T a, T b) { return a <= b ?
a : b; } -}; - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h.bak deleted file mode 100644 index 8ee579ccf2..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h.bak +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef NNET_CONV1D_H_ -#define NNET_CONV1D_H_ - -#include "nnet_common.h" -#include "nnet_conv1d_latency.h" -#include "nnet_conv1d_resource.h" -#include - -namespace nnet { - -struct conv1d_config { - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; - - // Convolutional parameters - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const unsigned in_width = 10; - static const unsigned n_chan = 0; - static const unsigned filt_width = 1; - static const unsigned kernel_size = filt_width; - static const unsigned n_filt = 1; - static const unsigned stride_width = 1; - static const unsigned dilation = 1; - static const unsigned out_width = 10; //(N_IN + PAD_LEFT * PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1 - - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; // not used yet -}; - -template -void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - //#pragma HLS INLINE region - - if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); - } else { - conv_1d_resource_cl(data, res, weights, biases); - } -} - -template -void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - - //#pragma HLS INLINE region - - // Nothing special to be done for io_parallel implementation - if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); - } else { - conv_1d_resource_cl(data, res, weights, biases); - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h.bak deleted file mode 100644 index 3ec7605df1..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h.bak +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef NNET_CONV1D_STREAM_H_ -#define NNET_CONV1D_STREAM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_conv_stream.h" - -namespace nnet { - -template -void compute_scaled_indices_1d(const unsigned w_idx, ap_uint *pixel_idx) { - unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); - -ComputeIndex: - for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { - //#pragma HLS UNROLL - unsigned sw_idx = - CONFIG_T::template scale_index::scale_index( - wp_idx + p); - pixel_idx[p] = CONFIG_T::pixels[sw_idx]; - } -} - -template -void conv_1d_encoded_cl(hls::stream &data, hls::stream &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); - - hls::stream 
data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; - const int win_depth = CONFIG_T::out_width; - for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { - //#pragma HLS STREAM variable=data_window[i_out] depth=win_depth - } - - //#pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete - - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - unsigned outputs_ready = 0; - - ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; - //#pragma HLS ARRAY_PARTITION variable=pixel_idx complete - -ReadInputWidth: - for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { - //#pragma HLS LOOP_FLATTEN - if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - } - compute_scaled_indices_1d(i_iw, pixel_idx); - compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, - biases, pixel_idx); - } -} - -template -void conv_1d_buffer_cl(hls::stream &data, hls::stream &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); - -ReadInputWidth: - for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { - //#pragma HLS LOOP_FLATTEN - if (CONFIG_T::strategy == nnet::latency) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - } - compute_output_buffer_1d(data.read(), res, weights, biases); - } -} - -template -void conv_1d_cl(hls::stream &data, hls::stream &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - //#pragma HLS inline recursive - switch (CONFIG_T::implementation) { - case conv_implementation::linebuffer: - conv_1d_buffer_cl(data, res, weights, biases); - break; - case conv_implementation::encoded: - conv_1d_encoded_cl(data, res, weights, biases); - break; - } -} - -} // namespace nnet -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h.bak deleted file mode 100644 index 5291fad408..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h.bak +++ /dev/null @@ -1,75 +0,0 @@ -#ifndef NNET_CONV2D_H_ -#define NNET_CONV2D_H_ - -#include "nnet_common.h" -#include "nnet_conv2d_latency.h" -#include "nnet_conv2d_resource.h" -#include - -namespace nnet { - -struct conv2d_config { - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; - - // Convolutional parameters - static const unsigned pad_top = 0; - static const unsigned pad_bottom = 0; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const unsigned in_height = 10; - static const unsigned in_width = 10; - static const unsigned n_chan = 1; - static const unsigned filt_height = 1; - static const unsigned filt_width = 1; - static const unsigned kernel_size = filt_height * filt_width; - static const unsigned n_filt = 1; - static const unsigned stride_height = 1; - static const unsigned stride_width = 1; - static const unsigned out_height = 10; - static const unsigned out_width = 10; - static const unsigned dilation_height = 1; - static const unsigned dilation_width = 1; - - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; // not used yet -}; - -template -void 
conv_2d_cl( - data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - //#pragma HLS INLINE region - - if (CONFIG_T::strategy == nnet::latency) { - conv_2d_latency_cl(data, res, weights, biases); - } else { - conv_2d_resource_cl(data, res, weights, biases); - } -} - -template -void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - - //#pragma HLS INLINE region - - // Nothing special to be done for io_parallel implementation - if (CONFIG_T::strategy == nnet::latency) { - conv_2d_latency_cl(data, res, weights, biases); - } else { - conv_2d_resource_cl(data, res, weights, biases); - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h.bak deleted file mode 100644 index b1af08a080..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h.bak +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef NNET_CONV2D_LATENCY_H_ -#define NNET_CONV2D_LATENCY_H_ - -#include "nnet_common.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -template -void conv_2d_latency_cl( - data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; - constexpr unsigned mult_n_out = CONFIG_T::n_filt; - - data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; - //#pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 - - typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out]; - //#pragma HLS ARRAY_PARTITION variable=mult complete - - typename CONFIG_T::accum_t acc[mult_n_out]; - //#pragma HLS ARRAY_PARTITION variable=acc complete - - //#pragma HLS ARRAY_PARTITION variable=weights complete - //#pragma HLS ARRAY_PARTITION variable=biases complete - - // Limit multipliers to control parallelization - //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit - -PartitionLoop: - for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind - - CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); - - PixelLoop: - for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { - //#pragma HLS UNROLL - - data_T cache; - - // Do the matrix-multiply - Product1: - for (int i_in = 0; i_in < mult_n_in; i_in++) { - //#pragma HLS UNROLL - cache = data_buf[i_pxl][i_in]; - Product2: - for (int i_out = 0; i_out < mult_n_out; i_out++) { - //#pragma HLS UNROLL - mult[i_in * mult_n_out + i_out] = - CONFIG_T::mult_config::template product::product( - cache, weights[i_in * mult_n_out + i_out]); - } - } - - // Initialize accumulator with input biases - ResetAccum: - for (int i_acc = 0; i_acc < 
mult_n_out; i_acc++) { - //#pragma HLS UNROLL - acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; - } - - // Accumulate multiplication result - Accum1: - for (int i_in = 0; i_in < mult_n_in; i_in++) { - //#pragma HLS UNROLL - Accum2: - for (int i_out = 0; i_out < mult_n_out; i_out++) { - //#pragma HLS UNROLL - acc[i_out] += mult[i_in * mult_n_out + i_out]; - } - } - - // Cast to "res_t" type - Result: - for (int i_res = 0; i_res < mult_n_out; i_res++) { - //#pragma HLS UNROLL - *(res++) = cast(acc[i_res]); - } - } - } -} - -} // namespace nnet -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h.bak deleted file mode 100644 index ee723f74e9..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h.bak +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef NNET_DENSE_H_ -#define NNET_DENSE_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_dense_latency.h" -#include "nnet_dense_resource.h" -#include "nnet_dense_seq.h" -#include "nnet_helpers.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -struct dense_config { - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; - - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_out = 10; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned strategy = latency; - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; - // partitioning arrays cyclically to go with roll factors? - // Product function to use - template using product = nnet::product::mult; -}; - -template -void dense(data_T data[CONFIG_T::n_in * CONFIG_T::seq_len], res_T res[CONFIG_T::n_out * CONFIG_T::seq_len], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - //#pragma HLS inline - if (CONFIG_T::seq_len > 1) { - dense_seq(data, res, weights, biases); - } else { - if (CONFIG_T::strategy == nnet::latency) { - dense_latency(data, res, weights, biases); - } else { - dense_resource(data, res, weights, biases); - } - } - - // std::cout << "out Dense: " << std::endl; - // for(int i=0; i < CONFIG_T::n_out*CONFIG_T::seq_len; ++i) { - // std::cout << res[i] << " "; - // } - // std::cout << std::endl; -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h.bak deleted file mode 100644 index 02e56e532b..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h.bak +++ /dev/null @@ -1,90 +0,0 @@ -#ifndef NNET_COMPRESSED_LAYER_H_ -#define NNET_COMPRESSED_LAYER_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_dense.h" -#include - -namespace nnet { - -template -void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], - typename CONFIG_T::accum_t weight) { - for (unsigned k = 0; k < CONFIG_T::n_out; k++) { - //#pragma HLS UNROLL - if (k == index) - mult[k] += weight; - } -} - -template -void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor); - - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - //#pragma 
HLS ARRAY_PARTITION variable=acc complete - //#pragma HLS ARRAY_PARTITION variable=biases complete - //#pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit - -#ifdef __VITIS_HLS__ - //#pragma HLS AGGREGATE variable=weights -#else - //#pragma HLS data_pack variable=weights struct_level -#endif - -InitAccum: - for (unsigned i = 0; i < CONFIG_T::n_out; i++) { - //#pragma HLS UNROLL - acc[i] = (typename CONFIG_T::accum_t)(biases[i]); - } - - // Do the compressed matrix-multiply - const int rufactor = CONFIG_T::reuse_factor; -ReuseLoop: - for (unsigned ir = 0; ir < rufactor; ir++) { - //#pragma HLS PIPELINE II=1 rewind - - typename CONFIG_T::accum_t mult[CONFIG_T::n_out]; - //#pragma HLS ARRAY_PARTITION variable=mult complete - - ResetMult: - for (int imult = 0; imult < CONFIG_T::n_out; imult++) { - //#pragma HLS UNROLL - mult[imult] = 0; - } - - CompressedMultLoop: - for (unsigned im = 0; im < multiplier_limit; im++) { - //#pragma HLS UNROLL - unsigned w = im * rufactor + ir; - auto row = weights[w].row_index; - auto col = weights[w].col_index; - auto weight_cache = weights[w].weight; - data_T data_cache = data[row]; - // mult[col] += weight_cache * data_cache; - typename CONFIG_T::accum_t prod = - CONFIG_T::template product::product(data_cache, weight_cache); - fill_mult(col, mult, prod); - } - - for (int im = 0; im < CONFIG_T::n_out; im++) { - acc[im] += mult[im]; - } - } - -// Cast to "res_t" type -ResultLoop: - for (unsigned i = 0; i < CONFIG_T::n_out; i++) { - //#pragma HLS UNROLL - // res[i] = (res_T) (acc[i]); - res[i] = cast(acc[i]); - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h.bak deleted file mode 100644 index 81c137e54e..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h.bak +++ /dev/null @@ -1,72 +0,0 @@ -#ifndef NNET_DENSE_LATENCY_H_ -#define NNET_DENSE_LATENCY_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_helpers.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -template -void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - data_T cache; - typename CONFIG_T::accum_t mult[CONFIG_T::n_in * CONFIG_T::n_out]; - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - //#pragma HLS function_instantiate variable=weights,biases - - // For parallel inputs: - // - completely partition arrays -- target fabric - // - if we have an unroll factor, limit number of multipliers - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - //#pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes - //#pragma HLS ARRAY_PARTITION variable=biases complete - //#pragma HLS ARRAY_PARTITION variable=mult complete - //#pragma HLS ARRAY_PARTITION variable=acc complete - - //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - -// Do the matrix-multiply -Product1: - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - cache = data[ii]; - Product2: - for (int jj = 0; jj < CONFIG_T::n_out; jj++) { - int index = ii * CONFIG_T::n_out + jj; - mult[index] = CONFIG_T::template product::product(cache, weights[index]); - } - } - -// Initialize accumulator with input biases -ResetAccum: - for (int 
iacc = 0; iacc < CONFIG_T::n_out; iacc++) { - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -// Accumulate multiplication result -Accum1: - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - Accum2: - for (int jj = 0; jj < CONFIG_T::n_out; jj++) { - int index = ii * CONFIG_T::n_out + jj; - acc[jj] += mult[index]; - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - // res[ires] = (res_T) (acc[ires]); - res[ires] = cast(acc[ires]); - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h.bak deleted file mode 100644 index 17ef1930fa..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h.bak +++ /dev/null @@ -1,263 +0,0 @@ -#ifndef NNET_DENSE_RESOURCE_H_ -#define NNET_DENSE_RESOURCE_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_mult.h" -#include -#include - -namespace nnet { - -template -void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - const int rufactor = CONFIG_T::reuse_factor; - const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); - const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); - const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); - const int multscale = multiplier_limit / CONFIG_T::n_out; - const int nin = CONFIG_T::n_in; - const int nout = CONFIG_T::n_out; - - assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); - - //#pragma HLS function_instantiate variable=weights,biases - ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - //#pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - //#pragma HLS ARRAY_PARTITION variable=acc complete - -InitAccum: - for (int iacc = 0; iacc < nout; iacc++) { - //#pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - //#pragma HLS PIPELINE II=1 rewind - - int w_index = ir; - int in_index = ir; - int out_index = 0; - int acc_step = 0; - - MultLoop: - for (int im = 0; im < block_factor; im++) { - //#pragma HLS UNROLL - - acc[out_index] += static_cast( - CONFIG_T::template product::product(data[in_index], weights[w_index])); - - // Increment w_index - w_index += rufactor; - // Increment in_index - in_index += rufactor; - if (in_index >= nin) { - in_index = ir; - } - // Increment out_index - if (acc_step + 1 >= multscale) { - acc_step = 0; - out_index++; - } else { - acc_step++; - } - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - //#pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - -template -void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); - 
const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); - const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); - const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); - const int multscale = multiplier_limit / CONFIG_T::n_out; - const int nin = CONFIG_T::n_in; - const int nout = CONFIG_T::n_out; - - assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); - - //#pragma HLS function_instantiate variable=weights,biases - ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - //#pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - //#pragma HLS ARRAY_PARTITION variable=acc complete - -InitAccum: - for (int iacc = 0; iacc < nout; iacc++) { - //#pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - - int w_index; - int in_index = 0; - int out_index; - int outstep = 0; - const int outscale = rufactor / nin; - - int outidx[rufactor]; -IndexLoop: - for (int ir = 0; ir < rufactor; ir++) { - outidx[ir] = outstep; - if ((ir + 1) % nin == 0) { - outstep++; - } - } - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - //#pragma HLS PIPELINE II=1 rewind - - w_index = ir; - out_index = outidx[ir] /*outstep*/; - - MultLoop: - for (int im = 0; im < block_factor; im++) { - //#pragma HLS UNROLL - acc[out_index] += static_cast( - CONFIG_T::template product::product(data[in_index], weights[w_index])); - - w_index += rufactor; - if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) - break; // check out of bounds - out_index += outscale; - } - - in_index++; - if (in_index >= nin) { - in_index = 0; - // outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. 
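(Worth spelling out the workaround referenced above: inside the pipelined ReuseLoop,
out_index must advance by the fixed stride outscale = rufactor / nin, and the
per-iteration starting offsets are hoisted into the outidx[] array beforehand so the
scheduler never sees a data-dependent outstep++ in the inner loop. A minimal
host-side sketch of the same index schedule, assuming rufactor = 8 and nin = 4
purely for illustration:

    int outidx[8];
    int outstep = 0;
    for (int ir = 0; ir < 8; ir++) {
        outidx[ir] = outstep;          // yields {0, 0, 0, 0, 1, 1, 1, 1}
        if ((ir + 1) % 4 == 0) {
            outstep++;
        }
    }
)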
- } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - //#pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - -template -void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - const int rufactor = CONFIG_T::reuse_factor; - const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); - const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); - const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); - const int multscale = multiplier_limit / CONFIG_T::n_out; - const int nin = CONFIG_T::n_in; - const int nout = CONFIG_T::n_out; - - assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - assert((rufactor > nin) && "This function is correct only for RF > N_IN"); - - //#pragma HLS function_instantiate variable=weights,biases - ////#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - //#pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - //#pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - //#pragma HLS ARRAY_PARTITION variable=acc complete - -InitAccum: - for (int iacc = 0; iacc < nout; iacc++) { - //#pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - //#pragma HLS PIPELINE II=1 rewind - typename CONFIG_T::accum_t tmpmult[block_factor]; - //#pragma HLS ARRAY_PARTITION variable=tmpmult complete - - MultLoop: - for (int im = 0; im < block_factor; im++) { - //#pragma HLS UNROLL - int w_index = ir + rufactor * im; - int in_index = w_index % nin; - if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) - continue; // check out of bounds - tmpmult[im] = - CONFIG_T::template product::product(data[in_index], weights[w_index]); - } - - typename CONFIG_T::accum_t mult[multiplier_limit]; - //#pragma HLS ARRAY_PARTITION variable=mult complete - - ResetMult: - for (int imult = 0; imult < multiplier_limit; imult++) { - //#pragma HLS UNROLL - mult[imult] = 0; - } - - AccumLoop1: - for (int im = 0; im < block_factor; im++) { - //#pragma HLS UNROLL - int w_index = ir + rufactor * im; - int out_index = w_index / multfactor; - if (out_index >= multiplier_limit) - continue; // check out of bounds - mult[out_index] += tmpmult[im]; - } - - AccumLoop2: - for (int im = 0; im < multiplier_limit; im++) { - //#pragma HLS UNROLL - // int out_index = im/multscale; // This is the general case - // acc[out_index] += mult[im]; - acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - //#pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - -template -void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - //#pragma HLS INLINE recursive - - if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { - dense_resource_rf_leq_nin(data, res, weights, biases); - } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { - dense_resource_rf_gt_nin_rem0(data, res, weights, biases); - } else { - dense_resource_rf_gt_nin(data, res, 
weights, biases); - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h.bak deleted file mode 100644 index 53b9ec480b..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h.bak +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef NNET_DENSE_SEQ_H_ -#define NNET_DENSE_SEQ_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_helpers.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -template -void dense_seq(data_T data[CONFIG_T::n_in * CONFIG_T::seq_len], res_T res[CONFIG_T::n_out * CONFIG_T::seq_len], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - //#pragma HLS inline - - data_T in_val[CONFIG_T::n_in]; - //#pragma HLS ARRAY_PARTITION variable=in_val complete - - if (CONFIG_T::strategy == nnet::latency) { - for (int j = 0; j < CONFIG_T::seq_len; ++j) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - for (int i = 0; i < CONFIG_T::n_in; ++i) { - //#pragma HLS UNROLL - in_val[i] = data[j * CONFIG_T::n_in + i]; - } - dense_latency(in_val, res + (CONFIG_T::n_out * j), weights, biases); - } - } else { - for (int j = 0; j < CONFIG_T::seq_len; ++j) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - for (int i = 0; i < CONFIG_T::n_in; ++i) { - //#pragma HLS UNROLL - in_val[i] = data[j * CONFIG_T::n_in + i]; - } - dense_resource(in_val, res + (CONFIG_T::n_out * j), weights, biases); - } - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h.bak deleted file mode 100644 index adcbad6afb..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h.bak +++ /dev/null @@ -1,816 +0,0 @@ -#ifndef NNET_GARNET_H_ -#define NNET_GARNET_H_ - -#include "hls_math.h" -#include "hls_stream.h" -#include "nnet_common.h" - -namespace nnet { -namespace garnet_utils { - -template -inline typename std::enable_if::value>::type -initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { - typedef ap_uint index_t; - - unsigned const table_size = (1 << CONFIG_T::distance_width); - - index_t index; - typename CONFIG_T::distance_t distance; - - // edge_weight_t is ap_ufixed with 0 iwidth -> let index 0 be a saturated version of 1 - edge_weights_table[0] = ap_ufixed(1.); - - for (unsigned iw = 1; iw < table_size; ++iw) { - index = iw; - distance.range(CONFIG_T::distance_width - 1, 0) = index.range(CONFIG_T::distance_width - 1, 0); - edge_weights_table[iw] = hls::exp(-distance * distance); - } -} - -template -inline typename std::enable_if::value>::type -initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { - unsigned const table_size = (1 << CONFIG_T::distance_width); - double const step = 64. 
/ table_size; - - typename CONFIG_T::distance_t v = -32.; - for (unsigned iw = 0; iw < table_size; ++iw) { - edge_weights_table[iw] = std::exp(-v * v); - v += step; - } -} - -template -inline typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type -get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { - typedef ap_uint index_t; - - index_t index(distance.range(CONFIG_T::distance_width - 1, 0)); - - return edge_weights_table[index]; -} - -template -inline - typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type - get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { - unsigned const table_size = (1 << CONFIG_T::distance_width); - double const step = 64. / table_size; - - int index = (distance + 32.) / step; - if (index < 0) - index = 0; - else if (index >= table_size) - index = table_size - 1; - - return edge_weights_table[index]; -} - -template typename CONFIG_T::edge_weight_t compute_edge_weight(typename CONFIG_T::distance_t distance) { - if (CONFIG_T::is_stack) { - //#pragma HLS INLINE OFF - } -#ifdef __SYNTHESIS__ - typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; - // unsigned const reshape_factor = CONFIG_T::n_aggregators * CONFIG_T::n_in_features * (CONFIG_T::n_vertices / - // CONFIG_T::reuse_factor); - // //#pragma HLS ARRAY_RESHAPE variable=edge_weights_table cyclic factor=reshape_factor dim=1 - bool initialized = false; -#else - static typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; - static bool initialized = false; -#endif - if (not initialized) { - initialize_edge_weights_table(edge_weights_table); - initialized = true; - } - - return get_edge_weight(distance, edge_weights_table); -} - -template -inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, - exponent_T exponent) { - //#pragma HLS INLINE - return dividend >> exponent; -} - -template -inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, - exponent_T exponent) { - //#pragma HLS INLINE - return dividend / std::pow(2., exponent); -} - -template struct Means { - typedef E edge_weight_t; - - edge_weight_t edge_weight_mean[CONFIG_T::n_aggregators]; - typename CONFIG_T::aggr_t weighted_feature_mean[CONFIG_T::n_aggregators * CONFIG_T::n_in_features]; - - Means() { - //#pragma HLS INLINE - //#pragma HLS ARRAY_PARTITION variable=edge_weight_mean complete - //#pragma HLS ARRAY_PARTITION variable=weighted_feature_mean complete - //#pragma HLS UNROLL region - - Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - edge_weight_mean[ia] = 0.; - - InFeatures: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - weighted_feature_mean[iax] = 0.; - } - } - } - - void set_weight(unsigned, edge_weight_t const &) { - //#pragma HLS INLINE - } - - void add_means_normalized(Means const &local) { - //#pragma HLS INLINE - // Always called within a pipelined region - no UNROLL needed - - unsigned const log2_unroll_factor = CONFIG_T::n_vertices_width - CONFIG_T::log2_reuse_factor; - - Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - edge_weight_mean[ia] += normalize_log2(local.edge_weight_mean[ia], log2_unroll_factor); - - InFeatures: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const iax = ia * 
CONFIG_T::n_in_features + ix; - weighted_feature_mean[iax] += normalize_log2(local.weighted_feature_mean[iax], log2_unroll_factor); - } - } - } - - template - typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { - //#pragma HLS INLINE - //#pragma HLS UNROLL region - - // accum comes divided by unroll factor - typename T::norm_t nvtx_norm = (T::n_vertices / T::reuse_factor) / nvtx; - - Aggregators: - for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { - edge_weight_mean[ia] = accum.edge_weight_mean[ia] * nvtx_norm; - - InFeatures: - for (unsigned ix = 0; ix < T::n_in_features; ++ix) { - unsigned const iax = ia * T::n_in_features + ix; - - weighted_feature_mean[iax] = accum.weighted_feature_mean[iax] * nvtx_norm; - } - } - } - - template - typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { - //#pragma HLS INLINE - //#pragma HLS UNROLL region - - Aggregators: - for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { - - edge_weight_mean[ia] = normalize_log2(accum.edge_weight_mean[ia], T::log2_reuse_factor); - - InFeatures: - for (unsigned ix = 0; ix < T::n_in_features; ++ix) { - unsigned const iax = ia * T::n_in_features + ix; - - weighted_feature_mean[iax] = normalize_log2(accum.weighted_feature_mean[iax], T::log2_reuse_factor); - } - } - } -}; - -template struct WeightsAndMeans : public Means { - typedef E edge_weight_t; - - edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; - - WeightsAndMeans() : Means() { - //#pragma HLS INLINE - unsigned const reshape_factor = CONFIG_T::n_aggregators * (CONFIG_T::n_vertices / CONFIG_T::reuse_factor); - //#pragma HLS ARRAY_PARTITION variable=edge_weights cyclic factor=reshape_factor - } - - void set_weight(unsigned iva, edge_weight_t const &weight) { - //#pragma HLS INLINE - edge_weights[iva] = weight; - } -}; - -template struct OutputBiasNormalizer; - -template -struct OutputBiasNormalizer::type> { - typedef typename CONFIG_T::output_transform_biases_t biases_t; - - biases_t const (&output_biases)[CONFIG_T::n_out_features]; - - OutputBiasNormalizer(nvtx_T const) : output_biases{CONFIG_T::output_transform_biases} { - //#pragma HLS INLINE - } -}; - -template -struct OutputBiasNormalizer::type> { - typedef typename CONFIG_T::output_transform_biases_t biases_t; - - biases_t output_biases[CONFIG_T::n_out_features]; - - OutputBiasNormalizer(nvtx_T const nvtx) { - //#pragma HLS ARRAY_PARTITION variable=output_biases complete - //#pragma HLS UNROLL region - - // Cannot add a loop label here due to a Vivado HLS bug, apparently - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - typename CONFIG_T::aggr_t bias = CONFIG_T::output_transform_biases[io]; - bias *= nvtx; - output_biases[io] = normalize_log2(bias, CONFIG_T::n_vertices_width); - } - } -}; - -template struct InputDataGetter { - typedef data_T data_t; - - data_T const *dataref; - - InputDataGetter(data_T const *d) : dataref{d} { - //#pragma HLS INLINE - } - data_T const &get(unsigned iv, unsigned ix) const { - //#pragma HLS INLINE - unsigned const ivx = iv * CONFIG_T::n_in_features + ix; - return dataref[ivx]; - } -}; - -template struct SingleVertexDataGetter { - typedef data_T data_t; - - data_T const (&dataref)[CONFIG_T::n_in_features]; - - SingleVertexDataGetter(data_T const (&d)[CONFIG_T::n_in_features]) : dataref{d} { - //#pragma HLS INLINE - } - data_T const &get(unsigned, unsigned ix) const { - //#pragma HLS INLINE - return dataref[ix]; - } -}; - -template struct OutputResSetter 
{ - typedef res_T res_t; - - res_T *resref; - - OutputResSetter(res_T *r) : resref{r} { - //#pragma HLS INLINE - } - void set(unsigned iv, unsigned io, res_T const &acc) { - //#pragma HLS INLINE - unsigned const ivo = iv * CONFIG_T::n_out_features + io; - resref[ivo] = acc; - } -}; - -template struct SingleVertexResSetter { - typedef res_T res_t; - - res_T (&resref)[CONFIG_T::n_out_features]; - - SingleVertexResSetter(res_T (&r)[CONFIG_T::n_out_features]) : resref{r} { - //#pragma HLS INLINE - } - void set(unsigned, unsigned io, res_T const &acc) { - //#pragma HLS INLINE - resref[io] = acc; - } -}; - -template -inline void compute_weights_aggregates(data_getter_T const &data_getter, unsigned iv, arrays_local_T &arrays_local, - arrays_T &arrays) { - //#pragma HLS INLINE - -Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - typename CONFIG_T::distance_t distance = CONFIG_T::aggregator_distance_biases[ia]; - - InFeatures1: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - - typename CONFIG_T::distance_t incr = data_getter.get(iv, ix) * CONFIG_T::aggregator_distance_weights[iax]; - - distance += incr; - } - - typename CONFIG_T::edge_weight_t edge_weight = - garnet_utils::compute_edge_weight(distance); - - arrays_local.edge_weight_mean[ia] += edge_weight; - - InFeatures2: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - - typename data_getter_T::data_t incr = data_getter.get(iv, ix) * edge_weight; - - arrays_local.weighted_feature_mean[iax] += incr; - } - - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - arrays.set_weight(iva, edge_weight); - } -} - -template -inline typename CONFIG_T::aggr_t compute_output_base_core(arrays_T const &arrays, unsigned io, unsigned ia) { - //#pragma HLS INLINE - //#pragma HLS UNROLL region - - unsigned const ioa = io * CONFIG_T::n_aggregators + ia; - typename CONFIG_T::aggr_t aggr = arrays.edge_weight_mean[ia] * CONFIG_T::input_transform_biases[ioa]; - -InFeatures: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const ioax = ioa * CONFIG_T::n_in_features + ix; - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - - aggr += arrays.weighted_feature_mean[iax] * CONFIG_T::input_transform_weights[ioax]; - } - - return aggr; -} - -template -inline void compute_output_base(arrays_T const &arrays, - typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]) { - //#pragma HLS INLINE - //#pragma HLS UNROLL region - -OutFeatures: - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const ioa = io * CONFIG_T::n_aggregators + ia; - - output_base[ioa] = compute_output_base_core(arrays, io, ia); - } - } -} - -template -inline void -compute_vertex_output(arrays_T const &arrays, unsigned iv, - typename CONFIG_T::aggr_t const output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators], - res_setter_T &res_setter) { - //#pragma HLS INLINE - - typename arrays_T::edge_weight_t edge_weights[CONFIG_T::n_aggregators]; - //#pragma HLS ARRAY_PARTITION variable=edge_weights complete - -Aggregators1: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - - edge_weights[ia] = arrays.edge_weights[iva]; - } - -OutFeatures: - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - 
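// For each output feature io, the loop below accumulates
// acc = output_transform_biases[io] + sum over ia of edge_weights[ia] * output_base[io * n_aggregators + ia];
// the aggregator-space projection (output_base) is shared by all vertices,
// so only the per-vertex edge weights vary between calls.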
typename res_setter_T::res_t acc = CONFIG_T::output_transform_biases[io]; - - Aggregators2: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const ioa = io * CONFIG_T::n_aggregators + ia; - - typename res_setter_T::res_t incr = edge_weights[ia] * output_base[ioa]; - acc += incr; - } - - res_setter.set(iv, io, acc); - } -} - -template -void aggregate(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx, arrays_T &arrays) { - InputDataGetter data_getter(data); - - unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; - - Means means_accum; - -VerticesOuter: - for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { - //#pragma HLS PIPELINE - - if (ivv * unroll_factor >= nvtx) - break; - - Means means_local; - - VerticesInner: - for (unsigned ir = 0; ir < unroll_factor; ++ir) { - unsigned iv = ivv * unroll_factor + ir; - - if (iv == nvtx) - break; - - compute_weights_aggregates(data_getter, iv, means_local, arrays); - } - - means_accum.add_means_normalized(means_local); - } - - arrays.set_means_normalized(nvtx, means_accum); -} - -template -void distribute(nvtx_T const nvtx, arrays_T const &arrays, res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { - OutputResSetter res_setter(res); - - typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]; - //#pragma HLS ARRAY_PARTITION variable=output_base complete - - compute_output_base(arrays, output_base); - - unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; - -VerticesOuter: - for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { - //#pragma HLS PIPELINE - - if (ivv * unroll_factor >= nvtx) - break; - - VerticesInner: - for (unsigned ir = 0; ir < unroll_factor; ++ir) { - unsigned iv = ivv * unroll_factor + ir; - - if (iv == nvtx) - break; - - compute_vertex_output(arrays, iv, output_base, res_setter); - } - } -} - -template -void set_output(output_biases_T const &output_transform_biases, arrays_T const &arrays, - res_T res[CONFIG_T::n_out_features]) { - //#pragma HLS PIPELINE - -OutFeatures: - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - res_T acc = output_transform_biases.output_biases[io]; - - Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - typename CONFIG_T::aggr_t aggr = compute_output_base_core(arrays, io, ia); - - acc += arrays.edge_weight_mean[ia] * aggr; - } - - res[io] = acc; - } -} - -template -void distribute_aggregate(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, current_arrays_T ¤t_arrays) { - typedef typename prev_layer_t::output_t data_T; - - typename prev_layer_t::aggr_t prev_output_base[prev_layer_t::n_out_features * prev_layer_t::n_aggregators]; - //#pragma HLS ARRAY_PARTITION variable=prev_output_base complete - - compute_output_base(prev_arrays, prev_output_base); - - unsigned const unroll_factor = current_layer_t::n_vertices >> current_layer_t::log2_reuse_factor; - - Means means_accum; - -VerticesOuter: - for (unsigned ivv = 0; ivv < current_layer_t::reuse_factor; ++ivv) { - //#pragma HLS PIPELINE - - if (ivv * unroll_factor >= nvtx) - break; - - Means means_local; - - VerticesInner: - for (unsigned ir = 0; ir < unroll_factor; ++ir) { - unsigned iv = ivv * unroll_factor + ir; - - if (iv == nvtx) - break; - - data_T data[prev_layer_t::n_out_features]; - //#pragma HLS ARRAY_PARTITION variable=data complete - - SingleVertexResSetter res_setter(data); - - compute_vertex_output(prev_arrays, iv, 
prev_output_base, res_setter); - - SingleVertexDataGetter data_getter(data); - - compute_weights_aggregates(data_getter, iv, means_local, current_arrays); - } - - means_accum.add_means_normalized(means_local); - } - - current_arrays.set_means_normalized(nvtx, means_accum); -} - -template -inline typename std::enable_if::value>::type -sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { - //#pragma HLS INLINE - - distribute_aggregate(nvtx, prev_arrays, last_arrays); -} - -template -inline typename std::enable_if::value>::type -sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { - //#pragma HLS INLINE - - WeightsAndMeans current_arrays; - - distribute_aggregate(nvtx, prev_arrays, current_arrays); - - sublayer(nvtx, current_arrays, last_arrays); -} -} // namespace garnet_utils - -struct garnet_config { - // Layer specs - static const unsigned n_vertices_width = 8; - static const unsigned n_vertices = (1 << n_vertices_width); - static const unsigned n_in_features = 4; - static const unsigned n_propagate = 4; - static const unsigned n_aggregators = 4; - static const unsigned n_out_features = 4; - static const unsigned distance_width = 12; - - // Internal data type definitions - typedef float input_transform_weights_t; - typedef float input_transform_biases_t; - typedef float output_transform_weights_t; - typedef float output_transform_biases_t; - typedef float aggregator_distance_weights_t; - typedef float aggregator_distance_biases_t; - - typedef float norm_t; - typedef float distance_t; - typedef float edge_weight_t; - typedef float edge_weight_aggr_t; - typedef float aggr_t; - typedef float output_t; - - /* static const input_transform_weights_t (&input_transform_weights)[n_out_features * n_aggregators * n_in_features]; */ - /* static const input_transform_biases_t (&input_transform_biases)[n_out_features * n_aggregators]; */ - /* static const aggregator_distance_weights_t (&aggregator_distance_weights)[n_aggregators * n_in_features]; */ - /* static const aggregator_distance_biases_t (&aggregator_distance_biases)[n_aggregators]; */ - /* static const output_transform_biases_t (&output_transform_biases)[n_out_features]; */ - - enum OutputCollapse { no_collapse, collapse_mean, collapse_max }; - - static const unsigned output_collapse = no_collapse; - - static const bool mean_by_nvert = false; - static const bool is_stack = false; - - // Optimization specs - static const unsigned reuse_factor = 64; - static const unsigned log2_reuse_factor = 6; -}; - -// vertices -> vertices -template -typename std::enable_if::type -garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { - //#pragma HLS DATAFLOW - - garnet_utils::WeightsAndMeans arrays; - - garnet_utils::aggregate(data, nvtx[0], arrays); - - garnet_utils::distribute(nvtx[0], arrays, res); -} - -// vertices -> out features -template -typename std::enable_if::type -garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_out_features]) { - //#pragma HLS DATAFLOW - - garnet_utils::Means arrays; - - garnet_utils::aggregate(data, nvtx[0], arrays); - - garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); - - garnet_utils::set_output(normalize_bias, arrays, res); -} - -// vertices -> vertices -template -typename std::enable_if::type -garnet_stack(data_T const data[CONFIG_T::n_vertices * 
CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { - //#pragma HLS DATAFLOW - - typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; - unsigned const ilast = CONFIG_T::n_sublayers - 1; - typedef typename CONFIG_T::template sublayer_t last_layer_t; - - garnet_utils::WeightsAndMeans arrays_first; - garnet_utils::Means arrays_last; - - garnet_utils::aggregate(data, nvtx[0], arrays_first); - - garnet_utils::sublayer(nvtx[0], arrays_first, - arrays_last); - - garnet_utils::distribute(nvtx[0], arrays_last, res); -} - -// vertices -> out features -template -typename std::enable_if::type -garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_out_features]) { - //#pragma HLS DATAFLOW - - typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; - unsigned const ilast = CONFIG_T::n_sublayers - 1; - typedef typename CONFIG_T::template sublayer_t last_layer_t; - - garnet_utils::WeightsAndMeans arrays_first; - garnet_utils::Means arrays_last; - - garnet_utils::aggregate(data, nvtx[0], arrays_first); - - garnet_utils::sublayer(nvtx[0], arrays_first, - arrays_last); - - garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); - - garnet_utils::set_output(normalize_bias, arrays_last, res); -} - -/* Reference (dumb) implementation returning (Vertices, Features) */ -template -typename std::enable_if::type -garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { - typename CONFIG_T::edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; - typename CONFIG_T::aggr_t propagated_features[CONFIG_T::n_vertices * CONFIG_T::n_propagate]; - - for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { - if (iv == nvtx[0]) - break; - - for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { - unsigned const ivp = iv * CONFIG_T::n_propagate + ip; - - propagated_features[ivp] = CONFIG_T::input_transform_biases[ip]; - - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const ivx = iv * CONFIG_T::n_in_features + ix; - unsigned const ipx = ip * CONFIG_T::n_in_features + ix; - - propagated_features[ivp] += data[ivx] * CONFIG_T::input_transform_weights[ipx]; - } - } - - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - - typename CONFIG_T::aggr_t distance = CONFIG_T::aggregator_distance_biases[ia]; - - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const ivx = iv * CONFIG_T::n_in_features + ix; - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - - distance += data[ivx] * CONFIG_T::aggregator_distance_weights[iax]; - } - - edge_weights[iva] = garnet_utils::compute_edge_weight(distance); - } - } - - typename CONFIG_T::aggr_t aggregated_features[CONFIG_T::n_aggregators * CONFIG_T::n_propagate]; - - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { - unsigned const iap = ia * CONFIG_T::n_propagate + ip; - - aggregated_features[iap] = 0.; - - for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { - if (iv == nvtx[0]) - break; - - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - unsigned const ivp = iv * CONFIG_T::n_propagate + ip; - - aggregated_features[iap] += edge_weights[iva] * propagated_features[ivp]; - } - } - } - - for (unsigned ia = 0; ia < 
CONFIG_T::n_aggregators; ++ia) { - for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { - unsigned const iap = ia * CONFIG_T::n_propagate + ip; - - if (CONFIG_T::mean_by_nvert) - aggregated_features[iap] /= nvtx[0]; - else { - // Not using right shift in case aggr_t is float or double - aggregated_features[iap] /= CONFIG_T::n_vertices; - } - } - } - - for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { - if (iv == nvtx[0]) - break; - - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - unsigned const ivo = iv * CONFIG_T::n_out_features + io; - - typename CONFIG_T::aggr_t acc = CONFIG_T::output_transform_biases[io]; - - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - unsigned const ioa = io * CONFIG_T::n_aggregators + ia; - - typename CONFIG_T::aggr_t aggr = 0.; - - for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { - unsigned const iap = ia * CONFIG_T::n_propagate + ip; - unsigned const ioap = ioa * CONFIG_T::n_propagate + ip; - - aggr += CONFIG_T::output_transform_weights[ioap] * aggregated_features[iap]; - } - - acc += edge_weights[iva] * aggr; - } - - res[ivo] = acc; - } - } -} - -/* Reference (dumb) implementation returning (Features) - output averaged over vertices already */ -template -typename std::enable_if::type -garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_out_features]) { - typename CONFIG_T::aggr_t vertex_res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]; - - garnet_ref(data, nvtx, vertex_res); - - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - typename CONFIG_T::aggr_t acc = 0.; - - for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { - if (iv == nvtx[0]) - break; - - unsigned const ivo = iv * CONFIG_T::n_out_features + io; - - acc += vertex_res[ivo]; - } - - if (CONFIG_T::mean_by_nvert) - acc /= nvtx[0]; - else { - // Not using right shift in case aggr_t is float or double - acc /= CONFIG_T::n_vertices; - } - - res[io] = acc; - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h.bak deleted file mode 100644 index 1a3a3d28b5..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h.bak +++ /dev/null @@ -1,382 +0,0 @@ -#ifndef NNET_HELPERS_H -#define NNET_HELPERS_H - -#include "hls_stream.h" -#include -#include -#include -#include -#include -#include -#include -#include - -namespace nnet { - -#ifndef __SYNTHESIS__ - -#ifndef WEIGHTS_DIR -#define WEIGHTS_DIR "weights" -#endif - -template void load_weights_from_txt(T *w, const char *fname) { - - std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); - std::ifstream infile(full_path.c_str(), std::ios::binary); - - if (infile.fail()) { - std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; - exit(1); - } - - std::string line; - if (std::getline(infile, line)) { - std::istringstream iss(line); - std::string token; - - size_t i = 0; - while (std::getline(iss, token, ',')) { - std::istringstream(token) >> w[i]; - i++; - } - - if (SIZE != i) { - std::cerr << "ERROR: Expected " << SIZE << " values"; - std::cerr << " but read only " << i << " values" << std::endl; - } - } -} - -template void load_compressed_weights_from_txt(T *w, const char *fname) { - - std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); - std::ifstream infile(full_path.c_str(), 
std::ios::binary); - - if (infile.fail()) { - std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; - exit(1); - } - - std::string line; - if (std::getline(infile, line)) { - std::istringstream iss(line); - std::string token; - std::string extra_chars = "} "; - - size_t i = 0; - while (std::getline(iss, token, '{')) { - if (token.length() == 0) { - continue; - } - for (char c : extra_chars) { - token.erase(std::remove(token.begin(), token.end(), c), token.end()); - } - if (token.back() == ',') { - token.erase(token.end() - 1); - } - - std::replace(token.begin(), token.end(), ',', ' '); - std::istringstream structss(token); - - if (!(structss >> w[i].row_index >> w[i].col_index >> w[i].weight)) { - std::cerr << "ERROR: Unable to parse file " << std::string(fname); - exit(1); - } - i++; - } - - if (SIZE != i) { - std::cerr << "ERROR: Expected " << SIZE << " values"; - std::cerr << " but read only " << i << " values" << std::endl; - } - } -} - -template void load_exponent_weights_from_txt(T *w, const char *fname) { - - std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); - std::ifstream infile(full_path.c_str(), std::ios::binary); - - if (infile.fail()) { - std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; - exit(1); - } - - std::string line; - if (std::getline(infile, line)) { - std::istringstream iss(line); - std::string token; - std::string extra_chars = "} "; - - size_t i = 0; - while (std::getline(iss, token, '{')) { - if (token.length() == 0) { - continue; - } - for (char c : extra_chars) { - token.erase(std::remove(token.begin(), token.end(), c), token.end()); - } - if (token.back() == ',') { - token.erase(token.end() - 1); - } - - std::replace(token.begin(), token.end(), ',', ' '); - std::istringstream structss(token); - - if (!(structss >> w[i].sign >> w[i].weight)) { - std::cerr << "ERROR: Unable to parse file " << std::string(fname); - exit(1); - } - i++; - } - - if (SIZE != i) { - std::cerr << "ERROR: Expected " << SIZE << " values"; - std::cerr << " but read only " << i << " values" << std::endl; - } - } -} -template void convert_data(srcType *src, dstType *dst) { - for (size_t i = 0; i < SIZE; i++) { - dst[i] = dstType(src[i]); - } -} - -template void convert_data(srcType *src, hls::stream &dst) { - for (size_t i = 0; i < SIZE / dstType::size; i++) { - dstType ctype; - for (size_t j = 0; j < dstType::size; j++) { - ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); - } - dst.write(ctype); - } -} - -template void convert_data(hls::stream &src, dstType *dst) { - for (size_t i = 0; i < SIZE / srcType::size; i++) { - srcType ctype = src.read(); - for (size_t j = 0; j < srcType::size; j++) { - dst[i * srcType::size + j] = dstType(ctype[j]); - } - } -} - -extern bool trace_enabled; -extern std::map *trace_outputs; -extern size_t trace_type_size; - -template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { - for (int i = 0; i < layer_size; i++) { - ptr[i] = save_T(data[i]); - } -} - -template void save_output_array(hls::stream &data, save_T *ptr, size_t layer_size) { - for (size_t i = 0; i < layer_size / data_T::size; i++) { - data_T ctype = data.read(); - for (size_t j = 0; j < data_T::size; j++) { - ptr[i * data_T::size + j] = save_T(ctype[j]); - } - data.write(ctype); - } -} - -// We don't want to include save_T in this function because it will be inserted into myproject.cpp -// so a workaround with element size is used -template void 
save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { - if (!trace_enabled) - return; - - if (trace_outputs) { - if (trace_outputs->count(layer_name) > 0) { - if (trace_type_size == 4) { - save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); - } else if (trace_type_size == 8) { - save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); - } else { - std::cout << "Unknown trace type!" << std::endl; - } - } else { - std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; - } - } else { - std::ostringstream filename; - filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data - std::fstream out; - out.open(filename.str(), std::ios::app); - assert(out.is_open()); - for (int i = 0; i < layer_size; i++) { - out << float(data[i]) << " "; // We don't care about precision in text files - } - out << std::endl; - out.close(); - } -} - -template void save_layer_output(hls::stream &data, const char *layer_name, size_t layer_size) { - if (!trace_enabled) - return; - - if (trace_outputs) { - if (trace_outputs->count(layer_name) > 0) { - if (trace_type_size == 4) { - save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); - } else if (trace_type_size == 8) { - save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); - } else { - std::cout << "Unknown trace type!" << std::endl; - } - } else { - std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; - } - } else { - std::ostringstream filename; - filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data - std::fstream out; - out.open(filename.str(), std::ios::app); - assert(out.is_open()); - for (size_t i = 0; i < layer_size / data_T::size; i++) { - data_T ctype = data.read(); - for (size_t j = 0; j < data_T::size; j++) { - out << float(ctype[j]) << " "; // We don't care about precision in text files - } - data.write(ctype); - } - out << std::endl; - out.close(); - } -} - -#endif - -template void copy_data(std::vector src, dst_T dst[SIZE]) { - typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; - typename std::vector::const_iterator in_end = in_begin + SIZE; - std::copy(in_begin, in_end, dst); -} - -template -void copy_data(std::vector src, hls::stream &dst) { - typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; - typename std::vector::const_iterator in_end = in_begin + SIZE; - - size_t i_pack = 0; - dst_T dst_pack; - for (typename std::vector::const_iterator i = in_begin; i != in_end; ++i) { - dst_pack[i_pack++] = typename dst_T::value_type(*i); - if (i_pack == dst_T::size) { - i_pack = 0; - dst.write(dst_pack); - } - } -} - -template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { - for (auto i = 0; i < SIZE; i++) - if (i == SIZE - 1) { - dst[i].data = src[i]; - dst[i].last = 1; - } else { - dst[i].data = src[i]; - dst[i].last = 0; - } -} - -template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { - for (int i = 0; i < SIZE; i++) { - out << result[i] << " "; - } - out << std::endl; -} - -template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { - for (int i = 0; i < SIZE / res_T::size; i++) { - res_T res_pack = result.read(); - for (int j = 0; j < res_T::size; j++) { - out << res_pack[j] << " "; - } - if (keep) - result.write(res_pack); - } - out << std::endl; -} - 
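// A minimal sketch of how the helpers above compose in a generated testbench,
// assuming io_parallel; the names myproject, input_t, result_t, N_IN and N_OUT
// are illustrative placeholders, not part of this header:
//
//   std::vector<float> in = ...;                          // one sample read from tb_data
//   input_t data[N_IN];
//   result_t res[N_OUT];
//   nnet::copy_data<float, input_t, 0, N_IN>(in, data);   // fill the input array
//   myproject(data, res);                                 // generated top function
//   nnet::print_result<result_t, N_OUT>(res, std::cout);  // dump outputs as text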
-template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } - -template void fill_zero(hls::stream &data) { - for (int i = 0; i < SIZE / data_T::size; i++) { - data_T data_pack; - for (int j = 0; j < data_T::size; j++) { - data_pack[j] = 0.; - } - data.write(data_pack); - } -} - -template int read_file_1D(const char *filename, dataType data[nrows]) { - FILE *fp; - fp = fopen(filename, "r"); - if (fp == 0) { - return -1; - } - // Read data from file - float newval; - for (int ii = 0; ii < nrows; ii++) { - if (fscanf(fp, "%f\n", &newval) != 0) { - data[ii] = newval; - } else { - return -2; - } - } - fclose(fp); - return 0; -} - -template -int read_file_2D(const char *filename, dataType data[nrows][ncols]) { - FILE *fp; - fp = fopen(filename, "r"); - if (fp == 0) { - return -1; - } - // Read data from file - float newval; - for (int ii = 0; ii < nrows; ii++) { - for (int jj = 0; jj < ncols; jj++) { - if (fscanf(fp, "%f\n", &newval) != 0) { - data[ii][jj] = newval; - } else { - return -2; - } - } - } - fclose(fp); - return 0; -} - -template void change_type(hls::stream &in, hls::stream &out) { - in_T datareg; - hls::stream input_trunc; - for (int ii = 0; ii < N_IN; ii++) { - out << (out_T)in.read(); - } -} - -template void hls_stream_debug(hls::stream &data, hls::stream &res) { - data_T datareg; - for (int ii = 0; ii < N_IN; ii++) { - datareg = data.read(); - std::cout << "[" << ii << "]: " << datareg << std::endl; - res << datareg; - } -} - -constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } - -constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } - -constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h.bak deleted file mode 100644 index ef8172f297..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h.bak +++ /dev/null @@ -1,404 +0,0 @@ -// -// rfnoc-hls-neuralnet: Vivado HLS code for neural-net building blocks -// -// Copyright (C) 2017 EJ Kreinar -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program. If not, see . 
-// - -#ifndef NNET_LAYERNORM_H_ -#define NNET_LAYERNORM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_dense.h" -#include -#include - -#include "hls_math.h" -// #include "ap_fixed.h" - -namespace nnet { - -struct layernorm_config { - // Internal data type definitions - typedef float bias_t; - typedef float scale_t; - typedef ap_fixed<16, 8> mean_t; - - // Layer Sizes - static const unsigned n_in = 20; - static const unsigned seq_len = 4; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; - - template using product = nnet::product::mult; -}; - -template void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) { - float inv_range = CONFIG_T::table_range; - // Inverse square root function: - // result = 1/sqrt(x) - for (int ii = 0; ii < N_TABLE; ii++) { - // First, convert from table index to X-value (range 0 to +table_range) - float in_val = inv_range * ii / float(N_TABLE); - // Next, compute lookup table function - if (in_val > 0.0) - table_out[ii] = 1.0 / sqrt(in_val); - else - table_out[ii] = 0.0; - } -} - -template void init_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) { - float inv_range = 0.5; /// if not accurate, increase this - // Square root function: - // result = sqrt(x) - for (int ii = 0; ii < N_TABLE; ii++) { - // First, convert from table index to X-value (range 0 to +inv_range) - float in_val = inv_range * ii / float(N_TABLE); - // Next, compute lookup table function - if (in_val > 0.0) - table_out[ii] = sqrt(in_val); - else - table_out[ii] = 0.0; - } -} - -// template -// void layernorm_1d( -// data_T data[CONFIG_T::n_in/CONFIG_T::seq_len], -// res_T res[CONFIG_T::n_in/CONFIG_T::seq_len], -// typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len], -// typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len] -// ) -// { -// //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor -// //#pragma HLS ARRAY_PARTITION variable=data complete -// //#pragma HLS ARRAY_PARTITION variable=res complete - -// int inv_range_inv = (int) 1/ 0.5; -// typename CONFIG_T::table_t sqr = 0; -// #ifdef __HLS_SYN__ -// bool initialized = false; -// typename CONFIG_T::table_t sqr_table[CONFIG_T::table_size]; -// #else -// static bool initialized = false; -// static typename CONFIG_T::table_t sqr_table[CONFIG_T::table_size]; -// #endif -// if (!initialized) { -// init_sqr_table(sqr_table); -// initialized = true; -// } - -// static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len; -// data_T sum_cache = 0; -// data_T sum_cache2 = 0; -// data_T var, mean, diff, inv_sqr; -// data_T data_diff[dim]; -// data_T data_norm[dim]; - -// //#pragma HLS ARRAY_PARTITION variable=data_diff complete -// //#pragma HLS ARRAY_PARTITION variable=data_diff complete - -// const data_T k_inv = 1.0/dim; -// for (int i = 0; i < dim; ++i){ -// sum_cache += data[i]; -// } -// mean = CONFIG_T::template product::product(sum_cache, k_inv); - -// for (int i = 0; i < dim; ++i){ -// data_diff[i] = data[i] - mean; -// diff = data_diff[i]*data_diff[i]; -// sum_cache2 += diff; -// } -// var = CONFIG_T::template product::product(sum_cache2, k_inv); - -// int index = var*(CONFIG_T::table_size)*inv_range_inv; -// if (index < 0) index = 0; -// if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; -// sqr = (typename CONFIG_T::table_t) sqr_table[index]; -// inv_sqr = 1 / sqr; - -// for (int i = 0; i < 
dim; ++i){ -// res[i] = data_diff[i] * inv_sqr * scale[i] + bias[i]; -// } - -// } - -////////////////////// -// Dennis's version // -////////////////////// -template -void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CONFIG_T::n_in / CONFIG_T::seq_len], - typename CONFIG_T::scale_t scale[CONFIG_T::n_in / CONFIG_T::seq_len], - typename CONFIG_T::bias_t bias[CONFIG_T::n_in / CONFIG_T::seq_len]) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - //#pragma HLS ARRAY_PARTITION variable=data complete - //#pragma HLS ARRAY_PARTITION variable=res complete - int inv_range_inv = (int)1 / CONFIG_T::table_range; - typename CONFIG_T::table_t deno_inver = 0; -#ifdef __HLS_SYN__ - bool initialized = false; - typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size]; -#else - static bool initialized = false; - static typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size]; -#endif - if (!initialized) { - init_invert_sqr_table(invert_sqr_table); - initialized = true; - } - - static const unsigned dim = CONFIG_T::n_in / CONFIG_T::seq_len; - typename CONFIG_T::mean_t sum_cache = 0; - typename CONFIG_T::mean_t sum_cache2 = 0; - typename CONFIG_T::mean_t var, mean, diff; - typename CONFIG_T::mean_t data_diff[dim]; - typename CONFIG_T::mean_t data_norm[dim]; - // data_T sum_cache = 0; - // data_T sum_cache2 = 0; - // data_T var, mean, diff; - //// typename CONFIG_T::mean_t mean; - //// typename CONFIG_T::var_t var; - //// typename CONFIG_T::diff_t diff; - // data_T data_diff[dim]; - // data_T data_norm[dim]; - - //#pragma HLS ARRAY_PARTITION variable=data_diff complete - //#pragma HLS ARRAY_PARTITION variable=data_diff complete - - const typename CONFIG_T::mean_t k_inv = 1.0 / dim; - for (int i = 0; i < dim; ++i) { - sum_cache += static_cast(data[i]); - } - mean = CONFIG_T::template product::product(sum_cache, k_inv); - // std::cout << "mean: " << std::endl; - // std::cout << mean << std::endl; - - for (int i = 0; i < dim; ++i) { - data_diff[i] = static_cast(data[i]) - mean; - diff = data_diff[i] * data_diff[i]; - sum_cache2 += diff; - // std::cout << "data_diff: " << std::endl; - // std::cout << data_diff[i] << std::endl; - // std::cout << " " << std::endl; - } - var = CONFIG_T::template product::product(sum_cache2, k_inv); - // std::cout << "var: " << std::endl; - // std::cout << var << std::endl; - // std::cout << " " << std::endl; - - int index = var * (CONFIG_T::table_size)*inv_range_inv; - if (CONFIG_T::table_range > 1) - index = var * (CONFIG_T::table_size) / (int)CONFIG_T::table_range; - - if (index < 0) - index = 0; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - deno_inver = (typename CONFIG_T::table_t)invert_sqr_table[index]; - // std::cout << "deno_inver: " << std::endl; - // std::cout << deno_inver << std::endl; - // std::cout << " " << std::endl; - - // std::cout << "index: " << std::endl; - // std::cout << index << std::endl; - // std::cout << " " << std::endl; - - for (int i = 0; i < dim; ++i) { - res[i] = data_diff[i] * deno_inver * scale[i] + bias[i]; - } -} -//////////////////////// -// Original One Ethan's// -//////////////////////// -// template -// void layernorm_1d( -// data_T data[CONFIG_T::n_in/CONFIG_T::seq_len], -// res_T res[CONFIG_T::n_in/CONFIG_T::seq_len], -// typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len], -// typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len] -//) -//{ -////#pragma HLS PIPELINE II=CONFIG_T::reuse_factor -////#pragma HLS ARRAY_PARTITION variable=data 
complete -////#pragma HLS ARRAY_PARTITION variable=res complete -// -// int inv_range_inv = (int) 1/ CONFIG_T::table_range; -// typename CONFIG_T::table_t deno_inver = 0; -//#ifdef __HLS_SYN__ -// bool initialized = false; -// typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size]; -//#else -// static bool initialized = false; -// static typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size]; -//#endif -// if (!initialized) { -// init_invert_sqr_table(invert_sqr_table); -// initialized = true; -// } -// -// static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len; -// data_T sum_cache = 0; -// data_T sum_cache2 = 0; -// data_T var, mean, diff; -// data_T data_diff[dim]; -// data_T data_norm[dim]; -// -// //#pragma HLS ARRAY_PARTITION variable=data_diff complete -// //#pragma HLS ARRAY_PARTITION variable=data_diff complete -// -// const data_T k_inv = 1.0/dim; -// for (int i = 0; i < dim; ++i){ -// sum_cache += data[i]; -// } -//// mean = CONFIG_T::template product::product(sum_cache, k_inv); -//// std::cout << "mean: " << std::endl; -//// std::cout << mean << std::endl; -// -// for (int i = 0; i < dim; ++i){ -// data_diff[i] = data[i] - mean; -// diff = data_diff[i]*data_diff[i]; -// sum_cache2 += diff; -//// std::cout << "data_diff: " << std::endl; -//// std::cout << data_diff[i] << std::endl; -//// std::cout << " " << std::endl; -// } -// var = CONFIG_T::template product::product(sum_cache2, k_inv); -//// std::cout << "var: " << std::endl; -//// std::cout << var << std::endl; -//// std::cout << " " << std::endl; -// -// int index = var*(CONFIG_T::table_size)*inv_range_inv; -// if (CONFIG_T::table_range > 1) index = var*(CONFIG_T::table_size)/ (int)CONFIG_T::table_range; -// -// if (index < 0) index = 0; -// if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; -// deno_inver = (typename CONFIG_T::table_t) invert_sqr_table[index]; -//// std::cout << "deno_inver: " << std::endl; -//// std::cout << deno_inver << std::endl; -//// std::cout << " " << std::endl; -// -//// std::cout << "index: " << std::endl; -//// std::cout << index << std::endl; -//// std::cout << " " << std::endl; -// -// for (int i = 0; i < dim; ++i){ -// res[i] = data_diff[i] * deno_inver * scale[i] + bias[i]; -// } -// -//} - -// template -// void layernorm_1d( -// data_T data[CONFIG_T::n_in/CONFIG_T::seq_len], -// res_T res[CONFIG_T::n_in/CONFIG_T::seq_len], -// typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len], -// typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len] -// ) -// { -// //#pragma HLS PIPELINE -// //#pragma HLS ARRAY_PARTITION variable=data complete -// //#pragma HLS ARRAY_PARTITION variable=res complete - -// static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len; -// data_T sum_cache = 0; -// data_T sum_cache2 = 0; -// data_T var, mean, diff_squares, diff, var_eps_inv; -// data_T data_diff[dim]; -// float sqrt_var_eps; - -// //#pragma HLS ARRAY_PARTITION variable=data_diff complete - -// const data_T k_inv = 1.0/dim; -// for (int i = 0; i < dim; ++i){ -// sum_cache += data[i]; -// } -// mean = CONFIG_T::template product::product(sum_cache, k_inv); -// // std::cout << "mean: " << std::endl; -// // std::cout << mean << std::endl; - -// for (int i = 0; i < dim; ++i){ -// diff = data[i] - mean; -// data_diff[i] = diff; -// diff_squares = diff*diff; -// sum_cache2 += diff_squares; -// // std::cout << "data_diff: " << std::endl; -// // std::cout << data_diff[i] << std::endl; -// // std::cout << " " << std::endl; -// } -// var = 
CONFIG_T::template product::product(sum_cache2, k_inv); -// float var_f = (float)var; -// // std::cout << "var: "; -// // std::cout << var << std::endl; - -// sqrt_var_eps = sqrt(var_f); -// var_eps_inv = (data_T) (1 / (sqrt_var_eps)); -// // std::cout << "var_eps_inv: " << std::endl; -// // std::cout << var_eps_inv << std::endl; -// // std::cout << " " << std::endl; - -// for (int i = 0; i < dim; ++i){ -// res[i] = data_diff[i] * var_eps_inv * scale[i] + bias[i]; -// } - -// } - -template -void layernormalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], - typename CONFIG_T::scale_t scale[CONFIG_T::n_in / CONFIG_T::seq_len], - typename CONFIG_T::bias_t bias[CONFIG_T::n_in / CONFIG_T::seq_len]) { - static const unsigned dim = CONFIG_T::n_in / CONFIG_T::seq_len; - data_T in_val[dim]; - data_T outval[dim]; - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - //#pragma HLS function_instantiate variable=scale,bias - - // //#pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes - //#pragma HLS ARRAY_PARTITION variable=scale complete - //#pragma HLS ARRAY_PARTITION variable=bias complete - //#pragma HLS ARRAY_PARTITION variable=in_val complete - //#pragma HLS ARRAY_PARTITION variable=outval complete - - // std::cout << "one seq norm layer: " << std::endl; - // std::cout << " " << std::endl; - - for (int j = 0; j < CONFIG_T::seq_len; ++j) { - //#pragma HLS PIPELINE - load: - for (int i = 0; i < dim; ++i) { - //#pragma HLS UNROLL - in_val[i] = data[j * dim + i]; - } - layernorm_1d(in_val, outval, scale, bias); - store: - for (int i = 0; i < dim; ++i) { - //#pragma HLS UNROLL - res[j * dim + i] = outval[i]; - } - } - - // std::cout << "out Dense: " << std::endl; - // nnet::print_result(res, std::cout); - // std::cout << " " << std::endl; -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h.bak deleted file mode 100644 index c6f6dbbf95..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h.bak +++ /dev/null @@ -1,256 +0,0 @@ -#ifndef NNET_MERGE_H_ -#define NNET_MERGE_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -struct merge_config { - static const unsigned n_elem = 10; -}; - -struct dot_config { - static const unsigned n_in = 10; - static const unsigned n_out = 1; - static const unsigned reuse_factor = 1; - typedef float accum_t; - // Product function to use - template using product = nnet::product::mult; -}; - -struct concat_config { - static const unsigned n_elem1_0 = 10; - static const unsigned n_elem1_1 = 10; - static const unsigned n_elem1_2 = 10; - static const unsigned n_elem2_0 = 10; - static const unsigned n_elem2_1 = 10; - static const unsigned n_elem2_2 = 10; - - static const unsigned axis = -1; -}; - -template -void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = data1[ii] + data2[ii]; - } -} - -template -void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = data1[ii] - data2[ii]; - } -} - -template -void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - 
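// Elementwise product of the two inputs: one multiply per element, pipelined to
// II=1 when the PIPELINE pragma below is restored, as in add/subtract above.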
//#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = data1[ii] * data2[ii]; - } -} - -template -void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = (data1[ii] + data2[ii]) / (res_T)2; - } -} - -template -void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = (data1[ii] > data2[ii]) ? data1[ii] : data2[ii]; - } -} - -template -void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = (data1[ii] < data2[ii]) ? data1[ii] : data2[ii]; - } -} - -template -void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - //#pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - - typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; - //#pragma HLS ARRAY_PARTITION variable=mult complete - typename CONFIG_T::accum_t acc = 0; - -Product: - for (int i_mult = 0; i_mult < CONFIG_T::n_in; i_mult++) { - //#pragma HLS UNROLL - mult[i_mult] = CONFIG_T::template product::product(data1[i_mult], data2[i_mult]); - } - -Accum: - for (int i_acc = 0; i_acc < CONFIG_T::n_in; i_acc++) { - //#pragma HLS UNROLL - acc += mult[i_acc]; - } - - res[0] = cast(acc); -} - -template -void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], - res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { - res[ii] = data1[ii]; - } - for (int ii = 0; ii < CONFIG_T::n_elem2_0; ii++) { - res[CONFIG_T::n_elem1_0 + ii] = data2[ii]; - } -} - -template -void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; ii++) { - res[ii] = data1[ii]; - } - for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; ii++) { - res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + ii] = data2[ii]; - } -} - -template -void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { - for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { - res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + jj] = data1[ii * CONFIG_T::n_elem1_1 + jj]; - } - for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { - res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + jj] = - data2[ii * CONFIG_T::n_elem2_1 + jj]; - } - } -} - -template -void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - //#pragma HLS INLINE - - if (CONFIG_T::axis == 2 || 
CONFIG_T::axis == -1) { - concatenate2d_1(data1, data2, res); - } else { - concatenate2d_0(data1, data2, res); - } -} - -template -void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; ii++) { - res[ii] = data1[ii]; - } - for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; ii++) { - res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + ii] = data2[ii]; - } -} - -template -void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { - for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { - for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { - int res_idx = - ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; - int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; - res[res_idx] = data1[data_idx]; - } - } - for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { - for (int kk = 0; kk < CONFIG_T::n_elem2_2; kk++) { - int res_idx = ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + - (jj + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + kk; - int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; - res[res_idx] = data2[data_idx]; - } - } - } -} - -template -void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - //#pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { - for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { - for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { - int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + - jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk; - int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; - res[res_idx] = data1[data_idx]; - } - for (int kk = 0; kk < CONFIG_T::n_elem2_2; kk++) { // bound follows data2's last dimension - int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + - jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk + CONFIG_T::n_elem1_2; - int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; - res[res_idx] = data2[data_idx]; - } - } - } -} - -template -void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - //#pragma HLS INLINE - - if 
(CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { - concatenate3d_2(data1, data2, res); - } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { - concatenate3d_1(data1, data2, res); - } else { - concatenate3d_0(data1, data2, res); - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h.bak deleted file mode 100644 index 5cc89659fe..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h.bak +++ /dev/null @@ -1,370 +0,0 @@ -#ifndef NNET_MERGE_STREAM_H_ -#define NNET_MERGE_STREAM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include - -namespace nnet { - -template -void add(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -AddLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - //#pragma HLS PIPELINE - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - AddPack: - for (int j = 0; j < res_T::size; j++) { - //#pragma HLS UNROLL - out_data[j] = in_data1[j] + in_data2[j]; - } - - res.write(out_data); - } -} - -template -void subtract(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -SubtractLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - //#pragma HLS PIPELINE - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - SubtractPack: - for (int j = 0; j < res_T::size; j++) { - //#pragma HLS UNROLL - out_data[j] = in_data1[j] - in_data2[j]; - } - - res.write(out_data); - } -} - -template -void multiply(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -MultiplyLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - MultiplyPack: - for (int j = 0; j < res_T::size; j++) { - //#pragma HLS UNROLL - out_data[j] = in_data1[j] * in_data2[j]; - } - - res.write(out_data); - } -} - -template -void average(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -AverageLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - AveragePack: - for (int j = 0; j < res_T::size; j++) { - //#pragma HLS UNROLL - out_data[j] = (in_data1[j] + in_data2[j]) / (typename res_T::value_type)2; - } - - res.write(out_data); - } -} - -template -void maximum(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -MaximumLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - MaximumPack: - for (int j = 0; j < res_T::size; j++) { - //#pragma HLS UNROLL - out_data[j] = (in_data1[j] > in_data2[j]) ? 
in_data1[j] : in_data2[j]; - } - - res.write(out_data); - } -} - -template -void minimum(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -MinimumLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - MinimumPack: - for (int j = 0; j < res_T::size; j++) { - //#pragma HLS UNROLL - out_data[j] = (in_data1[j] < in_data2[j]) ? in_data1[j] : in_data2[j]; - } - - res.write(out_data); - } -} - -template -void concatenate3d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight1: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - ConcatLoopWidth1: - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - //#pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - //#pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - res.write(out_data); - } - } -ConcatLoopHeight2: - for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { - ConcatLoopWidth2: - for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { - //#pragma HLS PIPELINE II=1 - - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - //#pragma HLS UNROLL - out_data[k] = in_data2[k]; - } - - res.write(out_data); - } - } -} - -template -void concatenate3d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - ConcatLoopWidth1: - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - //#pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - //#pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - res.write(out_data); - } - ConcatLoopWidth2: - for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { - //#pragma HLS PIPELINE II=1 - - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - //#pragma HLS UNROLL - out_data[k] = in_data2[k]; - } - - res.write(out_data); - } - } -} - -template -void concatenate3d_2(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - ConcatLoopWidth: - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - //#pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - //#pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - //#pragma HLS UNROLL - out_data[input1_T::size + k] = in_data2[k]; - } - - res.write(out_data); - } - } -} - -template -void concatenate3d(hls::stream &data1, hls::stream &data2, hls::stream &res) { - if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { - concatenate3d_2(data1, data2, res); - } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { - concatenate3d_1(data1, data2, res); - } else { - concatenate3d_0(data1, data2, res); - } -} - -template -void concatenate2d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { 
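-    // axis-0 concatenation for streams: forward every pack of the first input,
-    // then every pack of the second; both inputs are assumed to use the same
-    // pack size as res_T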
-ConcatLoopHeight1: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - //#pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - //#pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - res.write(out_data); - } -ConcatLoopHeight2: - for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { - //#pragma HLS PIPELINE II=1 - - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - //#pragma HLS UNROLL - out_data[k] = in_data2[k]; - } - - res.write(out_data); - } -} - -template -void concatenate2d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - //#pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - //#pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - //#pragma HLS UNROLL - out_data[input1_T::size + k] = in_data2[k]; - } - - res.write(out_data); - } -} - -template -void concatenate2d(hls::stream &data1, hls::stream &data2, hls::stream &res) { - if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { - concatenate2d_1(data1, data2, res); - } else { - concatenate2d_0(data1, data2, res); - } -} - -template -void concatenate1d(hls::stream &data1, hls::stream &data2, hls::stream &res) { - res_T out_data; - PRAGMA_DATA_PACK(out_data) -ConcatLoop1: - for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { - //#pragma HLS PIPELINE - input1_T in_data1 = data1.read(); - ConcatPack1: - for (int j = 0; j < input1_T::size; j++) { - //#pragma HLS UNROLL - out_data[j + (i * input1_T::size)] = in_data1[j]; - } - } -ConcatLoop2: - for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) { - //#pragma HLS PIPELINE - input2_T in_data2 = data2.read(); - ConcatPack2: - for (int j = 0; j < input2_T::size; j++) { - //#pragma HLS UNROLL - out_data[j + (i * input2_T::size) + (CONFIG_T::n_elem1_0)] = in_data2[j]; - } - } - res.write(out_data); -} -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h.bak deleted file mode 100644 index 24979806df..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h.bak +++ /dev/null @@ -1,116 +0,0 @@ -#ifndef NNET_MULT_H_ -#define NNET_MULT_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_helpers.h" -#include -#include - -namespace nnet { - -namespace product { - -/* --- - * different methods to perform the product of input and weight, depending on the - * types of each. 
- * --- */
-
-class Product {};
-
-template <class x_T, class w_T> class both_binary : public Product {
-  public:
-    static x_T product(x_T a, w_T w) {
-        // specialisation for 1-bit weights and incoming data
-        //#pragma HLS INLINE
-        return a == w;
-    }
-};
-
-template <class x_T, class w_T> class weight_binary : public Product {
-  public:
-    static auto product(x_T a, w_T w) -> decltype(-a) {
-        // Specialisation for 1-bit weights, arbitrary data
-        //#pragma HLS INLINE
-        if (w == 0)
-            return -a;
-        else
-            return a;
-    }
-};
-
-template <class x_T, class w_T> class data_binary : public Product {
-  public:
-    static auto product(x_T a, w_T w) -> decltype(-w) {
-        // Specialisation for 1-bit data, arbitrary weight
-        //#pragma HLS INLINE
-        if (a == 0)
-            return -w;
-        else
-            return w;
-    }
-};
-
-template <class x_T, class w_T> class weight_ternary : public Product {
-  public:
-    static auto product(x_T a, w_T w) -> decltype(-a) {
-        // Specialisation for 2-bit weights, arbitrary data
-        //#pragma HLS INLINE
-        if (w == 0)
-            return 0;
-        else if (w == -1)
-            return -a;
-        else
-            return a; // if(w == 1)
-    }
-};
-
-template <class x_T, class w_T> class mult : public Product {
-  public:
-    static auto product(x_T a, w_T w) -> decltype(a * w) {
-        // 'Normal' product
-        //#pragma HLS INLINE
-        return a * w;
-    }
-};
-
-template <class x_T, class w_T> class weight_exponential : public Product {
-  public:
-    using r_T = ap_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width)>;
-    static r_T product(x_T a, w_T w) {
-        // Shift product for exponential weights
-        //#pragma HLS INLINE
-
-        // Shift by the exponent. Negative weights shift right
-        r_T y = static_cast<r_T>(a) << w.weight;
-
-        // Negate or not depending on weight sign
-        return w.sign == 1 ? y : static_cast<r_T>(-y);
-    }
-};
-
-} // namespace product
-
-template <class data_T, class res_T, typename CONFIG_T>
-inline typename std::enable_if<std::is_same<data_T, ap_uint<1>>::value &&
-                                   std::is_same<typename CONFIG_T::weight_t, ap_uint<1>>::value,
-                               ap_int<nnet::ceillog2(CONFIG_T::n_in) + 2>>::type
-cast(typename CONFIG_T::accum_t x) {
-    return (ap_int<nnet::ceillog2(CONFIG_T::n_in) + 2>)(x - CONFIG_T::n_in / 2) * 2;
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-inline typename std::enable_if<
-    std::is_same<data_T, ap_uint<1>>::value && !std::is_same<typename CONFIG_T::weight_t, ap_uint<1>>::value, res_T>::type
-cast(typename CONFIG_T::accum_t x) {
-    return (res_T)x;
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-inline typename std::enable_if<(!std::is_same<data_T, ap_uint<1>>::value), res_T>::type cast(typename CONFIG_T::accum_t x) {
-    return (res_T)x;
-}
-
-} // namespace nnet
-
-#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h.bak
deleted file mode 100644
index 20be3c74d6..0000000000
--- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h.bak
+++ /dev/null
@@ -1,337 +0,0 @@
-#ifndef NNET_MHT_H_
-#define NNET_MHT_H_
-
-#include "hls_stream.h"
-#include "nnet_activation.h"
-#include "nnet_common.h"
-#include "nnet_dense.h"
-#include "nnet_mult.h"
-#include <iostream>
-#include <math.h>
-
-namespace nnet {
-
-struct multiheadattention_config {
-    // Internal data type definitions
-    typedef float bias_t;
-    typedef float weight_t;
-    typedef float accum_t;
-
-    // Layer Sizes
-    static const unsigned num_heads = 10;
-    static const unsigned head_dim_key = 10;
-    static const unsigned head_dim_value = 10;
-    static const unsigned feature_dim = 20;
-    static const unsigned seq_len = 500;
-
-    // Resource reuse info
-    static const unsigned io_type = io_parallel;
-    static const unsigned strategy = latency;
-    static const unsigned reuse_factor = 1;
-    static const bool store_weights_in_bram = false;
-
-    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
-};
-
-template <typename data_T, int PackSize> struct datapack { data_T data[PackSize]; };
-
-template <typename data_T, int size> void read_stream_array(hls::stream<data_T> data_in[size], data_T out[size]) {
-    for (int k = 0; k < size; ++k) {
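-        // pop one element from each of the `size` parallel FIFOs into the flat
-        // output array; the loop is meant to be fully unrolled (see the pragma
-        // below) so that all streams are read in the same cycle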
- //#pragma HLS UNROLL - out[k] = data_in[k].read(); - } -} - -template -void matrixmul_transpose(hls::stream> &Q, - hls::stream> &K, - res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K -{ - const data_T dk = 1.0 / sqrt(CONFIG_T::head_dim_key); - data_T QK_1; - typename CONFIG_T::accum_t QKij; - data_T Qi[CONFIG_T::head_dim_key]; - data_T Product[CONFIG_T::seq_len]; // seq_Q, seq_K - data_T qk_smout[CONFIG_T::seq_len]; - data_T krow[CONFIG_T::seq_len * CONFIG_T::head_dim_key]; - //#pragma HLS ARRAY_PARTITION variable=Qi complete - //#pragma HLS ARRAY_PARTITION variable=Product complete - //#pragma HLS ARRAY_PARTITION variable=qk_smout complete - //#pragma HLS ARRAY_PARTITION variable=QK complete dim=2 - //#pragma HLS ARRAY_PARTITION variable=krow complete - - datapack datak_pack, dataq_pack; - //#pragma HLS DATA_PACK variable=Q - //#pragma HLS DATA_PACK variable=K - //#pragma HLS DATA_PACK variable=datak_pack - //#pragma HLS DATA_PACK variable=dataq_pack - - int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_key) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); - -prep_k: - for (int i = 0; i < CONFIG_T::seq_len; ++i) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - datak_pack = K.read(); - for (int j = 0; j < CONFIG_T::head_dim_key; ++j) { - //#pragma HLS UNROLL - krow[i * CONFIG_T::head_dim_key + j] = datak_pack.data[j]; - } - } - -// for each row and column of AB -row: - for (int i = 0; i < CONFIG_T::seq_len; ++i) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - dataq_pack = Q.read(); - - q: - for (int q_i = 0; q_i < CONFIG_T::head_dim_key; ++q_i) { - //#pragma HLS UNROLL - Qi[q_i] = dataq_pack.data[q_i]; - } - col: - for (int j = 0; j < CONFIG_T::seq_len; ++j) { - // compute (QK)i,j - QKij = 0; - product: - for (int k = 0; k < CONFIG_T::head_dim_key; ++k) { - QK_1 = CONFIG_T::template product::product(Qi[k], krow[j * CONFIG_T::head_dim_key + k]); - QKij += QK_1; - } - Product[j] = QKij * dk; - } - softmax(Product, qk_smout); - for (int n = 0; n < CONFIG_T::seq_len; ++n) { - //#pragma HLS UNROLL - QK[i][n] = qk_smout[n]; - } - } -} - -///////// -template -void matrixmul(data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], hls::stream> &V, - hls::stream S[CONFIG_T::head_dim_value]) // S: attention score -{ - //#pragma HLS DATA_PACK variable=V - //#pragma HLS ARRAY_PARTITION variable=QK complete dim=2 - //#pragma HLS ARRAY_PARTITION variable=S complete dim=1 - - datapack datav_pack; - //#pragma HLS DATA_PACK variable=datav_pack - - int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_value) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); - - data_T dataV[CONFIG_T::seq_len * CONFIG_T::head_dim_value]; - //#pragma HLS ARRAY_PARTITION variable = dataV complete dim = 1 - - for (int j = 0; j < CONFIG_T::seq_len; ++j) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - datav_pack = V.read(); - for (int i = 0; i < CONFIG_T::head_dim_value; ++i) { - //#pragma HLS UNROLL - dataV[CONFIG_T::seq_len * i + j] = datav_pack.data[i]; - } - } - - // for each row and column of AB - data_T Sij, S_1; - data_T QKi[CONFIG_T::seq_len]; - //#pragma HLS ARRAY_Partition variable=QKi complete -row: - for (int i = 0; i < CONFIG_T::seq_len; ++i) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - qk: - for (int q_i = 0; q_i < CONFIG_T::seq_len; ++q_i) { - //#pragma HLS UNROLL - QKi[q_i] = QK[i][q_i]; - } - col: - for (int j = 0; j < CONFIG_T::head_dim_value; ++j) { - // 
compute (S)i,j - Sij = 0; - product: - for (int k = 0; k < CONFIG_T::seq_len; ++k) { - S_1 = CONFIG_T::template product::product(QKi[k], dataV[j * CONFIG_T::seq_len + k]); - Sij += S_1; - } - S[j].write(Sij); - } - } -} - -template -void lin_projection(hls::stream data_q[CONFIG_T::feature_dim], hls::stream data_vk[CONFIG_T::feature_dim], - hls::stream> &k_proj, - hls::stream> &q_proj, - hls::stream> &v_proj, - typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], - typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_key], - typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], - typename CONFIG_T::bias_t query_bias[CONFIG_T::head_dim_key], - typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value], - typename CONFIG_T::bias_t value_bias[CONFIG_T::head_dim_value]) - -{ - //#pragma HLS DATA_PACK variable=k_proj - //#pragma HLS DATA_PACK variable=q_proj - //#pragma HLS DATA_PACK variable=v_proj - - //#pragma HLS ARRAY_PARTITION variable=data_q complete dim=1 - //#pragma HLS ARRAY_PARTITION variable=data_vk complete dim=1 - -k_h: - for (int j = 0; j < CONFIG_T::seq_len; ++j) { - //#pragma HLS PIPELINE - - data_T proj_k[CONFIG_T::head_dim_key]; - data_T proj_q[CONFIG_T::head_dim_key]; - data_T proj_v[CONFIG_T::head_dim_value]; - data_T in_q[CONFIG_T::feature_dim]; - data_T in_v[CONFIG_T::feature_dim]; - //#pragma HLS ARRAY_PARTITION variable=proj_k complete dim=1 - //#pragma HLS ARRAY_PARTITION variable=proj_q complete dim=1 - //#pragma HLS ARRAY_PARTITION variable=proj_v complete dim=1 - //#pragma HLS ARRAY_PARTITION variable=in_q complete dim=1 - //#pragma HLS ARRAY_PARTITION variable=in_v complete dim=1 - - datapack proj_k_pack; - datapack proj_q_pack; - datapack proj_v_pack; - //#pragma HLS DATA_PACK variable=proj_k_pack - //#pragma HLS DATA_PACK variable=proj_q_pack - //#pragma HLS DATA_PACK variable=proj_v_pack - - read_stream_array(data_q, in_q); - read_stream_array(data_vk, in_v); - - dense(in_v, proj_k_pack.data, key_weight, key_bias); - dense(in_q, proj_q_pack.data, query_weight, query_bias); - dense(in_v, proj_v_pack.data, value_weight, value_bias); - - k_proj.write(proj_k_pack); - q_proj.write(proj_q_pack); - v_proj.write(proj_v_pack); - } -} - -template -void dense_out(hls::stream data_in[CONFIG_T::num_heads][CONFIG_T::head_dim_value], - res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim], - typename CONFIG_T::weight_t - attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value * CONFIG_T::feature_dim], - typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim]) { - data_T mat_res_con[CONFIG_T::num_heads * CONFIG_T::head_dim_value]; - res_T dense_out[CONFIG_T::feature_dim]; -//#pragma HLS ARRAY_PARTITION variable=mat_res_con complete dim=1 -//#pragma HLS ARRAY_PARTITION variable=dense_out complete dim=1 -output_dense: - for (int k = 0; k < CONFIG_T::seq_len; ++k) { - - //#pragma HLS PIPELINE - for (int i = 0; i < CONFIG_T::num_heads; ++i) { - //#pragma HLS UNROLL - for (int j = 0; j < CONFIG_T::head_dim_value; ++j) { - //#pragma HLS UNROLL - mat_res_con[CONFIG_T::head_dim_value * i + j] = data_in[i][j].read(); - } - } - dense(mat_res_con, dense_out, attention_output_weight, - attention_output_bias); - for (int i = 0; i < CONFIG_T::feature_dim; ++i) { - //#pragma HLS UNROLL - res[CONFIG_T::feature_dim * k + i] = dense_out[i]; - } - } -} - -template -void data_prep(data_T data[CONFIG_T::seq_len * CONFIG_T::feature_dim], hls::stream 
d[CONFIG_T::feature_dim]) { - //#pragma HLS ARRAY_PARTITION variable=d complete dim=1 - for (int j = 0; j < CONFIG_T::seq_len; ++j) { - for (int k = 0; k < CONFIG_T::feature_dim; ++k) { - //#pragma HLS UNROLL - d[k].write(data[j * CONFIG_T::feature_dim + k]); - } - } -} - -template -void multiheadattention( - data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], data_T data_vk[CONFIG_T::seq_len * CONFIG_T::feature_dim], - res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim], - typename CONFIG_T::weight_t attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value * - CONFIG_T::feature_dim], // num_heads,head_size_v,dim - typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim], - typename CONFIG_T::weight_t - key_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], // n_head,dim,head_dim - typename CONFIG_T::bias_t key_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key], - typename CONFIG_T::weight_t - query_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], // same shape as key - typename CONFIG_T::bias_t query_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key], - typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_value], - typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { - hls::stream d_value[CONFIG_T::num_heads][CONFIG_T::feature_dim]; - hls::stream d_query[CONFIG_T::num_heads][CONFIG_T::feature_dim]; - hls::stream> q_proj[CONFIG_T::num_heads]; - hls::stream> k_proj[CONFIG_T::num_heads]; - hls::stream> v_proj[CONFIG_T::num_heads]; - data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; - hls::stream matr_out[CONFIG_T::num_heads][CONFIG_T::head_dim_value]; - - //#pragma HLS DATAFLOW - //#pragma HLS ARRAY_PARTITION variable=d_query complete dim=1 - //#pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1 - //#pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1 - //#pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 - //#pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 - //#pragma HLS ARRAY_PARTITION variable=matr_out complete dim=1 - // std::cout << "input to MHA: " << std::endl; - // nnet::print_result(data_q, std::cout); - // std::cout << " " << std::endl; - -prepq: - for (int i = 0; i < CONFIG_T::num_heads; ++i) { - //#pragma HLS UNROLL - nnet::data_prep(data_q, d_query[i]); - } -prepvk: - for (int i = 0; i < CONFIG_T::num_heads; ++i) { - //#pragma HLS UNROLL - nnet::data_prep(data_vk, d_value[i]); - } - -// linear projection -lin_proj: - for (int i = 0; i < CONFIG_T::num_heads; ++i) { - //#pragma HLS UNROLL - nnet::lin_projection( - d_query[i], d_value[i], k_proj[i], q_proj[i], v_proj[i], - key_weight + (CONFIG_T::head_dim_key * CONFIG_T::feature_dim * i), key_bias + (CONFIG_T::head_dim_key * i), - query_weight + (CONFIG_T::head_dim_key * CONFIG_T::feature_dim * i), query_bias + (CONFIG_T::head_dim_key * i), - value_weight + (CONFIG_T::head_dim_value * CONFIG_T::feature_dim * i), - value_bias + (CONFIG_T::head_dim_value * i)); - } - -maxtrixmul1: - for (int i = 0; i < CONFIG_T::num_heads; ++i) { - //#pragma HLS UNROLL - nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); - } - -maxtrixmul2: - for (int i = 0; i < CONFIG_T::num_heads; ++i) { - //#pragma HLS UNROLL - nnet::matrixmul(qk_mul[i], v_proj[i], matr_out[i]); // stream - } - - nnet::dense_out(matr_out, res, attention_output_weight, attention_output_bias); - // std::cout << "out MHA: " << std::endl; - // 
nnet::print_result(res, std::cout); - // std::cout << " " << std::endl; -} -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_padding.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_padding.h.bak deleted file mode 100644 index d069cc3f5b..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_padding.h.bak +++ /dev/null @@ -1,145 +0,0 @@ -#ifndef NNET_PADDING_H_ -#define NNET_PADDING_H_ - -#include - -namespace nnet { - -struct padding1d_config { - static const unsigned n_chan = 10; - static const unsigned in_width = 10; - static const unsigned out_width = 10; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; -}; - -template -void zeropad1d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], data_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { - //#pragma HLS PIPELINE - - for (int j = 0; j < CONFIG_T::n_chan; j++) { - for (int i = 0; i < CONFIG_T::pad_left; i++) { - *(res++) = 0; - } - - for (int i = 0; i < CONFIG_T::in_width; i++) { - *(res++) = (res_T) * (data++); - } - - for (int i = 0; i < CONFIG_T::pad_right; i++) { - *(res++) = 0; - } - } -} - -template -void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { - //#pragma HLS PIPELINE - - for (int i = 0; i < CONFIG_T::pad_left; i++) { - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = 0; - } - } - - for (int i = 0; i < CONFIG_T::in_width; i++) { - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = (res_T) * (data++); - } - } - - for (int i = 0; i < CONFIG_T::pad_right; i++) { - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = 0; - } - } -} - -struct padding2d_config { - static const unsigned n_chan = 10; - static const unsigned in_height = 10; - static const unsigned in_width = 10; - static const unsigned out_height = 10; - static const unsigned out_width = 10; - static const unsigned pad_top = 0; - static const unsigned pad_bottom = 0; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; -}; - -template -void zeropad2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], - data_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { - //#pragma HLS PIPELINE - - for (int k = 0; k < CONFIG_T::n_chan; k++) { - - for (int i = 0; i < CONFIG_T::pad_top; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - *(res++) = 0; - } - } - - for (int i = 0; i < CONFIG_T::in_height; i++) { - for (int j = 0; j < CONFIG_T::pad_left; j++) { - *(res++) = 0; - } - for (int j = 0; j < CONFIG_T::in_width; j++) { - *(res++) = (res_T) * (data++); - } - for (int j = 0; j < CONFIG_T::pad_right; j++) { - *(res++) = 0; - } - } - - for (int i = 0; i < CONFIG_T::pad_bottom; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - *(res++) = 0; - } - } - } -} - -template -void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], - res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { - //#pragma HLS PIPELINE - - for (int i = 0; i < CONFIG_T::pad_top; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } - - for (int i = 0; i < CONFIG_T::in_height; i++) { - for (int j = 0; j < CONFIG_T::pad_left; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - for (int j = 0; j < CONFIG_T::in_width; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = (res_T) * (data++); - } - } - 
for (int j = 0; j < CONFIG_T::pad_right; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } - - for (int i = 0; i < CONFIG_T::pad_bottom; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h.bak deleted file mode 100644 index 4611175a68..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h.bak +++ /dev/null @@ -1,313 +0,0 @@ -#ifndef NNET_POOLING_H_ -#define NNET_POOLING_H_ - -#include "nnet_helpers.h" -#include - -namespace nnet { - -// Return the maximum value from an array -template accum_t max(T x[N]) { - T y = x[0]; - for (int i = 1; i < N; i++) { - y = x[i] > y ? x[i] : y; - } - return y; -} - -// Return the mean value of an array -template accum_t avg(T (&x)[N], unsigned length) { - accum_t y = 0; - for (int i = 0; i < N; i++) { - y += x[i]; - } - y /= length; - return y; -} - -// Enumeration for pooling operation (max, avg, l2norm pooling) -enum Pool_Op { Max, Average }; // L2Norm }; -template accum_t pool_op(T (&x)[N], unsigned length) { - switch (op) { - case Max: - return max(x); - case Average: - return avg(x, length); - // case L2Norm: return l2norm(x); - } -} - -template accum_t pool_op(T (&x)[N]) { - return pool_op(x, N); -} - -template T pad_val() { - /*--- - *- In Tensorflow, pooling ignores the value in the padded cells - *- For Avg pooling, return 0 (the divisior is modified to the - *- area overlapping the unpadded image. - *- For max pooling, return the most negative value for the type. - *- TODO this is not really generic, it assumes fixed point or integer T - ---*/ - switch (op) { - case Max: { - T x = 0; - x[x.width - 1] = 1; - return x; - break; - } - case Average: - return 0; - } -} - -struct pooling1d_config { - // IO size - static const unsigned n_in = 10; - static const unsigned pool_width = 2; - static const unsigned stride_width = 2; - static const unsigned n_out = (n_in - pool_width) / stride_width + 1; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const bool count_pad = false; - // Pooling function - static const Pool_Op pool_op = Max; -}; - -template constexpr int pool_op_limit_1d() { - return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor; -} - -template -void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - // TODO partition the arrays according to the reuse factor - const int limit = pool_op_limit_1d(); - //#pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - // Add any necessary padding - - // Add padding and reduce input width to area covered by pooling function - static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; - static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; - - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - // Loop over input image x in steps of stride - for (int ii = 0; ii < restricted_padded_width; ii += CONFIG_T::stride_width) { - unsigned overlap_pixel = 0; - data_T pool[CONFIG_T::pool_width]; - //#pragma HLS ARRAY_PARTITION variable=pool complete dim=0 - - for (int jj = 0; jj < CONFIG_T::pool_width; jj++) { - if (ii + jj >= CONFIG_T::pad_left && ii + jj < 
CONFIG_T::n_in + CONFIG_T::pad_left) { - pool[jj] = data[(ii + jj - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; - overlap_pixel++; - } else - pool[jj] = pad_val(); - } - - int patch_size = CONFIG_T::count_pad ? CONFIG_T::stride_width : overlap_pixel; - - res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = - pool_op(pool, patch_size); - } - } -} - -template -void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); - assert(CONFIG_T::pool_width == CONFIG_T::stride_width); - - // TODO partition the arrays according to the reuse factor - const int limit = pool_op_limit_1d(); - //#pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - data_T pool[CONFIG_T::n_in]; - //#pragma HLS ARRAY_PARTITION variable=pool complete dim=0 - for (int jj = 0; jj < CONFIG_T::n_in; jj++) { - pool[jj] = data[jj * CONFIG_T::n_filt + ff]; - } - // do the pooling - res[ff] = pool_op(pool); - } -} - -struct pooling2d_config { - // IO size - static const unsigned in_height = 10; - static const unsigned in_width = 10; - static const unsigned n_filt = 4; - static const unsigned stride_height = 2; - static const unsigned stride_width = 2; - static const unsigned pool_height = 2; - static const unsigned pool_width = 2; - static const unsigned out_height = (in_height - pool_height) / stride_height + 1; - static const unsigned out_width = (in_width - pool_width) / stride_width + 1; - // Padding - static const unsigned pad_top = 0; - static const unsigned pad_bottom = 0; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const bool count_pad = false; - // Pooling function - static const Pool_Op pool_op = Max; - // Reuse factor - static const unsigned reuse_factor = 1; - - // Internal data type definitions - typedef float accum_t; -}; - -template constexpr int pool_op_limit() { - return (CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt / CONFIG_T::reuse_factor; -} - -template -void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - // TODO partition the arrays according to the reuse factor - const int limit = pool_op_limit(); - //#pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - - // Add padding and reduce input width to area covered by pooling function - static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; - static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; - static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; - static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; - - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - - // Loop over input image y in steps of stride - for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { - // Loop over input image x in steps of stride - for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { - data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; - //#pragma HLS ARRAY_PARTITION variable=pool complete dim=0 - - unsigned 
overlap_pixel = 0; - - // Loop over pool window y - for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { - // Loop over pool window x - for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { - bool cond1 = ii + kk >= CONFIG_T::pad_top && ii + kk < CONFIG_T::in_height + CONFIG_T::pad_top; - bool cond2 = jj + ll >= CONFIG_T::pad_left && jj + ll < CONFIG_T::in_width + CONFIG_T::pad_left; - if (cond1 && cond2) { - unsigned data_idx = - ((ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + (jj + ll - CONFIG_T::pad_left)) * - CONFIG_T::n_filt + - ff; - pool[kk * CONFIG_T::stride_width + ll] = data[data_idx]; - overlap_pixel++; - } else - pool[kk * CONFIG_T::stride_width + ll] = pad_val(); - } - } - - int patch_size = CONFIG_T::count_pad ? CONFIG_T::stride_width * CONFIG_T::stride_height : overlap_pixel; - - res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + - (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = - pool_op(pool, patch_size); - } - } - } -} - -template -void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - // TODO partition the arrays according to the reuse factor - const int limit = pool_op_limit(); - //#pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - // Add padding and reduce input width to area covered by pooling function - static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; - static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; - static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; - static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; - - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - // Loop over input image y in steps of stride - for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { - // Loop over input image x in steps of stride - for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { - data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; - //#pragma HLS ARRAY_PARTITION variable=pool complete dim=0 - // Keep track of number of pixels in image vs padding region - unsigned img_overlap = 0; - // Loop over pool window y - for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { - // Loop over pool window x - for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { - if (ii + kk < CONFIG_T::pad_top || ii + kk >= (full_padded_height - CONFIG_T::pad_bottom) || - jj + ll < CONFIG_T::pad_left || jj + ll >= (full_padded_width - CONFIG_T::pad_right)) { - // Add padding - pool[kk * CONFIG_T::stride_width + ll] = pad_val(); - if (CONFIG_T::count_pad) - img_overlap++; - } else { - pool[kk * CONFIG_T::stride_width + ll] = - data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + - ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; - img_overlap++; - } - } - } - // do the pooling - // TODO in the case of average pooling, need to reduce height * width to area of pool window - // not overlapping padding region - res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + - ff * CONFIG_T::out_height * CONFIG_T::out_width] = - pool_op(pool); - // If the pool op is Average, the zero-padding needs to be removed from the results 
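-                // the Average pool_op above divided by the full window size
-                // (pool_height * pool_width), so multiplying by window_size / img_overlap
-                // averages only over the pixels that overlap the image, e.g. a 2x2
-                // window with two padded cells: (a + b + 0 + 0) / 4 * (4 / 2) = (a + b) / 2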
- if (CONFIG_T::pool_op == Average) { - data_T rescale = - static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; - res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + - ff * CONFIG_T::out_height * CONFIG_T::out_width] *= rescale; - } - } - } - } -} - -template -void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], - res_T res[CONFIG_T::n_filt]) { - assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); - assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); - assert(CONFIG_T::pool_width == CONFIG_T::stride_width); - assert(CONFIG_T::pool_height == CONFIG_T::stride_height); - - //#pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - const int limit = pool_op_limit(); - //#pragma HLS ALLOCATION instances=pool_op limit=limit function - -FiltLoop: - for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { - data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; - - InputLoop: - for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) { - pool[i] = data[i * CONFIG_T::n_filt + filt]; - } - - res[filt] = static_cast( - pool_op(pool)); - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h.bak deleted file mode 100644 index 3e1ebb225d..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h.bak +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef NNET_RECR_ACTIVATION_H_ -#define NNET_RECR_ACTIVATION_H_ - -#include "hls_stream.h" -#include "nnet_activation.h" -#include "nnet_common.h" -#include "nnet_helpers.h" -#include - -namespace nnet { - -namespace activation { - -template class Activation { - public: - // ************************************************* - // Blank Activation - // ************************************************* - static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {} // Nothing to do here -}; - -template class relu : public Activation { - public: - // ************************************************* - // Relu Activation - // ************************************************* - static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - nnet::relu(data, res); - } -}; - -template class sigmoid : public Activation { - public: - // ************************************************* - // Sigmoid Activation - // ************************************************* - static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - nnet::sigmoid(data, res); - } -}; - -template class tanh : public Activation { - public: - // ************************************************* - // TanH Activation - // ************************************************* - static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - nnet::tanh(data, res); - } -}; - -} // namespace activation - -} // namespace nnet - -#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h.bak b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h.bak deleted file mode 100644 index 5ccf2ee570..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h.bak +++ /dev/null @@ -1,586 +0,0 @@ -#ifndef NNET_RECURSIVE_H_ -#define NNET_RECURSIVE_H_ - -#include "hls_stream.h" -#include "nnet_activation.h" -#include "nnet_common.h" -#include "nnet_dense.h" -#include "nnet_recr_activations.h" - -namespace nnet { - -struct lstm_config { - // Internal data type 
definitions - typedef float weight_t; - typedef float bias_t; - - // Layer Sizes - static const unsigned n_in = 2; - static const unsigned n_parts = 20; - static const unsigned n_out = 2; - static const unsigned n_state = 2; - static const unsigned n_4state = 8; - static const unsigned table_size = 1024; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const unsigned n_zeros = 0; - static const bool store_weights_in_bram = false; - static const bool use_static = true; - - template using activation_recr = nnet::activation::relu; - template using activation = nnet::activation::relu; -}; -// Long Short term Memory NN (LSTM) -// Resources: -// https://github.com/nicodjimenez/lstm/blob/master/lstm.py -// https://github.com/llSourcell/LSTM_Networks/blob/master/LSTM%20Demo.ipynb -// https://en.wikipedia.org/wiki/Long_short-term_memory -// Notes: -// - LSTM naming conventions adopted from the above links -// - s_newstate = activation(U*input + W*state) -// - h_output = activation(U*input + W*state)*activation(s_newstate) -// - If softmax is needed on output, perform *outside* this operations -// Originall had a version allows for the state in each layer to be saved, moved this to above (this requires are LARGE -// dense network at the end) -template -void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - res_T s_newstate[CONFIG_T::n_state], typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { - // Initialize the state variable -- will maintain state between function calls - - typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; - typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; - typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) - typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) - typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) - typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) - typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; - - //#pragma HLS ARRAY_PARTITION variable=h_newstate complete - //#pragma HLS ARRAY_PARTITION variable=s_newstate complete - //#pragma HLS ARRAY_PARTITION variable=tmpres complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_state complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_c complete - //#pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete - //#pragma HLS ARRAY_PARTITION variable=inputacc_c complete - //#pragma HLS ARRAY_PARTITION variable=s_actstate complete - - nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_newstate, tmpres_state, param_r, param_br); - - for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { - //#pragma HLS UNROLL - int index = iacc; - if (iacc > 2 * CONFIG_T::n_state - 1) - index = iacc + CONFIG_T::n_state; - inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; - } - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - //#pragma HLS UNROLL - int index = iacc + CONFIG_T::n_state * 2; - inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; - } - - CONFIG_T::template 
activation_recr::activation( - inputacc_ifo, tmpres_ifo); - - // Now for the confusion matrix - CONFIG_T::template activation::activation( - inputacc_c, tmpres_c); - - // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - //#pragma HLS UNROLL - s_newstate[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_newstate[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; - } - // Operation: h=act(s)*o - CONFIG_T::template activation::activation( - s_newstate, s_actstate); - - for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { - //#pragma HLS UNROLL - h_newstate[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; - } -} - -template -void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - res_T s_newstate[CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { - static res_T h_state[CONFIG_T::n_state]; - static res_T s_state[CONFIG_T::n_state]; - // Initialize the state variable -- will maintain state between function calls - typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; - typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; - typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) - typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) - typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) - typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) - typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; - - //#pragma HLS ARRAY_PARTITION variable=h_newstate complete - //#pragma HLS ARRAY_PARTITION variable=s_newstate complete - //#pragma HLS ARRAY_PARTITION variable=h_state complete - //#pragma HLS ARRAY_PARTITION variable=s_state complete - //#pragma HLS ARRAY_PARTITION variable=tmpres complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_state complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_c complete - //#pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete - //#pragma HLS ARRAY_PARTITION variable=inputacc_c complete - //#pragma HLS ARRAY_PARTITION variable=s_actstate complete - - if (reset_state) { - for (int i_state = 0; i_state < (CONFIG_T::n_state); i_state++) { - //#pragma HLS UNROLL - s_state[i_state] = 0; - h_state[i_state] = 0; - } - } - - nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_state, tmpres_state, param_r, - param_br); - - for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { - //#pragma HLS UNROLL - int index = iacc; - if (iacc > 2 * CONFIG_T::n_state - 1) - index = iacc + CONFIG_T::n_state; - inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; - } - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - //#pragma HLS UNROLL - int index = iacc + CONFIG_T::n_state * 2; - inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; - } - - CONFIG_T::template activation_recr::activation( - inputacc_ifo, tmpres_ifo); - - // Now for the confusion matrix - CONFIG_T::template activation::activation( - inputacc_c, tmpres_c); - - // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) - for 
(int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - //#pragma HLS UNROLL - s_state[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_state[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; - s_newstate[iacc] = s_state[iacc]; - } - // Operation: h=act(s)*o - CONFIG_T::template activation::activation( - s_state, s_actstate); - - for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { - //#pragma HLS UNROLL - h_state[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; - h_newstate[iacc] = h_state[iacc]; - } -} - -template -void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { - - res_T h_newstate[CONFIG_T::n_state]; - res_T s_newstate[CONFIG_T::n_state]; - data_T data_in[CONFIG_T::n_in]; - bool reset_state = true; - - //#pragma HLS ARRAY_PARTITION variable=h_newstate complete - //#pragma HLS ARRAY_PARTITION variable=s_newstate complete - - for (int ii = 0; ii < CONFIG_T::n_state; ii++) { - //#pragma HLS UNROLL - h_newstate[ii] = 0; - s_newstate[ii] = 0; - } - for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { - for (int j = 0; j < CONFIG_T::n_in; j++) { - //#pragma HLS UNROLL - data_in[j] = data[j + iloop * CONFIG_T::n_in]; - } - if (CONFIG_T::use_static) - nnet::lstm_static(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, - param_br); - else - nnet::lstm(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, - param_br); - if (CONFIG_T::n_sequence_out > 1) - for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { - //#pragma HLS UNROLL - res[i] = h_newstate[j]; - } - reset_state = false; - } - if (CONFIG_T::n_sequence_out == 1) - for (int i = 0; i < (CONFIG_T::n_state); i++) { - //#pragma HLS UNROLL - res[i] = h_newstate[i]; - } -} - -template -void lstm_stack(hls::stream &data_stream, hls::stream &res_stream, - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { - - typename res_T::value_type h_newstate[CONFIG_T::n_state]; - typename res_T::value_type s_newstate[CONFIG_T::n_state]; - //#pragma HLS ARRAY_PARTITION variable=h_newstate complete - //#pragma HLS ARRAY_PARTITION variable=s_newstate complete - - for (int ii = 0; ii < CONFIG_T::n_state; ii++) { - //#pragma HLS UNROLL - h_newstate[ii] = 0; - s_newstate[ii] = 0; - } - - typename data_T::value_type data_in[CONFIG_T::n_in]; - bool reset_state = true; - -DataPropagation: - for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { - if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { - // //#pragma HLS PIPELINE - } - data_T data_pack = data_stream.read(); - DataPack: - for (int i_pack = 0; i_pack < data_T::size; i_pack++) { - //#pragma HLS UNROLL - data_in[i_pack] = data_pack[i_pack]; - } - if (CONFIG_T::use_static) - nnet::lstm_static( - reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); - else - nnet::lstm( - reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); 
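-        // return-sequences case: emit this time step's hidden state as one output
-        // pack; when n_sequence_out == 1 only the final state is written after the
-        // loop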
- if (CONFIG_T::n_sequence_out > 1) { - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - ResPack_sequences: - for (int i_pack = 0; i_pack < res_T::size; i_pack++) { - //#pragma HLS UNROLL - res_pack[i_pack] = h_newstate[i_pack]; - } - res_stream.write(res_pack); - } - reset_state = false; - } - - if (CONFIG_T::n_sequence_out == 1) { - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - ResPack: - for (int i_pack = 0; i_pack < res_T::size; i_pack++) { - //#pragma HLS UNROLL - res_pack[i_pack] = h_newstate[i_pack]; - } - res_stream.write(res_pack); - } -} - -// Struct for the GRU template - -struct gru_config { - // Internal data type definitions - typedef float weight_t; - typedef float bias_t; - typedef float accum_t; - - // Layer Sizes - static const unsigned n_in = 2; - static const unsigned n_out = 2; - static const unsigned n_state = 2; - static const unsigned n_sequence = 2; - static const unsigned n_4state = 8; - static const unsigned table_size = 1024; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const bool use_static = true; - static const bool pytorch_order = false; - static const unsigned n_zeros = 0; - - template using activation_recr = nnet::activation::relu; - template using activation = nnet::activation::relu; -}; - -template -void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], // TODO - Check the layout of the param - // weights - refer page in copy!! - typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { - // Initialize the state variable -- will maintain state between function calls - typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; - typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; - typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; - typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated i,f,o matrices (keras notation) - typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated c-matrix (keras notation) - typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // i,f,o matrices (keras notation) - typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // c-matrix (keras notation) - - //#pragma HLS ARRAY_PARTITION variable=h_newstate complete - //#pragma HLS ARRAY_PARTITION variable=tmpres complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_zr complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_h complete - //#pragma HLS ARRAY_PARTITION variable=inputacc_zr complete - //#pragma HLS ARRAY_PARTITION variable=inputacc_h complete - - nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_newstate, tmpres_state_zr, param_zr, - param_br); - - // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres - // initialized with biases -- DONE - for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { - //#pragma HLS UNROLL - int index = iacc; - inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; - } - - // Activation function Sub layer -- START - CONFIG_T::template activation_recr::activation(inputacc_zr, 
tmpres_zr); - - // Activation function Sub layer -- END - - // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - //#pragma HLS UNROLL - if (CONFIG_T::pytorch_order) - tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; - else - tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; - } - - // Assuming reset_after is false - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - //#pragma HLS UNROLL - int index = iacc + CONFIG_T::n_state * 2; - inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; - } - - // Now run the activation on this guy - CONFIG_T::template activation::activation(inputacc_h, tmpres_h); - - // Mix the stat with the previous state - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - //#pragma HLS UNROLL - if (CONFIG_T::pytorch_order) - h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) + - h_newstate[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]); - else - h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_newstate[iacc] * tmpres_zr[iacc]); - } -} - -template -void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { - // Initialize the state variable -- will maintain state between function calls - - static res_T h_state[CONFIG_T::n_state]; - typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; - typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; - typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; - typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated i,f,o matrices (keras notation) - typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated c-matrix (keras notation) - typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // i,f,o matrices (keras notation) - typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // c-matrix (keras notation) - - //#pragma HLS ARRAY_PARTITION variable=h_state complete - //#pragma HLS ARRAY_PARTITION variable=h_newstate complete - //#pragma HLS ARRAY_PARTITION variable=tmpres complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_zr complete - //#pragma HLS ARRAY_PARTITION variable=tmpres_h complete - //#pragma HLS ARRAY_PARTITION variable=inputacc_zr complete - //#pragma HLS ARRAY_PARTITION variable=inputacc_h complete - - if (reset_state) { - for (int i_h_state = 0; i_h_state < (CONFIG_T::n_state); i_h_state++) { - //#pragma HLS UNROLL - h_state[i_h_state] = 0; - } - } - - nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_state, tmpres_state_zr, param_zr, - param_br); - - // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres - // initialized with biases -- DONE - for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { - //#pragma HLS UNROLL - int index = iacc; - inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; - } - - // Activation function Sub layer -- START - 
-
-    nnet::dense<data_T, typename CONFIG_T::accum_t, typename CONFIG_T::mult_config1>(data, tmpres, param, param_b);
-    nnet::dense<res_T, typename CONFIG_T::accum_t, typename CONFIG_T::mult_config2>(h_state, tmpres_state_zr, param_zr,
-                                                                                    param_br);
-
-    // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres
-    // initialized with biases -- DONE
-    for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) {
-        //#pragma HLS UNROLL
-        int index = iacc;
-        inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index];
-    }
-
-    // Activation function Sub layer -- START
-    CONFIG_T::template activation_recr<typename CONFIG_T::accum_t, typename CONFIG_T::accum_t,
-                                       typename CONFIG_T::ACT_CONFIG_GRU>::activation(inputacc_zr, tmpres_zr);
-
-    // Activation function Sub layer -- END
-
-    // Hadamard product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate
-    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
-        //#pragma HLS UNROLL
-        if (CONFIG_T::pytorch_order)
-            tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)];
-        else
-            tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)];
-    }
-
-    // Assuming reset_after is false
-    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
-        //#pragma HLS UNROLL
-        int index = iacc + CONFIG_T::n_state * 2;
-        inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc];
-    }
-
-    // Now run the activation on this guy
-    CONFIG_T::template activation<typename CONFIG_T::accum_t, typename CONFIG_T::accum_t,
-                                  typename CONFIG_T::ACT_CONFIG_T>::activation(inputacc_h, tmpres_h);
-
-    // Mix the state with the previous state
-    for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) {
-        //#pragma HLS UNROLL
-        if (CONFIG_T::pytorch_order)
-            h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) +
-                                    h_state[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]);
-        else
-            h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_state[iacc] * tmpres_zr[iacc]);
-        h_newstate[iacc] = h_state[iacc];
-    }
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void gru_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state],
-               typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in],
-               typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state],
-               typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3],
-               typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) {
-
-    res_T h_state[CONFIG_T::n_state];
-    data_T data_in[CONFIG_T::n_in];
-    bool reset_state = true;
-
-    //#pragma HLS ARRAY_PARTITION variable=h_state complete
-    //#pragma HLS ARRAY_PARTITION variable=data_in complete
-
-    for (int ii = 0; ii < CONFIG_T::n_state; ii++) {
-        //#pragma HLS UNROLL
-        h_state[ii] = 0;
-    }
-    for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) {
-        for (int j = 0; j < CONFIG_T::n_in; j++) {
-            //#pragma HLS UNROLL
-            data_in[j] = data[j + iloop * CONFIG_T::n_in];
-        }
-        if (CONFIG_T::use_static)
-            nnet::gru_static<data_T, res_T, CONFIG_T>(reset_state, data_in, h_state, param, param_zr, param_b, param_br);
-        else
-            nnet::gru<data_T, res_T, CONFIG_T>(reset_state, data_in, h_state, param, param_zr, param_b, param_br);
-        if (CONFIG_T::n_sequence_out > 1)
-            for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) {
-                //#pragma HLS UNROLL
-                res[i] = h_state[j];
-            }
-        reset_state = false;
-    }
-    if (CONFIG_T::n_sequence_out == 1)
-        for (int i = 0; i < (CONFIG_T::n_state); i++) {
-            //#pragma HLS UNROLL
-            res[i] = h_state[i];
-        }
-}
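-// Streaming overload of gru_stack: same GRU recurrence as above, but sequence
-// elements arrive packed in hls::stream words and results are written out as
-// they are produced.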
-
-template <class data_T, class res_T, typename CONFIG_T>
-void gru_stack(hls::stream<data_T> &data_stream, hls::stream<res_T> &res_stream,
-               typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in],
-               typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state],
-               typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3],
-               typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) {
-
-    typename res_T::value_type h_newstate[CONFIG_T::n_state];
-    //#pragma HLS ARRAY_PARTITION variable=h_newstate complete
-    for (int ii = 0; ii < CONFIG_T::n_state; ii++) {
-        //#pragma HLS UNROLL
-        h_newstate[ii] = 0;
-    }
-
-    typename data_T::value_type data_in[CONFIG_T::n_in];
-    bool reset_state = true;
-
-DataPropagation:
-    for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) {
-        if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) {
-            // //#pragma HLS PIPELINE
-        }
-        data_T data_pack = data_stream.read();
-    DataPack:
-        for (int i_pack = 0; i_pack < data_T::size; i_pack++) {
-            //#pragma HLS UNROLL
-            data_in[i_pack] = data_pack[i_pack];
-        }
-        if (CONFIG_T::use_static)
-            nnet::gru_static<typename data_T::value_type, typename res_T::value_type, CONFIG_T>(
-                reset_state, data_in, h_newstate, param, param_zr, param_b, param_br);
-        else
-            nnet::gru<typename data_T::value_type, typename res_T::value_type, CONFIG_T>(reset_state, data_in, h_newstate,
-                                                                                         param, param_zr, param_b, param_br);
-        if (CONFIG_T::n_sequence_out > 1) {
-            res_T res_pack;
-            PRAGMA_DATA_PACK(res_pack)
-        ResPack_sequences:
-            for (int i_pack = 0; i_pack < res_T::size; i_pack++) {
-                //#pragma HLS UNROLL
-                res_pack[i_pack] = h_newstate[i_pack];
-            }
-            res_stream.write(res_pack);
-        }
-        reset_state = false;
-    }
-
-    if (CONFIG_T::n_sequence_out == 1) {
-        res_T res_pack;
-        PRAGMA_DATA_PACK(res_pack)
-    ResPack:
-        for (int i_pack = 0; i_pack < res_T::size; i_pack++) {
-            //#pragma HLS UNROLL
-            res_pack[i_pack] = h_newstate[i_pack];
-        }
-        res_stream.write(res_pack);
-    }
-}
-
-} // namespace nnet
-
-#endif
diff --git a/test/docker/README.md b/test/docker/README.md
deleted file mode 100644
index 0446228a31..0000000000
--- a/test/docker/README.md
+++ /dev/null
@@ -1,72 +0,0 @@
-# Docker image for hls4ml with Vivado
-
-Extract the Vivado installation archive here and provide the path to the license server.
-For example:
-
-```
-docker build --network=host -t hls4ml-with-vivado --build-arg LICENSE_SERVER="1234@myserver" .
-```
-
-By default, version 2018.2 of Vivado is used.
-
-## Using other versions of Vivado
-
-To install a specific version of Vivado, first generate the silent installation configuration file from the Vivado installation folder by running:
-
-```
-./xsetup -b configGen
-```
-
-Choose the products/devices you would like to install by modifying the generated file. Name the file `install_config.txt` and place it in the directory with `Dockerfile`. Edit the `Dockerfile` to add the folder of your Vivado installation and build the image using the command provided above.
-
-## Using the created image
-
-The image can be used in a standard way, e.g., with the [`docker run`](https://docs.docker.com/engine/reference/commandline/run/) command:
-
-```
-docker run -it --rm hls4ml-with-vivado
-```
-
-Alternatively, a reusable container can be created and later accessed with the [`docker exec`](https://docs.docker.com/engine/reference/commandline/exec/) command:
-
-```
-docker run -dit --name my-hls4ml-container hls4ml-with-vivado
-docker exec -it my-hls4ml-container bash
-```
-
-We recommend using docker volumes to mount the local filesystem into the docker container in order to access files on the host from the image.
-
-```
-docker run -it --rm -v /path/on/host:/home/hls4ml/path/in/container hls4ml-with-vivado
-```
-
-Consult the docker [documentation](https://docs.docker.com/storage/volumes/) for more information about volumes.
-
-## GUI support
-
-By default, the image is built without X11 libraries needed to launch Vivado HLS GUI. To add GUI support, pass `--build-arg GUI_SUPPORT=1` to the build command. For example:
-
-```
-docker build --network=host -t hls4ml-with-vivado --build-arg LICENSE_SERVER="1234@myserver" --build-arg GUI_SUPPORT=1 .
-```
-
-To launch GUI apps in a Docker container, map `/tmp/.X11-unix` and the `DISPLAY` environment variable from host to the container, e.g.,
-
-```
-docker run -it -e DISPLAY -v /tmp/.X11-unix:/tmp/.X11-unix hls4ml-with-vivado
-```
-
-If your X11 session requires a valid user, the `Xauthority` file must be mapped into the container. This file is either in the user's home directory (`$HOME/.Xauthority`) or its location is specified in the `XAUTHORITY` environment variable. For example:
-
-```
-docker run -it -e DISPLAY -v /tmp/.X11-unix:/tmp/.X11-unix -v $HOME/.Xauthority:/home/hls4ml/.Xauthority hls4ml-with-vivado
-```
-
-## Customizing the default user
-
-The default user (named *hls4ml*) can have its *id* and *group* changed to match a specific user on the host machine with `USER_ID` and `GROUP_ID` build arguments. This is useful if you want to add a shared volume. For example:
-
-```
-docker build --network=host -t hls4ml-with-vivado --build-arg LICENSE_SERVER="1234@myserver" --build-arg USER_ID=`id -u` --build-arg GROUP_ID=`id -g` .
-```
-
diff --git a/test/docker/install_config-2017.2.txt b/test/docker/install_config-2017.2.txt
deleted file mode 100644
index 58e7efc258..0000000000
--- a/test/docker/install_config-2017.2.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-#### Vivado HL Design Edition Install Configuration ####
-Edition=Vivado HL Design Edition
-
-# Path where Xilinx software will be installed.
-Destination=/opt/Xilinx
-
-# Choose the Products/Devices the you would like to install.
-Modules=Zynq UltraScale+ MPSoC:0,Software Development Kit (SDK):0,DocNav:0,Kintex UltraScale:1,Engineering Sample Devices:0,Kintex-7:1,Virtex UltraScale+:0,Zynq-7000:0,Kintex UltraScale+ ES:0,Kintex UltraScale+:0,Spartan-7:0,Zynq UltraScale+ RFSoC ES:0,Virtex-7:1,Virtex UltraScale:1,Virtex UltraScale+ ES:0,Zynq UltraScale+ MPSoC ES:0,Artix-7:0
-
-# Choose the post install scripts you'd like to run as part of the finalization step. Please note that some of these scripts may require user interaction during runtime.
-InstallOptions=Acquire or Manage a License Key:0,Enable WebTalk for SDK to send usage statistics to Xilinx:1,Enable WebTalk for Vivado to send usage statistics to Xilinx (Always enabled for WebPACK license):1
-
-## Shortcuts and File associations ##
-# Choose whether Start menu/Application menu shortcuts will be created or not.
-CreateProgramGroupShortcuts=1
-
-# Choose the name of the Start menu/Application menu shortcut. This setting will be ignored if you choose NOT to create shortcuts.
-ProgramGroupFolder=Xilinx Design Tools
-
-# Choose whether shortcuts will be created for All users or just the Current user. Shortcuts can be created for all users only if you run the installer as administrator.
-CreateShortcutsForAllUsers=0
-
-# Choose whether shortcuts will be created on the desktop or not.
-CreateDesktopShortcuts=0
-
-# Choose whether file associations will be created or not.
-CreateFileAssociation=0
-
diff --git a/test/docker/install_config.txt b/test/docker/install_config.txt
deleted file mode 100644
index 43e2e085b6..0000000000
--- a/test/docker/install_config.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-#### Vivado HL Design Edition Install Configuration ####
-Edition=Vivado HL Design Edition
-
-# Path where Xilinx software will be installed.
-Destination=/opt/Xilinx
-
-# Choose the Products/Devices the you would like to install.
-Modules=Zynq UltraScale+ MPSoC:0,DocNav:0,Kintex-7:1,Virtex UltraScale+:0,Virtex UltraScale+ HBM ES:0,Zynq-7000:0,Kintex UltraScale+:0,Model Composer:0,ARM Cortex-A53:0,Spartan-7:0,Zynq UltraScale+ RFSoC ES:0,Engineering Sample Devices:0,Kintex UltraScale:1,Virtex UltraScale:1,SDK Core Tools:1,Zynq UltraScale+ RFSoC:0,ARM Cortex-A9:0,ARM Cortex R5:0,Virtex-7:1,Virtex UltraScale+ 58G ES:0,Zynq UltraScale+ MPSoC ES:0,MicroBlaze:0,Artix-7:0
-
-# Choose the post install scripts you'd like to run as part of the finalization step. Please note that some of these scripts may require user interaction during runtime.
-InstallOptions=Acquire or Manage a License Key:0,Enable WebTalk for SDK to send usage statistics to Xilinx:1,Enable WebTalk for Vivado to send usage statistics to Xilinx (Always enabled for WebPACK license):1
-
-## Shortcuts and File associations ##
-# Choose whether Start menu/Application menu shortcuts will be created or not.
-CreateProgramGroupShortcuts=1
-
-# Choose the name of the Start menu/Application menu shortcut. This setting will be ignored if you choose NOT to create shortcuts.
-ProgramGroupFolder=Xilinx Design Tools
-
-# Choose whether shortcuts will be created for All users or just the Current user. Shortcuts can be created for all users only if you run the installer as administrator.
-CreateShortcutsForAllUsers=0
-
-# Choose whether shortcuts will be created on the desktop or not.
-CreateDesktopShortcuts=0
-
-# Choose whether file associations will be created or not.
-CreateFileAssociation=0
-
diff --git a/test/pytest/test_cnn_mnist.py b/test/pytest/test_cnn_mnist.py
deleted file mode 100644
index 58fcfeb7f4..0000000000
--- a/test/pytest/test_cnn_mnist.py
+++ /dev/null
@@ -1,93 +0,0 @@
-from pathlib import Path
-
-import numpy as np
-import pytest
-from sklearn.metrics import accuracy_score
-from tensorflow.keras.datasets import mnist
-from tensorflow.keras.layers import Activation, AveragePooling2D, Conv2D, Dense, Flatten, MaxPooling2D
-from tensorflow.keras.models import Sequential
-from tensorflow.keras.utils import to_categorical
-
-import hls4ml
-
-test_root_path = Path(__file__).parent
-
-
-@pytest.fixture(scope='module')
-def mnist_data():
-    (x_train, y_train), (x_test, y_test) = mnist.load_data()
-    x_train = x_train.astype("float32") / 255.0
-    x_test = x_test.astype("float32") / 255.0
-    x_train = np.expand_dims(x_train, -1)
-    x_test = np.expand_dims(x_test, -1)
-    y_train = to_categorical(y_train, 10)
-    y_test = to_categorical(y_test, 10)
-    x_test, y_test = x_test[:1000], y_test[:1000]
-    return x_train, y_train, x_test, y_test
-
-
-@pytest.fixture(scope='module')
-def keras_model(mnist_data):
-    # The aim of this model is to test different CNN parameters, including:
-    # The common filter sizes, 3x3 and 5x5
-    # A non-power of 2 number of filters
-    # Both Average and Max Pooling
-    # Both Same and Valid Padding
-    x_train, y_train, x_test, y_test = mnist_data
-    keras_model = Sequential()
-    keras_model.add(Conv2D(4, (3, 3), input_shape=(28, 28, 1), padding='same'))
-    keras_model.add(Activation('relu'))
-    keras_model.add(MaxPooling2D())
-    keras_model.add(Conv2D(6, (5, 5), padding='valid'))
-    keras_model.add(Activation('relu'))
-    keras_model.add(AveragePooling2D())
-    keras_model.add(Flatten())
-    keras_model.add(Dense(10, kernel_initializer='lecun_uniform'))
-    keras_model.add(Activation('softmax', name='softmax'))
-    keras_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
-    keras_model.fit(x_train, y_train, batch_size=32, epochs=5, verbose=0)
-    return keras_model
-
-
-@pytest.mark.parametrize(
-    'backend,io_type,strategy',
-    [
-        ('Quartus', 'io_parallel', 'resource'),
-        ('Quartus', 'io_stream', 'resource'),
-        ('Vivado', 'io_parallel', 'resource'),
-        ('Vivado', 'io_parallel', 'latency'),
-        ('Vivado', 'io_stream', 'latency'),
-        ('Vivado', 'io_stream', 'resource'),
-        ('Vitis', 'io_parallel', 'resource'),
-        ('Vitis', 'io_parallel', 'latency'),
-        ('Vitis', 'io_stream', 'latency'),
-        ('Vitis', 'io_stream', 'resource'),
-    ],
-)
-def test_mnist_cnn(keras_model, mnist_data, backend, io_type, strategy):
-    x_train, y_train, x_test, y_test = mnist_data
-
-    hls_config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name', backend=backend)
-    hls_config['Model']['Strategy'] = strategy
-    hls_config['LayerName']['softmax']['Implementation'] = 'stable'
-    output_dir = str(test_root_path / f'hls4mlprj_cnn_mnist_{backend}_{io_type}_{strategy}')
-
-    hls_model = hls4ml.converters.convert_from_keras_model(
-        keras_model, hls_config=hls_config, output_dir=output_dir, backend=backend, io_type=io_type
-    )
-    hls_model.compile()
-
-    # Model under test predictions and accuracy
-    y_keras = keras_model.predict(x_test)
-    y_hls4ml = hls_model.predict(x_test)
-
-    acc_keras = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_keras, axis=1))
-    acc_hls4ml = accuracy_score(np.argmax(y_test, axis=1), np.argmax(y_hls4ml, axis=1))
-    rel_diff = abs(acc_keras - acc_hls4ml) / acc_keras
-
-    print(f'Accuracy keras: {acc_keras}')
-    print(f'Accuracy hls4ml: {acc_hls4ml}')
-    print(f'Relative difference: {rel_diff}')
-
-    assert acc_keras > 0.95 and rel_diff < 0.03
-

From 20a0199ecdb1f17da47b850badab9128e605360d Mon Sep 17 00:00:00 2001
From: Jan-Frederik Schulte
Date: Fri, 13 Sep 2024 16:17:56 -0400
Subject: [PATCH 34/55] trying to clean the diff

---
 docs/status.rst | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/docs/status.rst b/docs/status.rst
index dc3a6d8f18..4ff4d33282 100644
--- a/docs/status.rst
+++ b/docs/status.rst
@@ -60,12 +60,6 @@ A summary of the on-going status of the ``hls4ml`` tool is in the table below.
      - ``supported``
      - ``supported``
      - ``experimental``
-   * - RNN (LSTM)
-     - ``supported``
-     - ``N/A``
-     - ``supported``
-     - ``supported``
-     - ``experimental``
    * - RNN (LSTM)
      - ``supported``
      - ``N/A``

From ddccde2025137af413bb9fa432009d1da45e75ff Mon Sep 17 00:00:00 2001
From: Rian Flynn
Date: Tue, 17 Sep 2024 16:52:17 -0400
Subject: [PATCH 35/55] trying to clean the diff

---
 .github/ISSUE_TEMPLATE/bug_report.md          |    88 +-
 .github/ISSUE_TEMPLATE/config.yml             |    10 +-
 .github/ISSUE_TEMPLATE/feature_request.md     |    56 +-
 .../vivado/passes/broadcast_stream.py         |   234 +-
 .../vivado_accelerator/supported_boards.json  |    84 +-
 hls4ml/converters/keras/pooling.py            |   182 +-
 .../templates/quartus/ac_types/ac_channel.h   |  1110 +-
 .../templates/quartus/ac_types/ac_complex.h   |   890 +-
 hls4ml/templates/quartus/ac_types/ac_fixed.h  |  3092 ++--
 hls4ml/templates/quartus/ac_types/ac_float.h  |  2392 +--
 hls4ml/templates/quartus/ac_types/ac_int.h    |  6198 +++----
 hls4ml/templates/quartus/ac_types/ac_sc.h     |  1104 +-
 .../templates/quartus/ac_types/ac_std_float.h |  4636 ++---
 hls4ml/templates/quartus/ac_types/stream.h    |    70 +-
 hls4ml/templates/quartus/firmware/defines.h   |    94 +-
 .../templates/quartus/firmware/myproject.cpp  |    96 +-
 hls4ml/templates/quartus/firmware/myproject.h |    96 +-
 .../firmware/nnet_utils/nnet_batchnorm.h      |   208 +-
 .../quartus/firmware/nnet_utils/nnet_common.h |   142 +-
 .../quartus/firmware/nnet_utils/nnet_conv1d.h |   128 +-
 .../quartus/firmware/nnet_utils/nnet_dense.h  |   338 +-
 .../nnet_utils/nnet_dense_compressed.h        |   160 +-
 .../firmware/nnet_utils/nnet_helpers.h        |   280 +-
 .../quartus/firmware/nnet_utils/nnet_merge.h  |   498 +-
 .../quartus/firmware/nnet_utils/nnet_mult.h   |   226 +-
 .../firmware/nnet_utils/nnet_padding.h        |   198 +-
 .../quartus/myproject_test_parallel.cpp       |   224 +-
 hls4ml/templates/vivado/ap_types/ap_common.h  |   752 +-
 hls4ml/templates/vivado/ap_types/ap_decl.h    |   424 +-
 hls4ml/templates/vivado/ap_types/ap_fixed.h   |   720 +-
 .../templates/vivado/ap_types/ap_fixed_base.h |  4708 ++---
 .../templates/vivado/ap_types/ap_fixed_ref.h  |  1436 +-
 .../vivado/ap_types/ap_fixed_special.h        |   460 +-
 hls4ml/templates/vivado/ap_types/ap_int.h     |   660 +-
 .../templates/vivado/ap_types/ap_int_base.h   |  3770 ++--
 hls4ml/templates/vivado/ap_types/ap_int_ref.h |  2692 +--
 .../vivado/ap_types/ap_int_special.h          |   446 +-
 .../vivado/ap_types/etc/ap_private.h          | 14398 ++++++++--------
 hls4ml/templates/vivado/ap_types/hls_stream.h |   526 +-
 .../vivado/ap_types/utils/x_hls_utils.h       |   160 +-
 hls4ml/templates/vivado/build_lib.sh          |    34 +-
 .../templates/vivado/firmware/myproject.cpp   |    46 +-
 hls4ml/templates/vivado/firmware/myproject.h  |    38 +-
 hls4ml/templates/vivado/myproject_test.cpp    |   188 +-
 .../templates/vivado/nnet_utils/nnet_array.h  |   104 +-
 .../vivado/nnet_utils/nnet_batchnorm.h        |   248 +-
 .../vivado/nnet_utils/nnet_batchnorm_stream.h |   246 +-
 .../templates/vivado/nnet_utils/nnet_common.h |   150 +-
 .../templates/vivado/nnet_utils/nnet_conv1d.h |   132 +-
 .../vivado/nnet_utils/nnet_conv1d_stream.h    |   178 +-
 .../templates/vivado/nnet_utils/nnet_conv2d.h |   150 +-
 .../vivado/nnet_utils/nnet_conv2d_latency.h   |   178 +-
 .../vivado/nnet_utils/nnet_dense_compressed.h |   180 +-
 .../vivado/nnet_utils/nnet_dense_latency.h    |   144 +-
 .../vivado/nnet_utils/nnet_dense_resource.h   |   526 +-
 .../templates/vivado/nnet_utils/nnet_garnet.h |  1632 +-
 .../vivado/nnet_utils/nnet_helpers.h          |   764 +-
 .../templates/vivado/nnet_utils/nnet_merge.h  |   512 +-
 .../vivado/nnet_utils/nnet_merge_stream.h     |   740 +-
 .../templates/vivado/nnet_utils/nnet_mult.h   |   232 +-
 .../vivado/nnet_utils/nnet_padding.h          |   290 +-
 .../vivado/nnet_utils/nnet_pooling.h          |   626 +-
 .../vivado/nnet_utils/nnet_recr_activations.h |   112 +-
 .../vivado/nnet_utils/nnet_recurrent.h        |  1172 +-
 .../krnl_rtl_src/krnl_rtl_control_s_axi.v     |   844 +-
 .../alveo/python_drivers/axi_stream_driver.py |   202 +-
 .../pynq-z2/tcl_scripts/axi_lite_design.tcl   |    52 +-
 .../pynq-z2/tcl_scripts/axi_stream_design.tcl |   118 +-
 .../zcu102/tcl_scripts/axi_stream_design.tcl  |   116 +-
 hls4ml/utils/plot.py                          |   448 +-
 70 files changed, 32194 insertions(+), 32194 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index 1f0191f232..d0aa96a65b 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -1,44 +1,44 @@
----
-name: Bug report
-about: Something isn't working as expected
-title: ''
-labels: bug
-assignees: ''
-
----
-
-
-## Prerequisites
-Please make sure to check off these prerequisites before submitting a bug report.
-- [ ] Test that the bug appears on the current version of the master branch. Make sure to include the commit hash of the commit you checked out.
-- [ ] Check that the issue hasn't already been reported, by checking the currently open issues.
-- [ ] If there are steps to reproduce the problem, make sure to write them down below.
-- [ ] If relevant, please include the hls4ml project files, which were created directly before and/or after the bug.
-
-## Quick summary
-Please give a brief and concise description of the bug.
-
-## Details
-Please add to the following sections to describe the bug as accurately as possible.
-
-### Steps to Reproduce
-Add what needs to be done to reproduce the bug. Add *commented* code examples and make sure to include the original model files / code, and the commit hash you are working on.
-
-1. Clone the hls4ml repository
-2. Checkout the master branch, with commit hash: [...]
-3. Run conversion [...] on model file with code [...]
-4. [Further steps ...]
-
-### Expected behavior
-Please add a brief description of what you expected to happen.
-
-### Actual behavior
-Describe what actually happens instead.
-
-## Optional
-
-### Possible fix
-If you already know where the issue stems from, or you have a hint please let us know.
-
-### Additional context
-Add any other context about the problem here.
+---
+name: Bug report
+about: Something isn't working as expected
+title: ''
+labels: bug
+assignees: ''
+
+---
+
+
+## Prerequisites
+Please make sure to check off these prerequisites before submitting a bug report.
+- [ ] Test that the bug appears on the current version of the master branch. Make sure to include the commit hash of the commit you checked out.
+- [ ] Check that the issue hasn't already been reported, by checking the currently open issues.
+- [ ] If there are steps to reproduce the problem, make sure to write them down below.
+- [ ] If relevant, please include the hls4ml project files, which were created directly before and/or after the bug.
+
+## Quick summary
+Please give a brief and concise description of the bug.
+
+## Details
+Please add to the following sections to describe the bug as accurately as possible.
+
+### Steps to Reproduce
+Add what needs to be done to reproduce the bug. Add *commented* code examples and make sure to include the original model files / code, and the commit hash you are working on.
+
+1. Clone the hls4ml repository
+2. Checkout the master branch, with commit hash: [...]
+3. Run conversion [...] on model file with code [...]
+4. [Further steps ...]
+
+### Expected behavior
+Please add a brief description of what you expected to happen.
+
+### Actual behavior
+Describe what actually happens instead.
+
+## Optional
+
+### Possible fix
+If you already know where the issue stems from, or you have a hint please let us know.
+
+### Additional context
+Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index 776bc33c31..907ac6db49 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,5 +1,5 @@
-blank_issues_enabled: false
-contact_links:
-  - name: Talk and engage with the community
-    url: https://github.com/fastmachinelearning/hls4ml/discussions/categories/general
-    about: Check out the GitHub discussions page for hls4ml. This is the best way to get in touch with us. In particular, if you have a question about hls4ml or a general problem that is likely not a bug.
+blank_issues_enabled: false
+contact_links:
+  - name: Talk and engage with the community
+    url: https://github.com/fastmachinelearning/hls4ml/discussions/categories/general
+    about: Check out the GitHub discussions page for hls4ml. This is the best way to get in touch with us. In particular, if you have a question about hls4ml or a general problem that is likely not a bug.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
index 84a6247d50..1739f9d99f 100644
--- a/.github/ISSUE_TEMPLATE/feature_request.md
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -1,28 +1,28 @@
----
-name: Feature request
-about: Suggest an idea for hls4ml
-title: ''
-labels: enhancement
-assignees: ''
-
----
-
-## Prerequisites
-Please talk to us before creating a new feature request, so that you can check that the idea is not already in active development.
-
-You can present your idea over here at the GitHub discussions page for hls4ml: https://github.com/fastmachinelearning/hls4ml/discussions/categories/ideas
-
-Even if an idea is already being worked on, you can still create a feature request,
-if you would like to open a discussion about the feature or want to contribute to it.
-
-## Details
-Please add to the following sections to describe the feature as accurately as possible.
-
-### New behavior
-Please add a brief and concise description of what you would like to happen in hls4ml in the future.
-
-### Motivation
-Please tell us why this feature is important to the community.
-
-### Parts of hls4ml being affected
-Please describe which parts of hls4ml would be affected by this feature.
+---
+name: Feature request
+about: Suggest an idea for hls4ml
+title: ''
+labels: enhancement
+assignees: ''
+
+---
+
+## Prerequisites
+Please talk to us before creating a new feature request, so that you can check that the idea is not already in active development.
+
+You can present your idea over here at the GitHub discussions page for hls4ml: https://github.com/fastmachinelearning/hls4ml/discussions/categories/ideas
+
+Even if an idea is already being worked on, you can still create a feature request,
+if you would like to open a discussion about the feature or want to contribute to it.
+
+## Details
+Please add to the following sections to describe the feature as accurately as possible.
+
+### New behavior
+Please add a brief and concise description of what you would like to happen in hls4ml in the future.
+
+### Motivation
+Please tell us why this feature is important to the community.
+
+### Parts of hls4ml being affected
+Please describe which parts of hls4ml would be affected by this feature.
diff --git a/hls4ml/backends/vivado/passes/broadcast_stream.py b/hls4ml/backends/vivado/passes/broadcast_stream.py
index ed6ca55f18..ec6322cf78 100644
--- a/hls4ml/backends/vivado/passes/broadcast_stream.py
+++ b/hls4ml/backends/vivado/passes/broadcast_stream.py
@@ -1,117 +1,117 @@
-import numpy as np
-
-from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
-from hls4ml.model.layers import Concatenate, Layer, Merge, register_layer
-from hls4ml.model.optimizer import OptimizerPass
-
-
-class Broadcast(Layer):
-    '''Inserted between layers for broadcasting.'''
-
-    def initialize(self):
-        shape = self.attributes['target_shape']
-        if shape[0] is None:
-            shape = shape[1:]
-        dims = [f'N_SIZE_{i}_{self.index}' for i in range(1, len(shape) + 1)]
-        self.add_output_variable(shape, dims)
-
-
-broadcast_function_template = 'nnet::broadcast_stream<{input_t}, {output_t}, {config}>({input}, {output});'
-broadcast_config_template = """struct config{index} : nnet::broadcast_config {{
-    static const unsigned in_width = {in_width};
-    static const unsigned in_height = {in_height};
-    static const unsigned in_chan = {in_chan};
-    static const unsigned out_width = {out_width};
-    static const unsigned out_height = {out_height};
-    static const unsigned out_chan = {out_chan};
-}};\n"""
-broadcast_include_list = ['nnet_utils/nnet_stream.h']
-
-
-class BroadcastConfigTemplate(LayerConfigTemplate):
-    def __init__(self):
-        super().__init__(Broadcast)
-        self.template = broadcast_config_template
-
-    def format(self, node):
-        params = self._default_config_params(node)
-        params['in_height'] = node.get_input_variable().shape[0]
-        params['in_width'] = node.get_input_variable().shape[1]
-        params['in_chan'] = node.get_input_variable().shape[2]
-        params['out_height'] = node.get_output_variable().shape[0]
-        params['out_width'] = node.get_output_variable().shape[1]
-        params['out_chan'] = node.get_output_variable().shape[2]
-
-        return self.template.format(**params)
-
-
-class BroadcastFunctionTemplate(FunctionCallTemplate):
-    def __init__(self):
-        super().__init__(Broadcast, include_header=broadcast_include_list)
-        self.template = broadcast_function_template
-
-    def format(self, node):
-        params = self._default_function_params(node)
-        return self.template.format(**params)
-
-
-def register_broadcast_stream(backend):
-    # Register the layer types to the layer map
-    register_layer('Broadcast', Broadcast)
-
-    # Register the optimization passes
-    backend.register_pass('broadcast_stream', BroadcastStream)
-
-    # Register template passes
-    backend.register_template(BroadcastConfigTemplate)
-    backend.register_template(BroadcastFunctionTemplate)
-
-
-class BroadcastStream(OptimizerPass):
-    def match(self, node):
-        if isinstance(node, Merge) and not isinstance(node, Concatenate):
-            inp1 = node.get_input_variable(node.inputs[0])
-            inp2 = node.get_input_variable(node.inputs[1])
-            return inp1.shape != inp2.shape
-        else:
-            return False
-
-    def transform(self, model, node):
-        if model.config.backend.name not in ['Vivado'] or model.config.get_config_value('IOType') != 'io_stream':
-            return False
-
-        inp = [node.get_input_variable(inp_name) for inp_name in node.inputs]
-
-        if np.prod(inp[0].shape) > np.prod(inp[1].shape):
-            idx = 1
-            attrs = {'target_shape': inp[0].shape}
-        else:
-            idx = 0
-            attrs = {'target_shape': inp[1].shape}
-
-        def supported_broadcast(inp_shape, target_shape):
-            # Must be (H, W, C)
-            if not len(inp_shape) == 3:
-                return False
-            # Supported: (1, 1, C) -> (H, W, C)
-            if inp_shape[0] == inp_shape[1] == 1 and inp_shape[2] == target_shape[2]:
-                return True
-            # Supported: (H, W, 1) -> (H, W, C)
-            if inp_shape[2] == 1 and inp_shape[0] == target_shape[0] and inp_shape[1] == target_shape[1]:
-                return True
-            return False
-
-        brdcst_inp = node.inputs[idx]
-        inp_shape = node.get_input_variable(brdcst_inp).shape
-        target_shape = attrs['target_shape']
-        if not supported_broadcast(inp_shape, target_shape):
-            raise RuntimeError(
-                f'Unsupported broadcast type for stream: {inp_shape} -> {target_shape};'
-                + 'Only (1, 1, C) -> (H, W, C) and (H, W, 1) -> (H, W, C) currently supported'
-            )
-        brdcst_out = 'broadcast_' + brdcst_inp
-        brdcst_layer = model.make_node('Broadcast', brdcst_out, attrs, [brdcst_inp].copy())
-        model.insert_node(brdcst_layer, before=node, input_idx=idx)
-        node.inputs[idx] = brdcst_out
-
-        return True
+import numpy as np
+
+from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
+from hls4ml.model.layers import Concatenate, Layer, Merge, register_layer
+from hls4ml.model.optimizer import OptimizerPass
+
+
+class Broadcast(Layer):
+    '''Inserted between layers for broadcasting.'''
+
+    def initialize(self):
+        shape = self.attributes['target_shape']
+        if shape[0] is None:
+            shape = shape[1:]
+        dims = [f'N_SIZE_{i}_{self.index}' for i in range(1, len(shape) + 1)]
+        self.add_output_variable(shape, dims)
+
+
+broadcast_function_template = 'nnet::broadcast_stream<{input_t}, {output_t}, {config}>({input}, {output});'
+broadcast_config_template = """struct config{index} : nnet::broadcast_config {{
+    static const unsigned in_width = {in_width};
+    static const unsigned in_height = {in_height};
+    static const unsigned in_chan = {in_chan};
+    static const unsigned out_width = {out_width};
+    static const unsigned out_height = {out_height};
+    static const unsigned out_chan = {out_chan};
+}};\n"""
+broadcast_include_list = ['nnet_utils/nnet_stream.h']
+
+
+class BroadcastConfigTemplate(LayerConfigTemplate):
+    def __init__(self):
+        super().__init__(Broadcast)
+        self.template = broadcast_config_template
+
+    def format(self, node):
+        params = self._default_config_params(node)
+        params['in_height'] = node.get_input_variable().shape[0]
+        params['in_width'] = node.get_input_variable().shape[1]
+        params['in_chan'] = node.get_input_variable().shape[2]
+        params['out_height'] = node.get_output_variable().shape[0]
+        params['out_width'] = node.get_output_variable().shape[1]
+        params['out_chan'] = node.get_output_variable().shape[2]
+
+        return self.template.format(**params)
+
+
+class BroadcastFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(Broadcast, include_header=broadcast_include_list)
+        self.template = broadcast_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        return self.template.format(**params)
+
+
+def register_broadcast_stream(backend):
+    # Register the layer types to the layer map
+    register_layer('Broadcast', Broadcast)
+
+    # Register the optimization passes
+    backend.register_pass('broadcast_stream', BroadcastStream)
+
+    # Register template passes
+    backend.register_template(BroadcastConfigTemplate)
+    backend.register_template(BroadcastFunctionTemplate)
+
+
+class BroadcastStream(OptimizerPass):
+    def match(self, node):
+        if isinstance(node, Merge) and not isinstance(node, Concatenate):
+            inp1 = node.get_input_variable(node.inputs[0])
+            inp2 = node.get_input_variable(node.inputs[1])
+            return inp1.shape != inp2.shape
+        else:
+            return False
+
+    def transform(self, model, node):
+        if model.config.backend.name not in ['Vivado'] or model.config.get_config_value('IOType') != 'io_stream':
+            return False
+
+        inp = [node.get_input_variable(inp_name) for inp_name in node.inputs]
+
+        if np.prod(inp[0].shape) > np.prod(inp[1].shape):
+            idx = 1
+            attrs = {'target_shape': inp[0].shape}
+        else:
+            idx = 0
+            attrs = {'target_shape': inp[1].shape}
+
+        def supported_broadcast(inp_shape, target_shape):
+            # Must be (H, W, C)
+            if not len(inp_shape) == 3:
+                return False
+            # Supported: (1, 1, C) -> (H, W, C)
+            if inp_shape[0] == inp_shape[1] == 1 and inp_shape[2] == target_shape[2]:
+                return True
+            # Supported: (H, W, 1) -> (H, W, C)
+            if inp_shape[2] == 1 and inp_shape[0] == target_shape[0] and inp_shape[1] == target_shape[1]:
+                return True
+            return False
+
+        brdcst_inp = node.inputs[idx]
+        inp_shape = node.get_input_variable(brdcst_inp).shape
+        target_shape = attrs['target_shape']
+        if not supported_broadcast(inp_shape, target_shape):
+            raise RuntimeError(
+                f'Unsupported broadcast type for stream: {inp_shape} -> {target_shape};'
+                + 'Only (1, 1, C) -> (H, W, C) and (H, W, 1) -> (H, W, C) currently supported'
+            )
+        brdcst_out = 'broadcast_' + brdcst_inp
+        brdcst_layer = model.make_node('Broadcast', brdcst_out, attrs, [brdcst_inp].copy())
+        model.insert_node(brdcst_layer, before=node, input_idx=idx)
+        node.inputs[idx] = brdcst_out
+
+        return True
diff --git a/hls4ml/backends/vivado_accelerator/supported_boards.json b/hls4ml/backends/vivado_accelerator/supported_boards.json
index e59f20cc18..1279ec22d0 100644
--- a/hls4ml/backends/vivado_accelerator/supported_boards.json
+++ b/hls4ml/backends/vivado_accelerator/supported_boards.json
@@ -1,42 +1,42 @@
-{
-    "pynq-z2": {
-        "part": "xc7z020clg400-1",
-        "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
-        "c_drivers": {}
-    },
-    "zcu102": {
-        "part": "xczu9eg-ffvb1156-2-e",
-        "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
-        "c_drivers": {}
-    },
-    "alveo-u50": {
-        "part": "xcu50-fsvh2104-2-e",
-        "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
-        "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
-        "c_drivers": {}
-    },
-    "alveo-u250": {
-        "part": "xcu250-figd2104-2L-e",
-        "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
-        "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
-        "c_drivers": {}
-    },
-    "alveo-u200": {
-        "part": "xcu200-fsgd2104-2-e",
-        "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
-        "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
-        "c_drivers": {}
-    },
-    "alveo-u280": {
-        "part": "xcu280-fsvh2892-2L-e",
-        "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
-        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
-        "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"},
-        "c_drivers": {}
-    }
-}
+{
+    "pynq-z2": {
+        "part": "xc7z020clg400-1",
+        "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"},
+        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+        "c_drivers": {}
+    },
+    "zcu102": {
+        "part": "xczu9eg-ffvb1156-2-e",
+        "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"},
+        "python_drivers": {"axi_stream": "axi_stream_driver.py"},
"c_drivers": {} + }, + "alveo-u50": { + "part": "xcu50-fsvh2104-2-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u250": { + "part": "xcu250-figd2104-2L-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u200": { + "part": "xcu200-fsgd2104-2-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + }, + "alveo-u280": { + "part": "xcu280-fsvh2892-2L-e", + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "krnl_rtl_srcs": {"axi_stream": "krnl_rtl_src"}, + "c_drivers": {} + } +} diff --git a/hls4ml/converters/keras/pooling.py b/hls4ml/converters/keras/pooling.py index b1030168b4..14d6a9236a 100644 --- a/hls4ml/converters/keras/pooling.py +++ b/hls4ml/converters/keras/pooling.py @@ -1,91 +1,91 @@ -from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer -from hls4ml.converters.utils import compute_padding_1d, compute_padding_2d, parse_data_format - -pooling_layers = ['MaxPooling1D', 'MaxPooling2D', 'AveragePooling1D', 'AveragePooling2D'] - - -@keras_handler(*pooling_layers) -def parse_pooling_layer(keras_layer, input_names, input_shapes, data_reader): - assert 'Pooling' in keras_layer['class_name'] - - layer = parse_default_keras_layer(keras_layer, input_names) - - if int(layer['class_name'][-2]) == 1: - (layer['n_in'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) - - layer['pool_width'] = keras_layer['config']['pool_size'][0] - layer['stride_width'] = keras_layer['config']['strides'][0] - - (layer['n_out'], layer['pad_left'], layer['pad_right']) = compute_padding_1d( - keras_layer['config']['padding'], layer['n_in'], layer['stride_width'], layer['pool_width'] - ) - - if layer['data_format'] == 'channels_last': - output_shape = [input_shapes[0][0], layer['n_out'], layer['n_filt']] - elif layer['data_format'] == 'channels_first': - output_shape = [input_shapes[0][0], layer['n_filt'], layer['n_out']] - elif int(layer['class_name'][-2]) == 2: - (layer['in_height'], layer['in_width'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) - - layer['stride_height'] = keras_layer['config']['strides'][0] - layer['stride_width'] = keras_layer['config']['strides'][1] - layer['pool_height'] = keras_layer['config']['pool_size'][0] - layer['pool_width'] = keras_layer['config']['pool_size'][1] - - ( - layer['out_height'], - layer['out_width'], - layer['pad_top'], - layer['pad_bottom'], - layer['pad_left'], - layer['pad_right'], - ) = compute_padding_2d( - keras_layer['config']['padding'], - layer['in_height'], - layer['in_width'], - layer['stride_height'], - layer['stride_width'], - layer['pool_height'], - layer['pool_width'], - ) - - if layer['data_format'] == 'channels_last': - output_shape = [input_shapes[0][0], layer['out_height'], layer['out_width'], layer['n_filt']] - elif layer['data_format'] == 'channels_first': - output_shape = [input_shapes[0][0], layer['n_filt'], layer['out_height'], layer['out_width']] - - return layer, output_shape - - -global_pooling_layers = ['GlobalMaxPooling1D', 'GlobalMaxPooling2D', 
'GlobalAveragePooling1D', 'GlobalAveragePooling2D'] - - -@keras_handler(*global_pooling_layers) -def parse_global_pooling_layer(keras_layer, input_names, input_shapes, data_reader): - assert 'Pooling' in keras_layer['class_name'] - - layer = parse_default_keras_layer(keras_layer, input_names) - layer['keepdims'] = keras_layer['config']['keepdims'] - - if int(layer['class_name'][-2]) == 1: - (layer['n_in'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) - - if layer['keepdims']: - if layer['data_format'] == 'channels_last': - output_shape = [input_shapes[0][0], 1, layer['n_filt']] - elif layer['data_format'] == 'channels_first': - output_shape = [input_shapes[0][0], layer['n_filt'], 1] - else: - output_shape = [input_shapes[0][0], layer['n_filt']] - elif int(layer['class_name'][-2]) == 2: - (layer['in_height'], layer['in_width'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) - - if layer['keepdims']: - if layer['data_format'] == 'channels_last': - output_shape = [input_shapes[0][0], 1, 1, layer['n_filt']] - elif layer['data_format'] == 'channels_first': - output_shape = [input_shapes[0][0], layer['n_filt'], 1, 1] - else: - output_shape = [input_shapes[0][0], layer['n_filt']] - - return layer, output_shape +from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer +from hls4ml.converters.utils import compute_padding_1d, compute_padding_2d, parse_data_format + +pooling_layers = ['MaxPooling1D', 'MaxPooling2D', 'AveragePooling1D', 'AveragePooling2D'] + + +@keras_handler(*pooling_layers) +def parse_pooling_layer(keras_layer, input_names, input_shapes, data_reader): + assert 'Pooling' in keras_layer['class_name'] + + layer = parse_default_keras_layer(keras_layer, input_names) + + if int(layer['class_name'][-2]) == 1: + (layer['n_in'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) + + layer['pool_width'] = keras_layer['config']['pool_size'][0] + layer['stride_width'] = keras_layer['config']['strides'][0] + + (layer['n_out'], layer['pad_left'], layer['pad_right']) = compute_padding_1d( + keras_layer['config']['padding'], layer['n_in'], layer['stride_width'], layer['pool_width'] + ) + + if layer['data_format'] == 'channels_last': + output_shape = [input_shapes[0][0], layer['n_out'], layer['n_filt']] + elif layer['data_format'] == 'channels_first': + output_shape = [input_shapes[0][0], layer['n_filt'], layer['n_out']] + elif int(layer['class_name'][-2]) == 2: + (layer['in_height'], layer['in_width'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) + + layer['stride_height'] = keras_layer['config']['strides'][0] + layer['stride_width'] = keras_layer['config']['strides'][1] + layer['pool_height'] = keras_layer['config']['pool_size'][0] + layer['pool_width'] = keras_layer['config']['pool_size'][1] + + ( + layer['out_height'], + layer['out_width'], + layer['pad_top'], + layer['pad_bottom'], + layer['pad_left'], + layer['pad_right'], + ) = compute_padding_2d( + keras_layer['config']['padding'], + layer['in_height'], + layer['in_width'], + layer['stride_height'], + layer['stride_width'], + layer['pool_height'], + layer['pool_width'], + ) + + if layer['data_format'] == 'channels_last': + output_shape = [input_shapes[0][0], layer['out_height'], layer['out_width'], layer['n_filt']] + elif layer['data_format'] == 'channels_first': + output_shape = [input_shapes[0][0], layer['n_filt'], layer['out_height'], layer['out_width']] + + return layer, output_shape + + 
+global_pooling_layers = ['GlobalMaxPooling1D', 'GlobalMaxPooling2D', 'GlobalAveragePooling1D', 'GlobalAveragePooling2D'] + + +@keras_handler(*global_pooling_layers) +def parse_global_pooling_layer(keras_layer, input_names, input_shapes, data_reader): + assert 'Pooling' in keras_layer['class_name'] + + layer = parse_default_keras_layer(keras_layer, input_names) + layer['keepdims'] = keras_layer['config']['keepdims'] + + if int(layer['class_name'][-2]) == 1: + (layer['n_in'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) + + if layer['keepdims']: + if layer['data_format'] == 'channels_last': + output_shape = [input_shapes[0][0], 1, layer['n_filt']] + elif layer['data_format'] == 'channels_first': + output_shape = [input_shapes[0][0], layer['n_filt'], 1] + else: + output_shape = [input_shapes[0][0], layer['n_filt']] + elif int(layer['class_name'][-2]) == 2: + (layer['in_height'], layer['in_width'], layer['n_filt']) = parse_data_format(input_shapes[0], layer['data_format']) + + if layer['keepdims']: + if layer['data_format'] == 'channels_last': + output_shape = [input_shapes[0][0], 1, 1, layer['n_filt']] + elif layer['data_format'] == 'channels_first': + output_shape = [input_shapes[0][0], layer['n_filt'], 1, 1] + else: + output_shape = [input_shapes[0][0], layer['n_filt']] + + return layer, output_shape diff --git a/hls4ml/templates/quartus/ac_types/ac_channel.h b/hls4ml/templates/quartus/ac_types/ac_channel.h index 96ff514ce4..62e0542736 100644 --- a/hls4ml/templates/quartus/ac_types/ac_channel.h +++ b/hls4ml/templates/quartus/ac_types/ac_channel.h @@ -1,555 +1,555 @@ -/************************************************************************** - * * - * Algorithmic C (tm) Datatypes * - * * - * Software Version: 4.0 * - * * - * Release Date : Sat Jun 13 12:35:18 PDT 2020 * - * Release Type : Production Release * - * Release Build : 4.0.0 * - * * - * Copyright 2004-2020, Mentor Graphics Corporation, * - * * - * All Rights Reserved. * - * * - ************************************************************************** - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * - * implied. * - * See the License for the specific language governing permissions and * - * limitations under the License. * - ************************************************************************** - * * - * The most recent version of this package is available at github. * - * * - *************************************************************************/ - -/* -// Source: ac_channel.h -// Description: templatized channel communication class -// Author: Andres Takach, Ph.D. 
-*/
-
-#ifndef __AC_CHANNEL_H
-#define __AC_CHANNEL_H
-
-#ifndef __cplusplus
-# error C++ is required to include this header file
-#endif
-
-#include <deque>
-#include <iostream>
-
-#if !defined(AC_USER_DEFINED_ASSERT) && !defined(AC_ASSERT_THROW_EXCEPTION)
-# include <assert.h>
-#endif
-
-// not directly used by this include
-#include <ac_int.h>
-#include <ac_fixed.h>
-
-// Macro Definitions (obsolete - provided here for backward compatibility)
-#define AC_CHAN_CTOR(varname) varname
-#define AC_CHAN_CTOR_INIT(varname,init) varname(init)
-#define AC_CHAN_CTOR_VAL(varname,init,val) varname(init,val)
-
-////////////////////////////////////////////////
-// Struct: ac_exception / ac_channel_exception
-////////////////////////////////////////////////
-
-#ifndef __INCLUDED_AC_EXCEPTION
-# define __INCLUDED_AC_EXCEPTION
-struct ac_exception {
-  const char *const file;
-  const unsigned int line;
-  const int code;
-  const char *const msg;
-  ac_exception(const char *file_, const unsigned int &line_, const int &code_, const char *msg_)
-    : file(file_), line(line_), code(code_), msg(msg_) { }
-};
-#endif
-
-struct ac_channel_exception {
-  enum { code_begin = 1024 };
-  enum code {
-    read_from_empty_channel = code_begin,
-    fifo_not_empty_when_reset,
-    no_operator_sb_defined_for_channel_type,
-    no_insert_defined_for_channel_type,
-    no_size_in_connections,
-    no_num_free_in_connections,
-    no_output_empty_in_connections
-  };
-  static inline const char *msg(const code &code_) {
-    static const char *const s[] = {
-      "Read from empty channel",
-      "fifo not empty when reset",
-      "No operator[] defined for channel type",
-      "No insert defined for channel type",
-      "Connections does not support size()",
-      "Connections does not support num_free()",
-      "Connections::Out does not support empty()"
-    };
-    return s[code_-code_begin];
-  }
-};
-
-///////////////////////////////////////////
-// Class: ac_channel
-//////////////////////////////////////////
-
-template <class T>
-class ac_channel {
-public:
-  typedef T element_type;
-
-  // constructors
-  ac_channel();
-  ac_channel(int init);
-  ac_channel(int init, T val);
-
-  T read() { return chan.read(); }
-  void read(T& t) { t = read(); }
-  bool nb_read(T& t) { return chan.nb_read(t); }
-
-  void write(const T& t) { chan.write(t); }
-  bool nb_write(T& t) {
-    chan.incr_size_call_count();
-    return chan.nb_write(t);
-  }
-
-  unsigned int size() {
-    chan.incr_size_call_count();
-    return chan.size();
-  }
-  bool empty() { return chan.empty(); }
-
-  // Return true if channel has at least k entries
-  bool available(unsigned int k) const { return chan.available(k); }
-
-  void reset() { chan.reset(); }
-
-  unsigned int debug_size() const { return chan.size(); }
-
-  const T &operator[](unsigned int pos) const { return chan[pos]; }
-
-  int get_size_call_count() { return chan.get_size_call_count(); }
-
-#ifdef SYSTEMC_INCLUDED
-  void bind(sc_fifo_in<T> &f) { chan.bind(f); }
-  void bind(sc_fifo_out<T> &f) { chan.bind(f); }
-#endif
-
-#ifdef __CONNECTIONS__CONNECTIONS_H__
-  void bind(Connections::Out<T>& c) { chan.bind(c); }
-  void bind(Connections::In<T>& c) { chan.bind(c); }
-  void bind(Connections::SyncIn &c) { chan.bind(c); }
-  void bind(Connections::SyncOut &c) { chan.bind(c); }
-#endif
-
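-  // The assert plumbing below is configurable: define AC_USER_DEFINED_ASSERT
-  // to route failed checks to a user callback, or AC_ASSERT_THROW_EXCEPTION to
-  // throw an ac_exception instead of calling assert(0).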
-private:
-# ifndef AC_CHANNEL_ASSERT
-# define AC_CHANNEL_ASSERT(cond, code) ac_assert(cond, __FILE__, __LINE__, code)
-  static inline void ac_assert(bool condition, const char *file, int line, const ac_channel_exception::code &code) {
-# ifndef AC_USER_DEFINED_ASSERT
-    if(!condition) {
-      const ac_exception e(file, line, code, ac_channel_exception::msg(code));
-# ifdef AC_ASSERT_THROW_EXCEPTION
-# ifdef AC_ASSERT_THROW_EXCEPTION_AS_CONST_CHAR
-      throw(e.msg);
-# else
-      throw(e);
-# endif
-# else
-      std::cerr << "Assert";
-      if(e.file)
-        std::cerr << " in file " << e.file << ":" << e.line;
-      std::cerr << " " << e.msg << std::endl;
-      assert(0);
-# endif
-    }
-# else
-    AC_USER_DEFINED_ASSERT(condition, file, line, ac_channel_exception::msg(code));
-# endif
-  }
-# else
-# error "private use only - AC_CHANNEL_ASSERT macro already defined"
-# endif
-
-public:
-  class fifo {
-    enum fifo_type {
-      fifo_ac_channel_type,
-      fifo_sc_fifo_type,
-      fifo_connections_type,
-      fifo_connections_sync_type
-    };
-
-    struct fifo_abstract {
-      virtual ~fifo_abstract() {}
-      virtual fifo_type get_fifo_type() const = 0;
-      virtual T read() = 0;
-      virtual bool nb_read(T& t) = 0;
-      virtual void write(const T& t) = 0;
-      virtual bool nb_write(T& t) = 0;
-      virtual bool empty() = 0;
-      virtual bool available(unsigned int k) const = 0;
-      virtual unsigned int size() const = 0;
-      virtual unsigned int num_free() const = 0;
-      virtual void reset() = 0;
-      virtual const T &operator_sb(const unsigned int &pos, const T &default_value) const = 0;
-    };
-
-    struct fifo_ac_channel : fifo_abstract {
-      std::deque<T> ch;
-
-      ~fifo_ac_channel() {}
-
-      static inline fifo_type ftype() { return fifo_ac_channel_type; }
-
-      fifo_type get_fifo_type() const { return ftype(); }
-
-      T read() {
-        {
-          // If you hit this assert you attempted a read on an empty channel. Perhaps
-          // you need to guard the execution of the read with a call to the available()
-          // function:
-          //    if (myInputChan.available(2)) {
-          //      // it is safe to read two values
-          //      cout << myInputChan.read();
-          //      cout << myInputChan.read();
-          //    }
-          AC_CHANNEL_ASSERT(!empty(), ac_channel_exception::read_from_empty_channel);
-        }
-        T t = ch.front();
-        ch.pop_front();
-        return t;
-      }
-      bool nb_read(T& t) { return empty() ? false : (t = read(), true); }
-
-      void write(const T& t) { ch.push_back(t); }
-      bool nb_write(T& t) { return !num_free() ? false : (write(t), true); }
-
-      bool empty() { return size() == 0; }
-      bool available(unsigned int k) const { return size() >= k; }
-      unsigned int size() const { return (int)ch.size(); }
-      unsigned int num_free() const { return ch.max_size() - ch.size(); }
-
-      void reset() { ch.clear(); }
-
-      const T &operator_sb(const unsigned int &pos, const T &) const {
-        return ch[pos];
-      }
-    };
-
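-    // The backends below implement the same fifo_abstract interface:
-    // fifo_ac_channel buffers values in a std::deque for plain C++ simulation,
-    // fifo_sc_fifo forwards to bound SystemC sc_fifo ports, and the
-    // Connections variants bridge to Matchlib channels.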
-#ifdef SYSTEMC_INCLUDED
-    struct fifo_sc_fifo : fifo_abstract {
-      sc_fifo_in<T> *fifo_in;
-      sc_fifo_out<T> *fifo_out;
-
-      ~fifo_sc_fifo() {}
-
-      static inline fifo_type ftype() { return fifo_sc_fifo_type; }
-
-      fifo_type get_fifo_type() const { return ftype(); }
-
-      T read() { return fifo_in->read(); }
-      bool nb_read(T& t) { return empty() ? false : (t = read(), true); }
-
-      void write(const T& t) { fifo_out->write(t); }
-      bool nb_write(T& t) { return !num_free() ? false : (write(t), true); }
-
-      bool empty() { return size() == 0; }
-      bool available(unsigned int k) const { return size() >= k; }
-      unsigned int size() const { return fifo_in->num_available(); }
-      unsigned int num_free() const { return fifo_out->num_free(); }
-
-      void reset() {
-        AC_CHANNEL_ASSERT(empty(), ac_channel_exception::fifo_not_empty_when_reset);
-      }
-
-      const T &operator_sb(const unsigned int &, const T &default_value) const {
-        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_operator_sb_defined_for_channel_type);
-        return default_value;
-      }
-    };
-  public:
-    void bind(sc_fifo_in<T> &f) { get_fifo<fifo_sc_fifo>().fifo_in = &f; }
-    void bind(sc_fifo_out<T> &f) { get_fifo<fifo_sc_fifo>().fifo_out = &f; }
-  private:
-#endif
-
-#ifdef __CONNECTIONS__CONNECTIONS_H__
-    struct fifo_connections : fifo_abstract {
-      Connections::In<T> *fifo_in;
-      Connections::Out<T> *fifo_out;
-
-      ~fifo_connections() {}
-      static inline fifo_type ftype() { return fifo_connections_type; }
-      fifo_type get_fifo_type() const { return ftype(); }
-
-      T read() { return fifo_in->Pop(); }
-      bool nb_read(T& t) { return fifo_in->PopNB(t); }
-
-      void write(const T& t) { fifo_out->Push(t); }
-      bool nb_write(T& t) { return fifo_out->PushNB(t); }
-
-      bool empty() {
-        if (fifo_in)
-          return fifo_in->Empty();
-        else
-          AC_CHANNEL_ASSERT(0, ac_channel_exception::no_output_empty_in_connections);
-        return false;
-      }
-      bool available(unsigned int k) const { return true; }
-      unsigned int size() const {
-        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_size_in_connections);
-        return 0;
-      }
-      unsigned int num_free() const {
-        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_num_free_in_connections);
-        return 0;
-      }
-
-      void reset() {
-        AC_CHANNEL_ASSERT(empty(), ac_channel_exception::fifo_not_empty_when_reset);
-      }
-
-      const T &operator_sb(const unsigned int &, const T &default_value) const {
-        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_operator_sb_defined_for_channel_type);
-        return default_value;
-      }
-    };
-
-    struct fifo_connections_sync : fifo_abstract {
-      Connections::SyncIn *sync_in;
-      Connections::SyncOut *sync_out;
-
-      ~fifo_connections_sync() {}
-      static inline fifo_type ftype() { return fifo_connections_sync_type; }
-      fifo_type get_fifo_type() const { return ftype(); }
-
-      bool read() { sync_in->sync_in(); return true; }
-      bool nb_read(T& t) { t=true; return(sync_in->nb_sync_in()); }
-
-      void write(const T& t) { sync_out->sync_out(); }
-      bool nb_write(T& t) { sync_out->sync_out(); return true; }
-
-      bool empty() {
-        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_output_empty_in_connections);
-        return(false);
-      }
-      bool available(unsigned int k) const { return true; }
-      unsigned int size() const {
-        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_size_in_connections);
-        return 0;
-      }
-      unsigned int num_free() const {
-        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_num_free_in_connections);
-        return 0;
-      }
-      void reset() {
-        if (sync_in) sync_in->reset_sync_in();
-        if (sync_out) sync_out->reset_sync_out();
-      }
-      const T &operator_sb(const unsigned int &, const T &default_value) const {
-        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_operator_sb_defined_for_channel_type);
-        return default_value;
-      }
-    };
-
-
-  public:
-    void bind(Connections::In<T>& c) { get_fifo<fifo_connections>().fifo_in = &c; }
-    void bind(Connections::Out<T>& c) { get_fifo<fifo_connections>().fifo_out = &c; }
-
-    void bind(Connections::SyncIn &c) { get_fifo<fifo_connections_sync>().sync_in = &c; }
-    void bind(Connections::SyncOut &c) { get_fifo<fifo_connections_sync>().sync_out = &c; }
-
-  private:
-#endif
-
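-    // get_fifo<fifo_T>() lazily creates the requested backend on first use and
-    // replaces it when a different backend is bound; the swap asserts that the
-    // old fifo is empty so queued data cannot be silently dropped.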
-        if (f) {
-          AC_CHANNEL_ASSERT(f->empty(), ac_channel_exception::fifo_not_empty_when_reset);
-          delete f;
-        }
-        f = new fifo_T;
-      }
-      return static_cast<fifo_T&>(*f);
-    }
-
-    fifo_abstract *f;
-    unsigned int rSz;   // reset size
-    T rVal;             // resetValue
-    int size_call_count;
-
-  public:
-    fifo() : f(0), rSz(0), size_call_count(0) { get_fifo<fifo_ac_channel>(); }
-    fifo(int init) : f(0), rSz(init), size_call_count(0) { get_fifo<fifo_ac_channel>(); }
-    fifo(int init, T val) : f(0), rSz(init), rVal(val), size_call_count(0) { get_fifo<fifo_ac_channel>(); }
-    ~fifo() { delete f; }
-
-    inline T read() { return f->read(); }
-    inline bool nb_read(T& t) { return f->nb_read(t); }
-
-    inline void write(const T& t) { f->write(t); }
-    inline bool nb_write(T& t) { return f->nb_write(t); }
-
-    inline bool empty() { return f->empty(); }
-    inline bool available(unsigned int k) const { return f->available(k); }
-    inline unsigned int size() const { return f->size(); }
-    inline unsigned int num_free() const { return f->num_free(); }
-
-    inline void reset() {
-      f->reset();
-      for (int i=0; i<(int)rSz; i++)
-        write(rVal);
-    }
-
-    inline const T &operator[](unsigned int pos) const { return f->operator_sb(pos, rVal); }
-
-    void incr_size_call_count() { ++size_call_count; }
-    int get_size_call_count() {
-      int tmp=size_call_count;
-      size_call_count=0;
-      return tmp;
-    }
-
-    // obsolete - provided here for backward compatibility with ac_channel
-    struct iterator {
-      iterator operator+(unsigned int pos_) const {
-        return iterator(itr, pos_);
-      }
-    private:
-      friend class fifo;
-      iterator(const typename std::deque<T>::iterator &itr_, unsigned int pos=0)
-        : itr(itr_) { if (pos) itr += pos; }
-      typename std::deque<T>::iterator itr;
-    };
-    iterator begin() {
-      AC_CHANNEL_ASSERT(f->get_fifo_type() == fifo_ac_channel_type, ac_channel_exception::no_insert_defined_for_channel_type);
-      return iterator(get_fifo<fifo_ac_channel>().ch.begin());
-    }
-    void insert(iterator itr, const T& t) {
-      AC_CHANNEL_ASSERT(f->get_fifo_type() == fifo_ac_channel_type, ac_channel_exception::no_insert_defined_for_channel_type);
-      get_fifo<fifo_ac_channel>().ch.insert(itr.itr,t);
-    }
-  };
-  fifo chan;
-
-private:
-  // Prevent the compiler from autogenerating these.
-  // (This enforces that channels are always passed by reference.)
-  ac_channel(const ac_channel< T >&);
-  ac_channel& operator=(const ac_channel< T >&);
-};
-
-template <class T>
-ac_channel<T>::ac_channel() : chan() {}
-
-template <class T>
-ac_channel<T>::ac_channel(int init) : chan(init)
-{
-  for (int i=init; i>0; i--) {
-    T dc;
-    write(dc);
-  }
-}
-
-template <class T>
-ac_channel<T>::ac_channel(int init, T val) : chan(init, val)
-{
-  for (int i=init; i>0; i--)
-    write(val);
-}
-
-template <class T>
-inline std::ostream& operator<< (std::ostream& os, ac_channel<T> &a)
-{
-  for (unsigned int i=0; i<a.size(); i++) {
-    if (i > 0) os << " ";
-    os << a[i];
-  }
-  return os;
-}
-
-// This general case is meant to cover non channel (or array of them) args
-// Its result will be ignored
-template <class T>
-bool nb_read_chan_rdy(T &x) { return true; }
-
-template <class T>
-bool nb_read_chan_rdy(ac_channel<T> &chan) { return !chan.empty(); }
-
-template <class T, int N>
-bool nb_read_chan_rdy(ac_channel<T> (&chan)[N] ) {
-  bool r = true;
-  for(int i=0; i<N; i++)
-    r &= !chan[i].empty();
-  return r;
-}
-
-#if __cplusplus > 199711L
-template <class... Args>
-bool nb_read_chan_rdy(Args&... args) {
-  const int n_args = sizeof...(args);
-  // only every other arg is a channel (or an array of channels)
-  bool rdy[n_args] = { (nb_read_chan_rdy(args))... };
-  bool r = true;
-  for(int i=0; i < n_args; i+=2)
-    r &= rdy[i];
-  return r;
-}
-#endif
-
-template <class T>
-void nb_read_r(ac_channel<T> &chan, T &var) {
-  chan.nb_read(var);
-}
-
-template <class T, int N>
-void nb_read_r(ac_channel<T> (&chan)[N], T (&var)[N]) {
-  for(int i=0; i<N; i++)
-    chan[i].nb_read(var[i]);
-}
-
-#if __cplusplus > 199711L
-template <class T, class... Args>
-void nb_read_r(ac_channel<T> &chan, T &var, Args&... args) {
-  chan.nb_read(var);
-  nb_read_r(args...);
-}
-
-template <class T, int N, class... Args>
-void nb_read_r(ac_channel<T> (&chan)[N], T (&var)[N], Args&... args) {
-  for(int i=0; i<N; i++)
-    chan[i].nb_read(var[i]);
-  nb_read_r(args...);
-}
-
-template <class... Args>
-bool nb_read_join(Args&... args) {
-  if(nb_read_chan_rdy(args...)) {
-    nb_read_r(args...);
-    return true;
-  }
-  return false;
-}
-#endif
-
-/* undo macro adjustments */
-#ifdef AC_CHANNEL_ASSERT
-# undef AC_CHANNEL_ASSERT
-#endif
-
-#endif
+/**************************************************************************
+ *                                                                        *
+ *  Algorithmic C (tm) Datatypes                                          *
+ *                                                                        *
+ *  Software Version: 4.0                                                 *
+ *                                                                        *
+ *  Release Date    : Sat Jun 13 12:35:18 PDT 2020                        *
+ *  Release Type    : Production Release                                  *
+ *  Release Build   : 4.0.0                                               *
+ *                                                                        *
+ *  Copyright 2004-2020, Mentor Graphics Corporation,                     *
+ *                                                                        *
+ *  All Rights Reserved.                                                  *
+ *                                                                        *
+ **************************************************************************
+ *  Licensed under the Apache License, Version 2.0 (the "License");       *
+ *  you may not use this file except in compliance with the License.      *
+ *  You may obtain a copy of the License at                               *
+ *                                                                        *
+ *      http://www.apache.org/licenses/LICENSE-2.0                        *
+ *                                                                        *
+ *  Unless required by applicable law or agreed to in writing, software   *
+ *  distributed under the License is distributed on an "AS IS" BASIS,     *
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *
+ *  implied.                                                              *
+ *  See the License for the specific language governing permissions and   *
+ *  limitations under the License.                                        *
+ **************************************************************************
+ *                                                                        *
+ *  The most recent version of this package is available at github.       *
+ *                                                                        *
+ *************************************************************************/
+
+/*
+// Source:      ac_channel.h
+// Description: templatized channel communication class
+// Author:      Andres Takach, Ph.D.
+*/
+
+#ifndef __AC_CHANNEL_H
+#define __AC_CHANNEL_H
+
+#ifndef __cplusplus
+# error C++ is required to include this header file
+#endif
+
+#include <deque>
+#include <iostream>
+
+#if !defined(AC_USER_DEFINED_ASSERT) && !defined(AC_ASSERT_THROW_EXCEPTION)
+# include <cassert>
+#endif
+
+// not directly used by this include
+#include <stdio.h>
+#include <stdlib.h>
+
+// Macro Definitions (obsolete - provided here for backward compatibility)
+#define AC_CHAN_CTOR(varname) varname
+#define AC_CHAN_CTOR_INIT(varname,init) varname(init)
+#define AC_CHAN_CTOR_VAL(varname,init,val) varname(init,val)
+
+////////////////////////////////////////////////
+// Struct: ac_exception / ac_channel_exception
+////////////////////////////////////////////////
+
+#ifndef __INCLUDED_AC_EXCEPTION
+# define __INCLUDED_AC_EXCEPTION
+struct ac_exception {
+  const char *const file;
+  const unsigned int line;
+  const int code;
+  const char *const msg;
+  ac_exception(const char *file_, const unsigned int &line_, const int &code_, const char *msg_)
+    : file(file_), line(line_), code(code_), msg(msg_) { }
+};
+#endif
+
+struct ac_channel_exception {
+  enum { code_begin = 1024 };
+  enum code {
+    read_from_empty_channel = code_begin,
+    fifo_not_empty_when_reset,
+    no_operator_sb_defined_for_channel_type,
+    no_insert_defined_for_channel_type,
+    no_size_in_connections,
+    no_num_free_in_connections,
+    no_output_empty_in_connections
+  };
+  static inline const char *msg(const code &code_) {
+    static const char *const s[] = {
+      "Read from empty channel",
+      "fifo not empty when reset",
+      "No operator[] defined for channel type",
+      "No insert defined for channel type",
+      "Connections does not support size()",
+      "Connections does not support num_free()",
+      "Connections::Out does not support empty()"
+    };
+    return s[code_-code_begin];
+  }
+};
+
+///////////////////////////////////////////
+// Class: ac_channel
+//////////////////////////////////////////
+
+template <class T>
+class ac_channel {
+public:
+  typedef T element_type;
+
+  // constructors
+  ac_channel();
+  ac_channel(int init);
+  ac_channel(int init, T val);
+
+  T read() { return chan.read(); }
+  void read(T& t) { t = read(); }
+  bool nb_read(T& t) { return chan.nb_read(t); }
+
+  void write(const T& t) { chan.write(t); }
+  bool nb_write(T& t) {
+    chan.incr_size_call_count();
+    return chan.nb_write(t);
+  }
+
+  unsigned int size() {
+    chan.incr_size_call_count();
+    return chan.size();
+  }
+  bool empty() { return chan.empty(); }
+
+  // Return true if channel has at least k entries
+  bool available(unsigned int k) const { return chan.available(k); }
+
+  void reset() { chan.reset(); }
+
+  unsigned int debug_size() const { return chan.size(); }
+
+  const T &operator[](unsigned int pos) const { return chan[pos]; }
+
+  int get_size_call_count() { return chan.get_size_call_count(); }
+
+#ifdef SYSTEMC_INCLUDED
+  void bind(sc_fifo_in<T> &f) { chan.bind(f); }
+  void bind(sc_fifo_out<T> &f) { chan.bind(f); }
+#endif
+
+#ifdef __CONNECTIONS__CONNECTIONS_H__
+  void bind(Connections::Out<T>& c) { chan.bind(c); }
+  void bind(Connections::In<T>& c) { chan.bind(c); }
+  void bind(Connections::SyncIn &c) { chan.bind(c); }
+  void bind(Connections::SyncOut &c) { chan.bind(c); }
+#endif
+
+private:
+# ifndef AC_CHANNEL_ASSERT
+# define AC_CHANNEL_ASSERT(cond, code) ac_assert(cond, __FILE__, __LINE__, code)
+  static inline void ac_assert(bool condition, const char *file, int line, const ac_channel_exception::code &code) {
+# ifndef AC_USER_DEFINED_ASSERT
+    if(!condition) {
+      const ac_exception e(file, line, code, ac_channel_exception::msg(code));
+# ifdef AC_ASSERT_THROW_EXCEPTION
+# ifdef AC_ASSERT_THROW_EXCEPTION_AS_CONST_CHAR
+      throw(e.msg);
+# else
+      throw(e);
+# endif
+# else
+      std::cerr << "Assert";
+      if(e.file)
+        std::cerr << " in file " << e.file << ":" << e.line;
+      std::cerr << " " << e.msg << std::endl;
+      assert(0);
+# endif
+    }
+# else
+    AC_USER_DEFINED_ASSERT(condition, file, line, ac_channel_exception::msg(code));
+# endif
+  }
+# else
+# error "private use only - AC_CHANNEL_ASSERT macro already defined"
+# endif
+
+public:
+  class fifo {
+    enum fifo_type {
+      fifo_ac_channel_type,
+      fifo_sc_fifo_type,
+      fifo_connections_type,
+      fifo_connections_sync_type
+    };
+
+    struct fifo_abstract {
+      virtual ~fifo_abstract() {}
+      virtual fifo_type get_fifo_type() const = 0;
+      virtual T read() = 0;
+      virtual bool nb_read(T& t) = 0;
+      virtual void write(const T& t) = 0;
+      virtual bool nb_write(T& t) = 0;
+      virtual bool empty() = 0;
+      virtual bool available(unsigned int k) const = 0;
+      virtual unsigned int size() const = 0;
+      virtual unsigned int num_free() const = 0;
+      virtual void reset() = 0;
+      virtual const T &operator_sb(const unsigned int &pos, const T &default_value) const = 0;
+    };
+
+    struct fifo_ac_channel : fifo_abstract {
+      std::deque<T> ch;
+
+      ~fifo_ac_channel() {}
+
+      static inline fifo_type ftype() { return fifo_ac_channel_type; }
+
+      fifo_type get_fifo_type() const { return ftype(); }
+
+      T read() {
+        {
+          // If you hit this assert you attempted a read on an empty channel. Perhaps
+          // you need to guard the execution of the read with a call to the available()
+          // function:
+          //   if (myInputChan.available(2)) {
+          //     // it is safe to read two values
+          //     cout << myInputChan.read();
+          //     cout << myInputChan.read();
+          //   }
+          AC_CHANNEL_ASSERT(!empty(), ac_channel_exception::read_from_empty_channel);
+        }
+        T t = ch.front();
+        ch.pop_front();
+        return t;
+      }
+      bool nb_read(T& t) { return empty() ? false : (t = read(), true); }
+
+      void write(const T& t) { ch.push_back(t); }
+      bool nb_write(T& t) { return !num_free() ? false : (write(t), true); }
+
+      bool empty() { return size() == 0; }
+      bool available(unsigned int k) const { return size() >= k; }
+      unsigned int size() const { return (int)ch.size(); }
+      unsigned int num_free() const { return ch.max_size() - ch.size(); }
+
+      void reset() { ch.clear(); }
+
+      const T &operator_sb(const unsigned int &pos, const T &) const {
+        return ch[pos];
+      }
+    };
+
+#ifdef SYSTEMC_INCLUDED
+    struct fifo_sc_fifo : fifo_abstract {
+      sc_fifo_in<T> *fifo_in;
+      sc_fifo_out<T> *fifo_out;
+
+      ~fifo_sc_fifo() {}
+
+      static inline fifo_type ftype() { return fifo_sc_fifo_type; }
+
+      fifo_type get_fifo_type() const { return ftype(); }
+
+      T read() { return fifo_in->read(); }
+      bool nb_read(T& t) { return empty() ? false : (t = read(), true); }
+
+      void write(const T& t) { fifo_out->write(t); }
+      bool nb_write(T& t) { return !num_free() ? false : (write(t), true); }
+
+      bool empty() { return size() == 0; }
+      bool available(unsigned int k) const { return size() >= k; }
+      unsigned int size() const { return fifo_in->num_available(); }
+      unsigned int num_free() const { return fifo_out->num_free(); }
+
+      void reset() {
+        AC_CHANNEL_ASSERT(empty(), ac_channel_exception::fifo_not_empty_when_reset);
+      }
+
+      const T &operator_sb(const unsigned int &, const T &default_value) const {
+        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_operator_sb_defined_for_channel_type);
+        return default_value;
+      }
+    };
+public:
+    void bind(sc_fifo_in<T> &f) { get_fifo<fifo_sc_fifo>().fifo_in = &f; }
+    void bind(sc_fifo_out<T> &f) { get_fifo<fifo_sc_fifo>().fifo_out = &f; }
+private:
+#endif
+
+#ifdef __CONNECTIONS__CONNECTIONS_H__
+    struct fifo_connections : fifo_abstract {
+      Connections::In<T> *fifo_in;
+      Connections::Out<T> *fifo_out;
+
+      ~fifo_connections() {}
+      static inline fifo_type ftype() { return fifo_connections_type; }
+      fifo_type get_fifo_type() const { return ftype(); }
+
+      T read() { return fifo_in->Pop(); }
+      bool nb_read(T& t) { return fifo_in->PopNB(t); }
+
+      void write(const T& t) { fifo_out->Push(t); }
+      bool nb_write(T& t) { return fifo_out->PushNB(t); }
+
+      bool empty() {
+        if (fifo_in)
+          return fifo_in->Empty();
+        else
+          AC_CHANNEL_ASSERT(0, ac_channel_exception::no_output_empty_in_connections);
+        return false;
+      }
+      bool available(unsigned int k) const { return true; }
+      unsigned int size() const {
+        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_size_in_connections);
+        return 0;
+      }
+      unsigned int num_free() const {
+        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_num_free_in_connections);
+        return 0;
+      }
+
+      void reset() {
+        AC_CHANNEL_ASSERT(empty(), ac_channel_exception::fifo_not_empty_when_reset);
+      }
+
+      const T &operator_sb(const unsigned int &, const T &default_value) const {
+        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_operator_sb_defined_for_channel_type);
+        return default_value;
+      }
+    };
+
+    struct fifo_connections_sync : fifo_abstract {
+      Connections::SyncIn *sync_in;
+      Connections::SyncOut *sync_out;
+
+      ~fifo_connections_sync() {}
+      static inline fifo_type ftype() { return fifo_connections_sync_type; }
+      fifo_type get_fifo_type() const { return ftype(); }
+
+      bool read() { sync_in->sync_in(); return true; }
+      bool nb_read(T& t) { t=true; return(sync_in->nb_sync_in()); }
+
+      void write(const T& t) { sync_out->sync_out(); }
+      bool nb_write(T& t) { sync_out->sync_out(); return true; }
+
+      bool empty() {
+        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_output_empty_in_connections);
+        return(false);
+      }
+      bool available(unsigned int k) const { return true; }
+      unsigned int size() const {
+        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_size_in_connections);
+        return 0;
+      }
+      unsigned int num_free() const {
+        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_num_free_in_connections);
+        return 0;
+      }
+      void reset() {
+        if (sync_in) sync_in->reset_sync_in();
+        if (sync_out) sync_out->reset_sync_out();
+      }
+      const T &operator_sb(const unsigned int &, const T &default_value) const {
+        AC_CHANNEL_ASSERT(0, ac_channel_exception::no_operator_sb_defined_for_channel_type);
+        return default_value;
+      }
+    };
+
+
+  public:
+    void bind(Connections::In<T>& c) { get_fifo<fifo_connections>().fifo_in = &c; }
+    void bind(Connections::Out<T>& c) { get_fifo<fifo_connections>().fifo_out = &c; }
+
+    void bind(Connections::SyncIn &c) { get_fifo<fifo_connections_sync>().sync_in = &c; }
+    void bind(Connections::SyncOut &c) { get_fifo<fifo_connections_sync>().sync_out = &c; }
+
+  private:
+#endif
+
+    template <class fifo_T>
+    fifo_T &get_fifo() {
+      if (!f || f->get_fifo_type() != fifo_T::ftype()) {
+        if (f) {
+          AC_CHANNEL_ASSERT(f->empty(), ac_channel_exception::fifo_not_empty_when_reset);
+          delete f;
+        }
+        f = new fifo_T;
+      }
+      return static_cast<fifo_T&>(*f);
+    }
+
+    fifo_abstract *f;
+    unsigned int rSz;   // reset size
+    T rVal;             // resetValue
+    int size_call_count;
+
+  public:
+    fifo() : f(0), rSz(0), size_call_count(0) { get_fifo<fifo_ac_channel>(); }
+    fifo(int init) : f(0), rSz(init), size_call_count(0) { get_fifo<fifo_ac_channel>(); }
+    fifo(int init, T val) : f(0), rSz(init), rVal(val), size_call_count(0) { get_fifo<fifo_ac_channel>(); }
+    ~fifo() { delete f; }
+
+    inline T read() { return f->read(); }
+    inline bool nb_read(T& t) { return f->nb_read(t); }
+
+    inline void write(const T& t) { f->write(t); }
+    inline bool nb_write(T& t) { return f->nb_write(t); }
+
+    inline bool empty() { return f->empty(); }
+    inline bool available(unsigned int k) const { return f->available(k); }
+    inline unsigned int size() const { return f->size(); }
+    inline unsigned int num_free() const { return f->num_free(); }
+
+    inline void reset() {
+      f->reset();
+      for (int i=0; i<(int)rSz; i++)
+        write(rVal);
+    }
+
+    inline const T &operator[](unsigned int pos) const { return f->operator_sb(pos, rVal); }
+
+    void incr_size_call_count() { ++size_call_count; }
+    int get_size_call_count() {
+      int tmp=size_call_count;
+      size_call_count=0;
+      return tmp;
+    }
+
+    // obsolete - provided here for backward compatibility with ac_channel
+    struct iterator {
+      iterator operator+(unsigned int pos_) const {
+        return iterator(itr, pos_);
+      }
+    private:
+      friend class fifo;
+      iterator(const typename std::deque<T>::iterator &itr_, unsigned int pos=0)
+        : itr(itr_) { if (pos) itr += pos; }
+      typename std::deque<T>::iterator itr;
+    };
+    iterator begin() {
+      AC_CHANNEL_ASSERT(f->get_fifo_type() == fifo_ac_channel_type, ac_channel_exception::no_insert_defined_for_channel_type);
+      return iterator(get_fifo<fifo_ac_channel>().ch.begin());
+    }
+    void insert(iterator itr, const T& t) {
+      AC_CHANNEL_ASSERT(f->get_fifo_type() == fifo_ac_channel_type, ac_channel_exception::no_insert_defined_for_channel_type);
+      get_fifo<fifo_ac_channel>().ch.insert(itr.itr,t);
+    }
+  };
+  fifo chan;
+
+private:
+  // Prevent the compiler from autogenerating these.
+  // (This enforces that channels are always passed by reference.)
+  ac_channel(const ac_channel< T >&);
+  ac_channel& operator=(const ac_channel< T >&);
+};
+
+template <class T>
+ac_channel<T>::ac_channel() : chan() {}
+
+template <class T>
+ac_channel<T>::ac_channel(int init) : chan(init)
+{
+  for (int i=init; i>0; i--) {
+    T dc;
+    write(dc);
+  }
+}
+
+template <class T>
+ac_channel<T>::ac_channel(int init, T val) : chan(init, val)
+{
+  for (int i=init; i>0; i--)
+    write(val);
+}
+
+template <class T>
+inline std::ostream& operator<< (std::ostream& os, ac_channel<T> &a)
+{
+  for (unsigned int i=0; i<a.size(); i++) {
+    if (i > 0) os << " ";
+    os << a[i];
+  }
+  return os;
+}
+
+// This general case is meant to cover non channel (or array of them) args
+// Its result will be ignored
+template <class T>
+bool nb_read_chan_rdy(T &x) { return true; }
+
+template <class T>
+bool nb_read_chan_rdy(ac_channel<T> &chan) { return !chan.empty(); }
+
+template <class T, int N>
+bool nb_read_chan_rdy(ac_channel<T> (&chan)[N] ) {
+  bool r = true;
+  for(int i=0; i<N; i++)
+    r &= !chan[i].empty();
+  return r;
+}
+
+#if __cplusplus > 199711L
+template <class... Args>
+bool nb_read_chan_rdy(Args&... args) {
+  const int n_args = sizeof...(args);
+  // only every other arg is a channel (or an array of channels)
+  bool rdy[n_args] = { (nb_read_chan_rdy(args))... };
+  bool r = true;
+  for(int i=0; i < n_args; i+=2)
+    r &= rdy[i];
+  return r;
+}
+#endif
+
+template <class T>
+void nb_read_r(ac_channel<T> &chan, T &var) {
+  chan.nb_read(var);
+}
+
+template <class T, int N>
+void nb_read_r(ac_channel<T> (&chan)[N], T (&var)[N]) {
+  for(int i=0; i<N; i++)
+    chan[i].nb_read(var[i]);
+}
+
+#if __cplusplus > 199711L
+template <class T, class... Args>
+void nb_read_r(ac_channel<T> &chan, T &var, Args&... args) {
+  chan.nb_read(var);
+  nb_read_r(args...);
+}
+
+template <class T, int N, class... Args>
+void nb_read_r(ac_channel<T> (&chan)[N], T (&var)[N], Args&... args) {
+  for(int i=0; i<N; i++)
+    chan[i].nb_read(var[i]);
+  nb_read_r(args...);
+}
+
+template <class... Args>
+bool nb_read_join(Args&... args) {
+  if(nb_read_chan_rdy(args...)) {
+    nb_read_r(args...);
+    return true;
+  }
+  return false;
+}
+#endif
+
+/* undo macro adjustments */
+#ifdef AC_CHANNEL_ASSERT
+# undef AC_CHANNEL_ASSERT
+#endif
+
+#endif
diff --git a/hls4ml/templates/quartus/ac_types/ac_complex.h b/hls4ml/templates/quartus/ac_types/ac_complex.h
index 555b4c89d2..56821a053d 100644
--- a/hls4ml/templates/quartus/ac_types/ac_complex.h
+++ b/hls4ml/templates/quartus/ac_types/ac_complex.h
@@ -1,445 +1,445 @@
-/**************************************************************************
- *                                                                        *
- *  Algorithmic C (tm) Datatypes                                          *
- *                                                                        *
- *  Software Version: 4.0                                                 *
- *                                                                        *
- *  Release Date    : Sat Jun 13 12:35:18 PDT 2020                        *
- *  Release Type    : Production Release                                  *
- *  Release Build   : 4.0.0                                               *
- *                                                                        *
- *  Copyright 2008-2019, Mentor Graphics Corporation,                     *
- *                                                                        *
- *  All Rights Reserved.                                                  *
- *                                                                        *
- **************************************************************************
- *  Licensed under the Apache License, Version 2.0 (the "License");       *
- *  you may not use this file except in compliance with the License.      *
- *  You may obtain a copy of the License at                               *
- *                                                                        *
- *      http://www.apache.org/licenses/LICENSE-2.0                        *
- *                                                                        *
- *  Unless required by applicable law or agreed to in writing, software   *
- *  distributed under the License is distributed on an "AS IS" BASIS,     *
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *
- *  implied.                                                              *
- *  See the License for the specific language governing permissions and   *
- *  limitations under the License.                                        *
- **************************************************************************
- *                                                                        *
- *  The most recent version of this package is available at github.       *
- *                                                                        *
- *************************************************************************/
-
-/*
-// Source:      ac_complex.h
-// Description: complex type with parameterized type that can be:
-//                - C integer types
-//                - C floating point types
-//                - ac_int
-//                - ac_fixed
-//                - ac_float
-//              ac_complex based on C integers, ac_int, ac_fixed and ac_float can
-//              be mixed
-// Author:      Andres Takach, Ph.D.
-*/ - -#ifndef __AC_COMPLEX_H -#define __AC_COMPLEX_H - -#include - -#ifdef __AC_NAMESPACE -namespace __AC_NAMESPACE { -#endif - -template class ac_complex; - -namespace ac_private { - // specializations after definition of ac_complex - template - struct rt_ac_complex_T { - template - struct op1 { - typedef typename T::template rt_T< ac_complex >::mult mult; - typedef typename T::template rt_T< ac_complex >::plus plus; - typedef typename T::template rt_T< ac_complex >::minus2 minus; - typedef typename T::template rt_T< ac_complex >::minus minus2; - typedef typename T::template rt_T< ac_complex >::logic logic; - typedef typename T::template rt_T< ac_complex >::div2 div; - typedef typename T::template rt_T< ac_complex >::div div2; - }; - }; -} // namespace ac_private - -template -class ac_complex { -public: // temporary workaround - T _r; - T _i; - typedef typename ac_private::map::t map_T; - typedef typename map_T::rt_unary::mag_sqr T_sqr; - typedef typename ac_private::map::t map_T_sqr; - typedef typename ac_private::map::t map_T_mag; -public: - typedef T element_type; - template - struct rt_T { - typedef typename ac_private::map::t map_T2; - typedef typename ac_private::rt_ac_complex_T::template op1::mult mult; - typedef typename ac_private::rt_ac_complex_T::template op1::plus plus; - typedef typename ac_private::rt_ac_complex_T::template op1::minus minus; - typedef typename ac_private::rt_ac_complex_T::template op1::minus2 minus2; - typedef typename ac_private::rt_ac_complex_T::template op1::logic logic; - typedef typename ac_private::rt_ac_complex_T::template op1::div div; - typedef typename ac_private::rt_ac_complex_T::template op1::div2 div2; - typedef ac_complex arg1; - }; - - struct rt_unary { - typedef typename map_T_sqr::template rt_T::plus mag_sqr; - typedef typename map_T_mag::template rt_T::plus mag; // overly conservative for signed - typedef ac_complex neg; - template - struct set { - typedef ac_complex::sum> sum; - }; - }; - - ac_complex() { } - template - ac_complex(const ac_complex &c) : _r(c.r()), _i(c.i()) {} - template - ac_complex(const T2 &r) : _r(r), _i(0) {} - template - ac_complex(const T2 &r, const T3 &i) : _r(r), _i(i) {} - const T &r() const { return _r; } - const T &i() const { return _i; } - T &r() { return _r; } - T &i() { return _i; } - const T &real() const { return _r; } - const T &imag() const { return _i; } - T &real() { return _r; } - T &imag() { return _i; } - template - void set_r(const T2 &r) { _r = r;} - template - void set_i(const T2 &i) { _i = i;} - - // const binary operators are global rather than members because of compiler errors due to ambiguity - // (would appear as a compiler bug) - - template - ac_complex &operator +=(const ac_complex &op2) { - _r += op2.r(); - _i += op2.i(); - return *this; - } - - template - ac_complex &operator +=(const T2 &op2) { - _r += op2; - return *this; - } - - template - ac_complex &operator -=(const ac_complex &op2) { - _r -= op2.r(); - _i -= op2.i(); - return *this; - } - - template - ac_complex &operator -=(const T2 &op2) { - _r -= op2; - return *this; - } - - template - ac_complex &operator *=(const ac_complex &op2) { - T r0 = _r*op2.r() - _i*op2.i(); - _i = _r*op2.i() + _i*op2.r(); - _r = r0; - return *this; - } - - template - ac_complex &operator *=(const T2 &op2) { - _r = _r*op2; - _i = _i*op2; - return *this; - } - - template - ac_complex &operator /=(const ac_complex &op2) { - typename ac_complex::rt_unary::mag_sqr d = op2.mag_sqr(); - T r0 = (_r*op2.r() + _i*op2.i())/d; - _i = (_i*op2.r() - 
_r*op2.i())/d; - _r = r0; - return *this; - } - - template - ac_complex &operator /=(const T2 &op2) { - _r = _r/op2; - _i = _i/op2; - return *this; - } - - // Arithmetic Unary -------------------------------------------------------- - ac_complex operator +() { - return *this; - } - typename rt_unary::neg operator -() const { - typename rt_unary::neg res(-_r, -_i); - return res; - } - - // ! ------------------------------------------------------------------------ - bool operator ! () const { - return !_r && !_i; - } - - typename rt_unary::neg conj() const { - typename rt_unary::neg res(_r, -_i); - return res; - } - - typename rt_unary::mag_sqr mag_sqr() const { - return _r*_r + _i*_i; - } - - ac_complex< ac_int<2,true> > sign_conj() const { - return ac_complex< ac_int<2,true> >( - _r ? (_r < 0 ? -1 : 1) : 0, - _i ? (_i < 0 ? 1 : -1) : 0 - ); - } - - inline static std::string type_name() { - typedef typename ac_private::map::t map_T; - std::string r = "ac_complex<"; - r += map_T::type_name(); - r += '>'; - return r; - } - -}; - -namespace ac_private { - // with T2 == ac_complex - template - struct rt_ac_complex_T< ac_complex > { - template - struct op1 { - typedef ac_complex::plus> plus; - typedef ac_complex::minus> minus; - typedef ac_complex::minus2> minus2; - typedef ac_complex::logic> logic; - typedef ac_complex::div> div; - typedef ac_complex::div2> div2; - typedef ac_complex::mult, typename ac::rt_2T::mult>::plus, - typename ac::rt_2T::mult, typename ac::rt_2T::mult>::minus - >::logic> mult; - }; - }; - // with T2 == ac_float - template< AC_FL_T0(2) > - struct rt_ac_complex_T< AC_FL0(2) > { - typedef AC_FL0(2) T2; - template - struct op1 { - typedef ac_complex::plus> plus; - typedef ac_complex::minus> minus; - typedef ac_complex::minus2> minus2; - typedef ac_complex::logic> logic; - typedef ac_complex::div> div; - typedef ac_complex::div2> div2; - typedef ac_complex::mult> mult; - }; - }; - // with T2 == ac_fixed - template - struct rt_ac_complex_T< ac_fixed > { - typedef ac_fixed T2; - template - struct op1 { - typedef ac_complex::plus> plus; - typedef ac_complex::minus> minus; - typedef ac_complex::minus2> minus2; - typedef ac_complex::logic> logic; - typedef ac_complex::div> div; - typedef ac_complex::div2> div2; - typedef ac_complex::mult> mult; - }; - }; - // with T2 == ac_int - template - struct rt_ac_complex_T< ac_int > { - typedef ac_int T2; - template - struct op1 { - typedef ac_complex::plus> plus; - typedef ac_complex::minus> minus; - typedef ac_complex::minus2> minus2; - typedef ac_complex::logic> logic; - typedef ac_complex::div> div; - typedef ac_complex::div2> div2; - typedef ac_complex::mult> mult; - }; - }; - // with T2 == c_type - template - struct rt_ac_complex_T< c_type > { - typedef c_type T2; - template - struct op1 { - typedef ac_complex::plus> plus; - typedef ac_complex::minus> minus; - typedef ac_complex::minus2> minus2; - typedef ac_complex::logic> logic; - typedef ac_complex::div> div; - typedef ac_complex::div2> div2; - typedef ac_complex::mult> mult; - }; - }; -} - -template -inline typename ac_complex::template rt_T >::plus operator +(const ac_complex &op, const ac_complex &op2) { - typename ac_complex::template rt_T >::plus res( op.r() + op2.r(), op.i() + op2.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T::plus operator +(const T &op, const ac_complex &op2) { - typename ac_complex::template rt_T::plus res( op + op2.r(), op2.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T::plus operator +(const 
ac_complex &op, const T2 &op2) { - typename ac_complex::template rt_T::plus res( op.r() + op2, op.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T >::minus operator -(const ac_complex &op, const ac_complex &op2) { - typename ac_complex::template rt_T >::minus res( op.r() - op2.r(), op.i() - op2.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T::minus2 operator -(const T &op, const ac_complex &op2) { - typename ac_complex::template rt_T::minus2 res( op - op2.r(), -op2.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T::minus operator -(const ac_complex &op, const T2 &op2) { - typename ac_complex::template rt_T::minus res( op.r() - op2, op.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T >::mult operator *(const ac_complex &op, const ac_complex &op2) { - typename ac_complex::template rt_T >::mult res( op.r()*op2.r() - op.i()*op2.i(), op.i()*op2.r() + op.r()*op2.i() ); - return res; -} - -template -inline typename ac_complex::template rt_T::mult operator *(const T &op, const ac_complex &op2) { - typename ac_complex::template rt_T::mult res( op*op2.r(), op*op2.i()); - return res; -} - -template -inline typename ac_complex::template rt_T::mult operator *(const ac_complex &op, const T2 &op2) { - typename ac_complex::template rt_T::mult res( op.r()*op2, op.i()*op2 ); - return res; -} - -template -inline typename ac_complex::template rt_T >::div operator /(const ac_complex &op, const ac_complex &op2) { - typename ac_complex::rt_unary::mag_sqr d = op2.mag_sqr(); - typename ac_complex::template rt_T >::div res((op.r()*op2.r() + op.i()*op2.i())/d, (op.i()*op2.r() - op.r()*op2.i())/d); - return res; -} - -template -inline typename ac_complex::template rt_T::div operator /(const ac_complex &op, const T2 &op2) { - typename ac_complex::template rt_T::div res( op.r()/op2, op.i()/op2 ); - return res; -} - -template -inline typename ac_complex::template rt_T::div2 operator /(const T &op, const ac_complex &op2) { - typename ac_complex::rt_unary::mag_sqr d = op2.mag_sqr(); - typename ac_complex::template rt_T::div2 res(op*op2.r()/d, - op*op2.i()/d); - return res; -} - -template -inline bool operator == (const ac_complex &op, const ac_complex &op2) { - return op.r() == op2.r() && op.i() == op2.i(); -} - -template -inline bool operator == (const T &op, const ac_complex &op2) { - return op == op2.r() && op2.i() == 0; -} - -template -inline bool operator == (const ac_complex &op, const T2 &op2) { - return op.r() == op2 && op.i() == 0; -} - -template -inline bool operator != (const ac_complex &op, const ac_complex &op2) { - return op.r() != op2.r() || op.i() != op2.i(); -} - -template -inline bool operator != (const T &op, const ac_complex &op2) { - return op != op2.r() || op2.i() != 0; -} - -template -inline bool operator != (const ac_complex &op, const T2 &op2) { - return op.r() != op2 || op.i() != 0; -} - -// Stream -------------------------------------------------------------------- - -template -inline std::ostream& operator << (std::ostream &os, const ac_complex &x) { -#ifndef __SYNTHESIS__ - os << "(" << x.r() << ", " << x.i() << ")"; -#endif - return os; -} - -template -inline ac_complex value(ac_complex) { - T val = value((T) 0); - ac_complex r(val, val); - return r; -} - -namespace ac { - template - inline bool init_array(ac_complex *a, int n) { - T val = value((T) 0); - ac_complex t(val, val); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } -} - -#ifdef __AC_NAMESPACE -} -#endif 
- -#endif // __AC_COMPLEX_H +/************************************************************************** + * * + * Algorithmic C (tm) Datatypes * + * * + * Software Version: 4.0 * + * * + * Release Date : Sat Jun 13 12:35:18 PDT 2020 * + * Release Type : Production Release * + * Release Build : 4.0.0 * + * * + * Copyright 2008-2019, Mentor Graphics Corporation, * + * * + * All Rights Reserved. * + * * + ************************************************************************** + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * + * implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + ************************************************************************** + * * + * The most recent version of this package is available at github. * + * * + *************************************************************************/ + +/* +// Source: ac_complex.h +// Description: complex type with parameterized type that can be: +// - C integer types +// - C floating point types +// - ac_int +// - ac_fixed +// - ac_float +// ac_complex based on C integers, ac_int, ac_fixed and ac_float can +// be mixed +// Author: Andres Takach, Ph.D. +*/ + +#ifndef __AC_COMPLEX_H +#define __AC_COMPLEX_H + +#include + +#ifdef __AC_NAMESPACE +namespace __AC_NAMESPACE { +#endif + +template class ac_complex; + +namespace ac_private { + // specializations after definition of ac_complex + template + struct rt_ac_complex_T { + template + struct op1 { + typedef typename T::template rt_T< ac_complex >::mult mult; + typedef typename T::template rt_T< ac_complex >::plus plus; + typedef typename T::template rt_T< ac_complex >::minus2 minus; + typedef typename T::template rt_T< ac_complex >::minus minus2; + typedef typename T::template rt_T< ac_complex >::logic logic; + typedef typename T::template rt_T< ac_complex >::div2 div; + typedef typename T::template rt_T< ac_complex >::div div2; + }; + }; +} // namespace ac_private + +template +class ac_complex { +public: // temporary workaround + T _r; + T _i; + typedef typename ac_private::map::t map_T; + typedef typename map_T::rt_unary::mag_sqr T_sqr; + typedef typename ac_private::map::t map_T_sqr; + typedef typename ac_private::map::t map_T_mag; +public: + typedef T element_type; + template + struct rt_T { + typedef typename ac_private::map::t map_T2; + typedef typename ac_private::rt_ac_complex_T::template op1::mult mult; + typedef typename ac_private::rt_ac_complex_T::template op1::plus plus; + typedef typename ac_private::rt_ac_complex_T::template op1::minus minus; + typedef typename ac_private::rt_ac_complex_T::template op1::minus2 minus2; + typedef typename ac_private::rt_ac_complex_T::template op1::logic logic; + typedef typename ac_private::rt_ac_complex_T::template op1::div div; + typedef typename ac_private::rt_ac_complex_T::template op1::div2 div2; + typedef ac_complex arg1; + }; + + struct rt_unary { + typedef typename map_T_sqr::template rt_T::plus mag_sqr; + typedef typename map_T_mag::template rt_T::plus mag; // overly conservative for signed + typedef ac_complex neg; + template + struct set { + 
typedef ac_complex::sum> sum; + }; + }; + + ac_complex() { } + template + ac_complex(const ac_complex &c) : _r(c.r()), _i(c.i()) {} + template + ac_complex(const T2 &r) : _r(r), _i(0) {} + template + ac_complex(const T2 &r, const T3 &i) : _r(r), _i(i) {} + const T &r() const { return _r; } + const T &i() const { return _i; } + T &r() { return _r; } + T &i() { return _i; } + const T &real() const { return _r; } + const T &imag() const { return _i; } + T &real() { return _r; } + T &imag() { return _i; } + template + void set_r(const T2 &r) { _r = r;} + template + void set_i(const T2 &i) { _i = i;} + + // const binary operators are global rather than members because of compiler errors due to ambiguity + // (would appear as a compiler bug) + + template + ac_complex &operator +=(const ac_complex &op2) { + _r += op2.r(); + _i += op2.i(); + return *this; + } + + template + ac_complex &operator +=(const T2 &op2) { + _r += op2; + return *this; + } + + template + ac_complex &operator -=(const ac_complex &op2) { + _r -= op2.r(); + _i -= op2.i(); + return *this; + } + + template + ac_complex &operator -=(const T2 &op2) { + _r -= op2; + return *this; + } + + template + ac_complex &operator *=(const ac_complex &op2) { + T r0 = _r*op2.r() - _i*op2.i(); + _i = _r*op2.i() + _i*op2.r(); + _r = r0; + return *this; + } + + template + ac_complex &operator *=(const T2 &op2) { + _r = _r*op2; + _i = _i*op2; + return *this; + } + + template + ac_complex &operator /=(const ac_complex &op2) { + typename ac_complex::rt_unary::mag_sqr d = op2.mag_sqr(); + T r0 = (_r*op2.r() + _i*op2.i())/d; + _i = (_i*op2.r() - _r*op2.i())/d; + _r = r0; + return *this; + } + + template + ac_complex &operator /=(const T2 &op2) { + _r = _r/op2; + _i = _i/op2; + return *this; + } + + // Arithmetic Unary -------------------------------------------------------- + ac_complex operator +() { + return *this; + } + typename rt_unary::neg operator -() const { + typename rt_unary::neg res(-_r, -_i); + return res; + } + + // ! ------------------------------------------------------------------------ + bool operator ! () const { + return !_r && !_i; + } + + typename rt_unary::neg conj() const { + typename rt_unary::neg res(_r, -_i); + return res; + } + + typename rt_unary::mag_sqr mag_sqr() const { + return _r*_r + _i*_i; + } + + ac_complex< ac_int<2,true> > sign_conj() const { + return ac_complex< ac_int<2,true> >( + _r ? (_r < 0 ? -1 : 1) : 0, + _i ? (_i < 0 ? 
1 : -1) : 0 + ); + } + + inline static std::string type_name() { + typedef typename ac_private::map::t map_T; + std::string r = "ac_complex<"; + r += map_T::type_name(); + r += '>'; + return r; + } + +}; + +namespace ac_private { + // with T2 == ac_complex + template + struct rt_ac_complex_T< ac_complex > { + template + struct op1 { + typedef ac_complex::plus> plus; + typedef ac_complex::minus> minus; + typedef ac_complex::minus2> minus2; + typedef ac_complex::logic> logic; + typedef ac_complex::div> div; + typedef ac_complex::div2> div2; + typedef ac_complex::mult, typename ac::rt_2T::mult>::plus, + typename ac::rt_2T::mult, typename ac::rt_2T::mult>::minus + >::logic> mult; + }; + }; + // with T2 == ac_float + template< AC_FL_T0(2) > + struct rt_ac_complex_T< AC_FL0(2) > { + typedef AC_FL0(2) T2; + template + struct op1 { + typedef ac_complex::plus> plus; + typedef ac_complex::minus> minus; + typedef ac_complex::minus2> minus2; + typedef ac_complex::logic> logic; + typedef ac_complex::div> div; + typedef ac_complex::div2> div2; + typedef ac_complex::mult> mult; + }; + }; + // with T2 == ac_fixed + template + struct rt_ac_complex_T< ac_fixed > { + typedef ac_fixed T2; + template + struct op1 { + typedef ac_complex::plus> plus; + typedef ac_complex::minus> minus; + typedef ac_complex::minus2> minus2; + typedef ac_complex::logic> logic; + typedef ac_complex::div> div; + typedef ac_complex::div2> div2; + typedef ac_complex::mult> mult; + }; + }; + // with T2 == ac_int + template + struct rt_ac_complex_T< ac_int > { + typedef ac_int T2; + template + struct op1 { + typedef ac_complex::plus> plus; + typedef ac_complex::minus> minus; + typedef ac_complex::minus2> minus2; + typedef ac_complex::logic> logic; + typedef ac_complex::div> div; + typedef ac_complex::div2> div2; + typedef ac_complex::mult> mult; + }; + }; + // with T2 == c_type + template + struct rt_ac_complex_T< c_type > { + typedef c_type T2; + template + struct op1 { + typedef ac_complex::plus> plus; + typedef ac_complex::minus> minus; + typedef ac_complex::minus2> minus2; + typedef ac_complex::logic> logic; + typedef ac_complex::div> div; + typedef ac_complex::div2> div2; + typedef ac_complex::mult> mult; + }; + }; +} + +template +inline typename ac_complex::template rt_T >::plus operator +(const ac_complex &op, const ac_complex &op2) { + typename ac_complex::template rt_T >::plus res( op.r() + op2.r(), op.i() + op2.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T::plus operator +(const T &op, const ac_complex &op2) { + typename ac_complex::template rt_T::plus res( op + op2.r(), op2.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T::plus operator +(const ac_complex &op, const T2 &op2) { + typename ac_complex::template rt_T::plus res( op.r() + op2, op.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T >::minus operator -(const ac_complex &op, const ac_complex &op2) { + typename ac_complex::template rt_T >::minus res( op.r() - op2.r(), op.i() - op2.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T::minus2 operator -(const T &op, const ac_complex &op2) { + typename ac_complex::template rt_T::minus2 res( op - op2.r(), -op2.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T::minus operator -(const ac_complex &op, const T2 &op2) { + typename ac_complex::template rt_T::minus res( op.r() - op2, op.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T >::mult operator 
*(const ac_complex &op, const ac_complex &op2) { + typename ac_complex::template rt_T >::mult res( op.r()*op2.r() - op.i()*op2.i(), op.i()*op2.r() + op.r()*op2.i() ); + return res; +} + +template +inline typename ac_complex::template rt_T::mult operator *(const T &op, const ac_complex &op2) { + typename ac_complex::template rt_T::mult res( op*op2.r(), op*op2.i()); + return res; +} + +template +inline typename ac_complex::template rt_T::mult operator *(const ac_complex &op, const T2 &op2) { + typename ac_complex::template rt_T::mult res( op.r()*op2, op.i()*op2 ); + return res; +} + +template +inline typename ac_complex::template rt_T >::div operator /(const ac_complex &op, const ac_complex &op2) { + typename ac_complex::rt_unary::mag_sqr d = op2.mag_sqr(); + typename ac_complex::template rt_T >::div res((op.r()*op2.r() + op.i()*op2.i())/d, (op.i()*op2.r() - op.r()*op2.i())/d); + return res; +} + +template +inline typename ac_complex::template rt_T::div operator /(const ac_complex &op, const T2 &op2) { + typename ac_complex::template rt_T::div res( op.r()/op2, op.i()/op2 ); + return res; +} + +template +inline typename ac_complex::template rt_T::div2 operator /(const T &op, const ac_complex &op2) { + typename ac_complex::rt_unary::mag_sqr d = op2.mag_sqr(); + typename ac_complex::template rt_T::div2 res(op*op2.r()/d, - op*op2.i()/d); + return res; +} + +template +inline bool operator == (const ac_complex &op, const ac_complex &op2) { + return op.r() == op2.r() && op.i() == op2.i(); +} + +template +inline bool operator == (const T &op, const ac_complex &op2) { + return op == op2.r() && op2.i() == 0; +} + +template +inline bool operator == (const ac_complex &op, const T2 &op2) { + return op.r() == op2 && op.i() == 0; +} + +template +inline bool operator != (const ac_complex &op, const ac_complex &op2) { + return op.r() != op2.r() || op.i() != op2.i(); +} + +template +inline bool operator != (const T &op, const ac_complex &op2) { + return op != op2.r() || op2.i() != 0; +} + +template +inline bool operator != (const ac_complex &op, const T2 &op2) { + return op.r() != op2 || op.i() != 0; +} + +// Stream -------------------------------------------------------------------- + +template +inline std::ostream& operator << (std::ostream &os, const ac_complex &x) { +#ifndef __SYNTHESIS__ + os << "(" << x.r() << ", " << x.i() << ")"; +#endif + return os; +} + +template +inline ac_complex value(ac_complex) { + T val = value((T) 0); + ac_complex r(val, val); + return r; +} + +namespace ac { + template + inline bool init_array(ac_complex *a, int n) { + T val = value((T) 0); + ac_complex t(val, val); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } +} + +#ifdef __AC_NAMESPACE +} +#endif + +#endif // __AC_COMPLEX_H diff --git a/hls4ml/templates/quartus/ac_types/ac_fixed.h b/hls4ml/templates/quartus/ac_types/ac_fixed.h index 458cbddee6..cb95db8d16 100644 --- a/hls4ml/templates/quartus/ac_types/ac_fixed.h +++ b/hls4ml/templates/quartus/ac_types/ac_fixed.h @@ -1,1546 +1,1546 @@ -/************************************************************************** - * * - * Algorithmic C (tm) Datatypes * - * * - * Software Version: 4.0 * - * * - * Release Date : Sat Jun 13 12:35:18 PDT 2020 * - * Release Type : Production Release * - * Release Build : 4.0.0 * - * * - * Copyright 2005-2020, Mentor Graphics Corporation, * - * * - * All Rights Reserved. 
* - * * - ************************************************************************** - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * - * implied. * - * See the License for the specific language governing permissions and * - * limitations under the License. * - ************************************************************************** - * * - * The most recent version of this package is available at github. * - * * - *************************************************************************/ - -/* -// Source: ac_fixed.h -// Description: class for fixed point operation handling in C++ -// Author: Andres Takach, Ph.D. -*/ - -#ifndef __AC_FIXED_H -#define __AC_FIXED_H - -#include "ac_int.h" - -#if (defined(__GNUC__) && __GNUC__ < 3 && !defined(__EDG__)) -#error GCC version 3 or greater is required to include this header file -#endif - -#if (defined(_MSC_VER) && _MSC_VER < 1400 && !defined(__EDG__)) -#error Microsoft Visual Studio 8 or newer is required to include this header file -#endif - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( push ) -#pragma warning( disable: 4127 4308 4365 4514 4800 ) -#endif - -#ifndef __SYNTHESIS__ -#ifndef __AC_FIXED_UTILITY_BASE -#define __AC_FIXED_UTILITY_BASE -#endif - -#endif - -#ifdef __SYNTHESIS__ -#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE -#undef __AC_FIXED_NUMERICAL_ANALYSIS_BASE -#endif -#endif - -#ifdef __AC_NAMESPACE -namespace __AC_NAMESPACE { -#endif - -namespace ac_private { - template - struct rt_ac_fixed_T { - template - struct op1 { - typedef typename T::template rt_T< ac_fixed >::mult mult; - typedef typename T::template rt_T< ac_fixed >::plus plus; - typedef typename T::template rt_T< ac_fixed >::minus2 minus; - typedef typename T::template rt_T< ac_fixed >::minus minus2; - typedef typename T::template rt_T< ac_fixed >::logic logic; - typedef typename T::template rt_T< ac_fixed >::div2 div; - typedef typename T::template rt_T< ac_fixed >::div div2; - }; - }; - // specializations after definition of ac_fixed -} - -namespace ac { - template - class basic_num_ovf_base { - bool d_enable; - public: - basic_num_ovf_base() : d_enable(true) {} - void enable_ovf(bool a) { d_enable = a; } - bool is_enabled() const { return d_enable; } - template - void update(bool overflow, bool neg, const basic_num_ovf_base &op2) { -#ifndef __AC_OVERRIDE_OVF_UPDATE_BODY - if(d_enable) { - if(overflow) { - std::cerr << (neg ? "-" : "+") << "OVF: "; - std::cerr << type_name() << " ( " << basic_num_ovf_base::type_name(); - std::cerr << " ( " << op2.value().to_double() << " ) )" << std::endl; - } - } -#else - __AC_OVERRIDE_OVF_UPDATE_BODY -#endif - } - void update(bool overflow, bool neg, double op2) { -#ifndef __AC_OVERRIDE_OVF_UPDATE2_BODY - if(d_enable) { - if(overflow) { - std::cerr << (neg ? 
"-" : "+") << "OVF: "; - std::cerr << type_name() << " ( " << "double"; - std::cerr << " ( " << op2 << " ) )" << std::endl; - } - } -#else - __AC_OVERRIDE_OVF_UPDATE2_BODY -#endif - } - const ac_fixed &value() const; - static std::string type_name(); - }; -} - -////////////////////////////////////////////////////////////////////////////// -// ac_fixed -////////////////////////////////////////////////////////////////////////////// - -//enum ac_q_mode { AC_TRN, AC_RND, AC_TRN_ZERO, AC_RND_ZERO, AC_RND_INF, AC_RND_MIN_INF, AC_RND_CONV, AC_RND_CONV_ODD }; -//enum ac_o_mode { AC_WRAP, AC_SAT, AC_SAT_ZERO, AC_SAT_SYM }; - -template -class ac_fixed : private ac_private::iv<(W+31+!S)/32> -#ifndef __SYNTHESIS__ -__AC_FIXED_UTILITY_BASE -#endif -#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE -, public __AC_FIXED_NUMERICAL_ANALYSIS_BASE -#endif -{ -#if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) -#pragma builtin -#endif - - enum {N=(W+31+!S)/32}; - - template - struct rt_priv { - enum {w_shiftl = AC_MAX(W+W2,1) }; - typedef ac_fixed shiftl; - }; - - typedef ac_private::iv Base; - - inline void bit_adjust() { - const unsigned rem = (32-W)&31; - Base::v[N-1] = S ? ((signed) ((unsigned)Base::v[N-1] << rem) >> rem) : (rem ? - ((unsigned) Base::v[N-1] << rem) >> rem : 0); - } - inline Base &base() { return *this; } - inline const Base &base() const { return *this; } - - inline void overflow_adjust(bool overflow, bool neg) { - if(O==AC_WRAP) { - bit_adjust(); - return; - } - else if(O==AC_SAT_ZERO) { - if(overflow) - ac_private::iv_extend(Base::v, 0); - else - bit_adjust(); - } - else if(S) { - if(overflow) { - if(!neg) { - ac_private::iv_extend(Base::v, ~0); - Base::v[N-1] = ~((unsigned)~0 << ((W-1)&31)); - } else { - ac_private::iv_extend(Base::v, 0); - Base::v[N-1] = ((unsigned)~0 << ((W-1)&31)); - if(O==AC_SAT_SYM) - Base::v[0] |= 1; - } - } else - bit_adjust(); - } - else { - if(overflow) { - if(!neg) { - ac_private::iv_extend(Base::v, ~0); - Base::v[N-1] = ~((unsigned)~0 << (W&31)); - } else - ac_private::iv_extend(Base::v, 0); - } else - bit_adjust(); - } - } - - inline bool quantization_adjust(bool qb, bool r, bool s) { - if(Q==AC_TRN) - return false; - if(Q==AC_RND_ZERO) - qb &= s || r; - else if(Q==AC_RND_MIN_INF) - qb &= r; - else if(Q==AC_RND_INF) - qb &= !s || r; - else if(Q==AC_RND_CONV) - qb &= (Base::v[0] & 1) || r; - else if(Q==AC_RND_CONV_ODD) - qb &= (!(Base::v[0] & 1)) || r; - else if(Q==AC_TRN_ZERO) - qb = s && ( qb || r ); - return ac_private::iv_uadd_carry(Base::v, qb, Base::v); - } - - inline bool is_neg() const { return S && Base::v[N-1] < 0; } - -public: - static const int width = W; - static const int i_width = I; - static const bool sign = S; - static const ac_o_mode o_mode = O; - static const ac_q_mode q_mode = Q; - static const int e_width = 0; -#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE - static const bool compute_overflow_for_wrap = true; -#else - static const bool compute_overflow_for_wrap = false; -#endif - - template - struct rt { - enum { - F=W-I, - F2=W2-I2, - mult_w = W+W2, - mult_i = I+I2, - mult_s = S||S2, - plus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), - plus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, - plus_s = S||S2, - minus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), - minus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, - minus_s = true, - div_w = W+AC_MAX(W2-I2,0)+S2, - div_i = I+(W2-I2)+S2, - div_s = S||S2, - logic_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+AC_MAX(F,F2), - logic_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2)), - logic_s = S||S2 - }; - typedef ac_fixed 
mult; - typedef ac_fixed plus; - typedef ac_fixed minus; - typedef ac_fixed logic; - typedef ac_fixed div; - typedef ac_fixed arg1; - }; - - template - struct rt_T { - typedef typename ac_private::map::t map_T; - typedef typename ac_private::rt_ac_fixed_T::template op1::mult mult; - typedef typename ac_private::rt_ac_fixed_T::template op1::plus plus; - typedef typename ac_private::rt_ac_fixed_T::template op1::minus minus; - typedef typename ac_private::rt_ac_fixed_T::template op1::minus2 minus2; - typedef typename ac_private::rt_ac_fixed_T::template op1::logic logic; - typedef typename ac_private::rt_ac_fixed_T::template op1::div div; - typedef typename ac_private::rt_ac_fixed_T::template op1::div2 div2; - typedef ac_fixed arg1; - }; - - struct rt_unary { - enum { - neg_w = W+1, - neg_i = I+1, - neg_s = true, - mag_sqr_w = 2*W-S, - mag_sqr_i = 2*I-S, - mag_sqr_s = false, - mag_w = W+S, - mag_i = I+S, - mag_s = false, - leading_sign_w = ac::log2_ceil::val, - leading_sign_s = false - }; - typedef ac_int leading_sign; - typedef ac_fixed neg; - typedef ac_fixed mag_sqr; - typedef ac_fixed mag; - template - struct set { - enum { sum_w = W + ac::log2_ceil::val, sum_i = (sum_w-W) + I, sum_s = S}; - typedef ac_fixed sum; - }; - }; - - ac_fixed(const ac_fixed &op): Base(op) { } - - template friend class ac_fixed; - ac_fixed() { -#if !defined(__SYNTHESIS__) && defined(AC_DEFAULT_IN_RANGE) - bit_adjust(); - if( O==AC_SAT_SYM && S && Base::v[N-1] < 0 && (W > 1 ? ac_private::iv_equal_zeros_to(Base::v) : true) ) - Base::v[0] |= 1; -#endif - } - template - inline ac_fixed (const ac_fixed &op) { - enum {N2=(W2+31+!S2)/32, F=W-I, F2=W2-I2, QUAN_INC = F2>F && !(Q==AC_TRN || (Q==AC_TRN_ZERO && !S2)) }; - bool carry = false; - // handle quantization - if(F2 == F) - Base::operator =(op); - else if(F2 > F) { - op.template const_shift_r(*this); -// ac_private::iv_const_shift_r(op.v, Base::v); - if(Q!=AC_TRN && !(Q==AC_TRN_ZERO && !S2)) { - bool qb = (F2-F > W2) ? (op.v[N2-1] < 0) : (bool) op[F2-F-1]; - bool r = (F2 > F+1) ? !ac_private::iv_equal_zeros_to(op.v) : false; - carry = quantization_adjust(qb, r, S2 && op.v[N2-1] < 0); - } - } - else // no quantization - op.template const_shift_l(*this); -// ac_private::iv_const_shift_l(op.v, Base::v); - // handle overflow - if((O!=AC_WRAP || compute_overflow_for_wrap) - && ((!S && S2) || I-S < I2-S2+(QUAN_INC || (S2 && O==AC_SAT_SYM && (O2 != AC_SAT_SYM || F2 > F) ))) - ) { // saturation - bool deleted_bits_zero = !(W&31)&S || !(Base::v[N-1] >> (W&31)); - bool deleted_bits_one = !(W&31)&S || !~(Base::v[N-1] >> (W&31)); - bool neg_src; - if(F2-F+32*N < W2) { - bool all_ones = ac_private::iv_equal_ones_from(op.v); - deleted_bits_zero = deleted_bits_zero && (carry ? all_ones : ac_private::iv_equal_zeros_from(op.v)); - deleted_bits_one = deleted_bits_one && (carry ? ac_private::iv_equal_ones_from<1+F2-F+32*N,N2>(op.v) && !op[F2-F+32*N] : all_ones); - neg_src = S2 && op.v[N2-1] < 0 && !(carry & all_ones); - } - else - neg_src = S2 && op.v[N2-1] < 0 && Base::v[N-1] < 0; - bool neg_trg = S && (bool) this->operator[](W-1); - bool overflow = !neg_src && (neg_trg || !deleted_bits_zero); - overflow |= neg_src && (!neg_trg || !deleted_bits_one); - if(O==AC_SAT_SYM && S && S2) - overflow |= neg_src && (W > 1 ? 
ac_private::iv_equal_zeros_to(Base::v) : true); - overflow_adjust(overflow, neg_src); -#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE - __AC_FIXED_NUMERICAL_ANALYSIS_BASE::update(overflow,neg_src,op); -#endif - } - else - bit_adjust(); - } - - template - inline ac_fixed (const ac_int &op) { - ac_fixed f_op; - f_op.base().operator =(op); - *this = f_op; - } - - template - typename rt_priv::shiftl shiftl() const { - typedef typename rt_priv::shiftl shiftl_t; - shiftl_t r; - Base::template const_shift_l(r); - return r; - } - - inline ac_fixed( bool b ) { *this = (ac_int<1,false>) b; } - inline ac_fixed( char b ) { *this = (ac_int<8,true>) b; } - inline ac_fixed( signed char b ) { *this = (ac_int<8,true>) b; } - inline ac_fixed( unsigned char b ) { *this = (ac_int<8,false>) b; } - inline ac_fixed( signed short b ) { *this = (ac_int<16,true>) b; } - inline ac_fixed( unsigned short b ) { *this = (ac_int<16,false>) b; } - inline ac_fixed( signed int b ) { *this = (ac_int<32,true>) b; } - inline ac_fixed( unsigned int b ) { *this = (ac_int<32,false>) b; } - inline ac_fixed( signed long b ) { *this = (ac_int) b; } - inline ac_fixed( unsigned long b ) { *this = (ac_int) b; } - inline ac_fixed( Slong b ) { *this = (ac_int<64,true>) b; } - inline ac_fixed( Ulong b ) { *this = (ac_int<64,false>) b; } - - inline ac_fixed( double d ) { - double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); - bool o, qb, r; - bool neg_src = d < 0; - Base::conv_from_fraction(di, &qb, &r, &o); - quantization_adjust(qb, r, neg_src); - // a neg number may become non neg (0) after quantization - neg_src &= o || Base::v[N-1] < 0; - - if(O!=AC_WRAP || compute_overflow_for_wrap) { // saturation - bool overflow; - bool neg_trg = S && (bool) this->operator[](W-1); - if(o) { - overflow = true; - } else { - bool deleted_bits_zero = !(W&31)&S || !(Base::v[N-1] >> (W&31)); - bool deleted_bits_one = !(W&31)&S || !~(Base::v[N-1] >> (W&31)); - overflow = !neg_src && (neg_trg || !deleted_bits_zero); - overflow |= neg_src && (!neg_trg || !deleted_bits_one); - } - if(O==AC_SAT_SYM && S) - overflow |= neg_src && (W > 1 ? 
ac_private::iv_equal_zeros_to(Base::v) : true); - overflow_adjust(overflow, neg_src); -#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE - __AC_FIXED_NUMERICAL_ANALYSIS_BASE::update(overflow,neg_src,d); -#endif - } else - bit_adjust(); - } - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( push ) -#pragma warning( disable: 4700 ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#endif -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wuninitialized" -#endif - template - inline ac_fixed &set_val() { - if(V == AC_VAL_DC) { - ac_fixed r; - Base::operator =(r); - bit_adjust(); - } - else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - Base::operator =(0); - if(S && V == AC_VAL_MIN) { - const unsigned rem = (W-1)&31; - Base::v[N-1] = ((unsigned)~0 << rem); - if(O == AC_SAT_SYM) { - if(W == 1) - Base::v[0] = 0; - else - Base::v[0] |= 1; - } - } else if(V == AC_VAL_QUANTUM) - Base::v[0] = 1; - } - else { // AC_VAL_MAX - Base::operator =(-1); - const unsigned int rem = (32-W - (unsigned) !S )&31; - Base::v[N-1] = ((unsigned) (-1) >> 1) >> rem; - } - return *this; - } -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( pop ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - - // Explicit conversion functions to ac_int that captures all integer bits (bits are truncated) - inline ac_int to_ac_int() const { return ((ac_fixed) *this).template slc(0); } - - // Explicit conversion functions to C built-in types ------------- - inline int to_int() const { return ((I-W) >= 32) ? 0 : (signed int) to_ac_int(); } - inline unsigned to_uint() const { return ((I-W) >= 32) ? 0 : (unsigned int) to_ac_int(); } - inline long to_long() const { return ((I-W) >= ac_private::long_w) ? 0 : (signed long) to_ac_int(); } - inline unsigned long to_ulong() const { return ((I-W) >= ac_private::long_w) ? 0 : (unsigned long) to_ac_int(); } - inline Slong to_int64() const { return ((I-W) >= 64) ? 0 : (Slong) to_ac_int(); } - inline Ulong to_uint64() const { return ((I-W) >= 64) ? 0 : (Ulong) to_ac_int(); } - inline double to_double() const { return ac_private::ldexpr(Base::to_double()); } - - inline int length() const { return W; } - - inline std::string to_string(ac_base_mode base_rep, bool sign_mag = false) const { - // base_rep == AC_DEC => sign_mag == don't care (always print decimal in sign magnitude) - char r[(W-AC_MIN(AC_MIN(W-I,I),0)+31)/32*32+5] = {0}; - int i = 0; - if(sign_mag) - r[i++] = is_neg() ? '-' : '+'; - else if (base_rep == AC_DEC && is_neg()) - r[i++] = '-'; - if(base_rep != AC_DEC) { - r[i++] = '0'; - r[i++] = base_rep == AC_BIN ? 'b' : (base_rep == AC_OCT ? 
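A short illustration of the conversion routines above: to_ac_int() and the to_*() family truncate the fractional bits (floor, for two's-complement values), while to_string() renders sign-magnitude decimal for AC_DEC. A minimal sketch with illustrative values:

    #include <string>
    #include "ac_fixed.h"

    void conversion_sketch() {
      ac_fixed<10,6,true> x = -5.75;
      ac_int<6,true> xi = x.to_ac_int();     // integer bits only, fraction truncated: -6
      int xc = x.to_int();                   // same value through the C-type path: -6
      double xd = x.to_double();             // exact here: -5.75
      std::string s = x.to_string(AC_DEC);   // "-5.75"
    }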
'o' : 'x'); - } - ac_fixed t; - if( (base_rep == AC_DEC || sign_mag) && is_neg() ) - t = operator -(); - else - t = *this; - ac_fixed i_part = t; - ac_fixed f_part = t; - i += ac_private::to_string(i_part.v, AC_MAX(I+1,1), sign_mag, base_rep, false, r+i); - if(W-I > 0) { - r[i++] = '.'; - if(!ac_private::to_string(f_part.v, W-I, false, base_rep, true, r+i)) - r[--i] = 0; - } - if(!i) { - r[0] = '0'; - r[1] = 0; - } - return std::string(r); - } - inline static std::string type_name() { - const char *tf[] = {"false", "true" }; - const char *q[] = {"AC_TRN", "AC_RND", "AC_TRN_ZERO", "AC_RND_ZERO", "AC_RND_INF", "AC_RND_MIN_INF", "AC_RND_CONV", "AC_RND_CONV_ODD" }; - const char *o[] = {"AC_WRAP", "AC_SAT", "AC_SAT_ZERO", "AC_SAT_SYM" }; - std::string r = "ac_fixed<"; - r += ac_int<32,true>(W).to_string(AC_DEC) + ','; - r += ac_int<32,true>(I).to_string(AC_DEC) + ','; - r += tf[S]; - r += ','; - r += q[Q]; - r += ','; - r += o[O]; - r += '>'; - return r; - } - - // Arithmetic : Binary ---------------------------------------------------- - template - typename rt::mult operator *( const ac_fixed &op2) const { - typename rt::mult r; - Base::mult(op2, r); - return r; - } - template - typename rt::plus operator +( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - typename rt::plus r; - if(F == F2) - Base::add(op2, r); - else if(F > F2) - Base::add(op2.template shiftl(), r); - else - shiftl().add(op2, r); - return r; - } - template - typename rt::minus operator -( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - typename rt::minus r; - if(F == F2) - Base::sub(op2, r); - else if(F > F2) - Base::sub(op2.template shiftl(), r); - else - shiftl().sub(op2, r); - return r; - } -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wenum-compare" -#endif - template - typename rt::div operator /( const ac_fixed &op2) const { - typename rt::div r; - enum { Num_w = W+AC_MAX(W2-I2,0), Num_i = I, Num_w_minus = Num_w+S, Num_i_minus = Num_i+S, - N1 = ac_fixed::N, N1minus = ac_fixed::N, - N2 = ac_fixed::N, N2minus = ac_fixed::N, - num_s = S + (N1minus > N1), den_s = S2 + (N2minus > N2), Nr = rt::div::N }; - ac_fixed t = *this; - t.template div(op2, r); - return r; - } -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif - // Arithmetic assign ------------------------------------------------------ - template - ac_fixed &operator *=( const ac_fixed &op2) { - *this = this->operator *(op2); - return *this; - } - template - ac_fixed &operator +=( const ac_fixed &op2) { - *this = this->operator +(op2); - return *this; - } - template - ac_fixed &operator -=( const ac_fixed &op2) { - *this = this->operator -(op2); - return *this; - } - template - ac_fixed &operator /=( const ac_fixed &op2) { - *this = this->operator /(op2); - return *this; - } - // increment/decrement by quantum (smallest difference that can be represented) - // Arithmetic prefix increment, decrement --------------------------------- - ac_fixed &operator ++() { - ac_fixed<1,I-W+1,false> q; - q.template set_val(); - operator += (q); - return *this; - } - ac_fixed &operator --() { - ac_fixed<1,I-W+1,false> q; - q.template set_val(); - operator -= (q); - return *this; - } - // Arithmetic postfix increment, decrement --------------------------------- - const ac_fixed operator ++(int) { - ac_fixed t = *this; - ac_fixed<1,I-W+1,false> q; - 
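Note how each binary operator above returns the corresponding rt<> type computed earlier in the class, so the intermediate result is always wide enough to be exact; re-quantization only happens when the result is assigned to a narrower ac_fixed. A minimal sketch with the result types spelled out (values illustrative):

    #include "ac_fixed.h"

    void result_type_sketch() {
      ac_fixed<8,4,true> a = 2.5, b = -1.25;
      ac_fixed<8,4,true>::rt<8,4,true>::mult p = a * b;  // ac_fixed<16,8,true>: exact product
      ac_fixed<8,4,true>::rt<8,4,true>::plus q = a + b;  // ac_fixed<9,5,true>: one growth bit
      a += b;  // compound assign: full-width sum, then re-quantized back into <8,4>
    }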
q.template set_val(); - operator += (q); - return t; - } - const ac_fixed operator --(int) { - ac_fixed t = *this; - ac_fixed<1,I-W+1,false> q; - q.template set_val(); - operator -= (q); - return t; - } - // Arithmetic Unary -------------------------------------------------------- - ac_fixed operator +() { - return *this; - } - typename rt_unary::neg operator -() const { - typename rt_unary::neg r; - Base::neg(r); - r.bit_adjust(); - return r; - } - // ! ------------------------------------------------------------------------ - bool operator ! () const { - return Base::equal_zero(); - } - - // Bitwise (arithmetic) unary: complement ----------------------------- - ac_fixed operator ~() const { - ac_fixed r; - Base::bitwise_complement(r); - return r; - } - // Bitwise (not arithmetic) bit complement ----------------------------- - ac_fixed bit_complement() const { - ac_fixed r; - Base::bitwise_complement(r); - r.bit_adjust(); - return r; - } - // Bitwise (not arithmetic): and, or, xor ---------------------------------- - template - typename rt::logic operator &( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - typename rt::logic r; - if(F == F2) - Base::bitwise_and(op2, r); - else if(F > F2) - Base::bitwise_and(op2.template shiftl(), r); - else - shiftl().bitwise_and(op2, r); - return r; - } - template - typename rt::logic operator |( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - typename rt::logic r; - if(F == F2) - Base::bitwise_or(op2, r); - else if(F > F2) - Base::bitwise_or(op2.template shiftl(), r); - else - shiftl().bitwise_or(op2, r); - return r; - } - template - typename rt::logic operator ^( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - typename rt::logic r; - if(F == F2) - Base::bitwise_xor(op2, r); - else if(F > F2) - Base::bitwise_xor(op2.template shiftl(), r); - else - shiftl().bitwise_xor(op2, r); - return r; - } - // Bitwise assign (not arithmetic): and, or, xor ---------------------------- - template - ac_fixed &operator &= ( const ac_fixed &op2 ) { - *this = this->operator &(op2); - return *this; - } - template - ac_fixed &operator |= ( const ac_fixed &op2 ) { - *this = this->operator |(op2); - return *this; - } - template - ac_fixed &operator ^= ( const ac_fixed &op2 ) { - *this = this->operator ^(op2); - return *this; - } - // Shift (result constrained by left operand) ------------------------------- - template - ac_fixed operator << ( const ac_int &op2 ) const { - // currently not written to overflow or quantize (neg shift) - ac_fixed r; - Base::shift_l2(op2.to_int(), r); - r.bit_adjust(); - return r; - } - template - ac_fixed operator << ( const ac_int &op2 ) const { - // currently not written to overflow - ac_fixed r; - Base::shift_l(op2.to_uint(), r); - r.bit_adjust(); - return r; - } - template - ac_fixed operator >> ( const ac_int &op2 ) const { - // currently not written to quantize or overflow (neg shift) - ac_fixed r; - Base::shift_r2(op2.to_int(), r); - r.bit_adjust(); - return r; - } - template - ac_fixed operator >> ( const ac_int &op2 ) const { - // currently not written to quantize - ac_fixed r; - Base::shift_r(op2.to_uint(), r); - r.bit_adjust(); - return r; - } - // Shift assign ------------------------------------------------------------ - template - ac_fixed operator <<= ( const ac_int &op2 ) { - // currently not written to overflow or quantize (neg shift) - Base r; - Base::shift_l2(op2.to_int(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_fixed operator <<= ( const ac_int &op2 ) { - // 
currently not written to overflow - Base r; - Base::shift_l(op2.to_uint(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_fixed operator >>= ( const ac_int &op2 ) { - // currently not written to quantize or overflow (neg shift) - Base r; - Base::shift_r2(op2.to_int(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_fixed operator >>= ( const ac_int &op2 ) { - // currently not written to quantize - Base r; - Base::shift_r(op2.to_uint(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - // Relational --------------------------------------------------------------- - template - bool operator == ( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - if(F == F2) - return Base::equal(op2); - else if(F > F2) - return Base::equal(op2.template shiftl()); - else - return shiftl().equal(op2); - } - template - bool operator != ( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - if(F == F2) - return ! Base::equal(op2); - else if(F > F2) - return ! Base::equal(op2.template shiftl()); - else - return ! shiftl().equal(op2); - } - template - bool operator < ( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - if(F == F2) - return Base::less_than(op2); - else if(F > F2) - return Base::less_than(op2.template shiftl()); - else - return shiftl().less_than(op2); - } - template - bool operator >= ( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - if(F == F2) - return ! Base::less_than(op2); - else if(F > F2) - return ! Base::less_than(op2.template shiftl()); - else - return ! shiftl().less_than(op2); - } - template - bool operator > ( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - if(F == F2) - return Base::greater_than(op2); - else if(F > F2) - return Base::greater_than(op2.template shiftl()); - else - return shiftl().greater_than(op2); - } - template - bool operator <= ( const ac_fixed &op2) const { - enum { F=W-I, F2=W2-I2 }; - if(F == F2) - return ! Base::greater_than(op2); - else if(F > F2) - return ! Base::greater_than(op2.template shiftl()); - else - return ! 
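The relational operators above never compare raw bit patterns across formats: whichever operand has fewer fractional bits is first shifted left with shiftl<>() so both share a binary point. A minimal sketch, values illustrative:

    #include "ac_fixed.h"

    void compare_sketch() {
      ac_fixed<8,4,true>  a = 1.5;   // F = 4
      ac_fixed<12,3,true> b = 1.5;   // F = 9: different layout, same value
      bool eq = (a == b);            // true: operands are aligned before comparing
      bool lt = (a < 1.75);          // the double overloads just below handle this case
    }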
shiftl().greater_than(op2); - } - bool operator == ( double d) const { - if(is_neg() != (d < 0.0)) - return false; - double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); - bool overflow, qb, r; - ac_fixed t; - t.conv_from_fraction(di, &qb, &r, &overflow); - if(qb || r || overflow) - return false; - return operator == (t); - } - bool operator != ( double d) const { - return !operator == ( d ); - } - bool operator < ( double d) const { - if(is_neg() != (d < 0.0)) - return is_neg(); - double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); - bool overflow, qb, r; - ac_fixed t; - t.conv_from_fraction(di, &qb, &r, &overflow); - if(is_neg() && overflow) - return false; - return (!is_neg() && overflow) || ((qb || r) && operator <= (t)) || operator < (t); - } - bool operator >= ( double d) const { - return !operator < ( d ); - } - bool operator > ( double d) const { - if(is_neg() != (d < 0.0)) - return !is_neg(); - double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); - bool overflow, qb, r; - ac_fixed t; - t.conv_from_fraction(di, &qb, &r, &overflow); - if(!is_neg() && overflow ) - return false; - return (is_neg() && overflow) || operator > (t); - } - bool operator <= ( double d) const { - return !operator > ( d ); - } - - // Bit and Slice Select ----------------------------------------------------- - template - inline const ac_int slc(const ac_int &index) const { - ac_int r; - AC_ASSERT(index.to_int() >= 0, "Attempting to read slc with negative indeces"); - unsigned uindex = ac_int(index).to_uint(); - Base::shift_r(uindex, r); - r.bit_adjust(); - return r; - } - - template - inline const ac_int slc(signed index) const { - ac_int r; - AC_ASSERT(index >= 0, "Attempting to read slc with negative indeces"); - unsigned uindex = index & ((unsigned)~0 >> 1); - Base::shift_r(uindex, r); - r.bit_adjust(); - return r; - } - template - inline const ac_int slc(unsigned uindex) const { - ac_int r; - Base::shift_r(uindex, r); - r.bit_adjust(); - return r; - } - - template - inline ac_fixed &set_slc(const ac_int lsb, const ac_int &slc) { - AC_ASSERT(lsb.to_int() + W2 <= W && lsb.to_int() >= 0, "Out of bounds set_slc"); - if(W == W2) - Base::operator =(slc); - else { - unsigned ulsb = ac_int(lsb).to_uint(); - Base::set_slc(ulsb, W2, (ac_int) slc); - } - bit_adjust(); // in case sign bit was assigned - return *this; - } - template - inline ac_fixed &set_slc(signed lsb, const ac_int &slc) { - AC_ASSERT(lsb + W2 <= W && lsb >= 0, "Out of bounds set_slc"); - if(W == W2) - Base::operator =(slc); - else { - unsigned ulsb = lsb & ((unsigned)~0 >> 1); - Base::set_slc(ulsb, W2, (ac_int) slc); - } - bit_adjust(); // in case sign bit was assigned - return *this; - } - template - inline ac_fixed &set_slc(unsigned ulsb, const ac_int &slc) { - AC_ASSERT(ulsb + W2 <= W, "Out of bounds set_slc"); - if(W == W2) - Base::operator =(slc); - else - Base::set_slc(ulsb, W2, (ac_int) slc); - bit_adjust(); // in case sign bit was assigned - return *this; - } - - template - inline ac::sliceref range() { - #if __cplusplus > 199711L - static_assert(Msb-Lsb+1 > 0, "Range length not positive: MSB < LSB"); - static_assert(Lsb >= 0, "LSB is negative"); - static_assert(Msb < W, "MSB >= W"); - #endif - return ac::sliceref(Base::v); - } - - class ac_bitref { -# if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) -# pragma builtin -# endif - ac_fixed &d_bv; - unsigned d_index; - public: - ac_bitref( ac_fixed *bv, unsigned index=0 ) : d_bv(*bv), d_index(index) {} - operator bool () const { return (d_index < W) ? 
(d_bv.v[d_index>>5]>>(d_index&31) & 1) : 0; } - - inline ac_bitref operator = ( int val ) { - // lsb of int (val&1) is written to bit - if(d_index < W) { - int *pval = &d_bv.v[d_index>>5]; - *pval ^= (*pval ^ ((unsigned) val << (d_index&31) )) & 1 << (d_index&31); - d_bv.bit_adjust(); // in case sign bit was assigned - } - return *this; - } - template - inline ac_bitref operator = ( const ac_int &val ) { - return operator =(val.to_int()); - } - inline ac_bitref operator = ( const ac_bitref &val ) { - return operator =((int) (bool) val); - } - }; - - ac_bitref operator [] ( unsigned int uindex) { - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - ac_bitref bvh( this, uindex ); - return bvh; - } - ac_bitref operator [] ( int index) { - AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); - unsigned uindex = index & ((unsigned)~0 >> 1); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - ac_bitref bvh( this, uindex ); - return bvh; - } - template - ac_bitref operator [] ( const ac_int &index) { - AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); - unsigned uindex = ac_int(index).to_uint(); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - ac_bitref bvh( this, uindex ); - return bvh; - } - - bool operator [] ( unsigned int uindex) const { - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; - } - bool operator [] ( int index) const { - AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); - unsigned uindex = index & ((unsigned)~0 >> 1); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; - } - template - bool operator [] ( const ac_int &index) const { - AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); - unsigned uindex = ac_int(index).to_uint(); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - return (uindex < W) ? 
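A small sketch of bit access through the ac_bitref proxy defined above; writes go through its operator=(int) and re-run bit_adjust() in case the sign bit changed (indices and values illustrative):

    #include "ac_fixed.h"

    void bitref_sketch() {
      ac_fixed<8,4,false> v = 0;  // bit 0 weighs 2^-4, bit 7 weighs 2^3
      v[7] = 1;                   // write via ac_bitref: v becomes 8.0
      bool msb = v[7];            // read back via the const operator[]
    }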
(Base::v[uindex>>5]>>(uindex&31) & 1) : 0; - } - typename rt_unary::leading_sign leading_sign() const { - unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; - return ls; - } - typename rt_unary::leading_sign leading_sign(bool &all_sign) const { - unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; - all_sign = (ls == W-S); - return ls; - } - // returns false if number is denormal - template - bool normalize(ac_int &exp) { - ac_int m = this->template slc(0); - bool r = m.normalize(exp); - this->set_slc(0,m); - return r; - } - // returns false if number is denormal, minimum exponent is reserved (usually for encoding special values/errors) - template - bool normalize_RME(ac_int &exp) { - ac_int m = this->template slc(0); - bool r = m.normalize_RME(exp); - this->set_slc(0,m); - return r; - } - inline void bit_fill_hex(const char *str) { - // Zero Pads if str is too short, throws ms bits away if str is too long - // Asserts if anything other than 0-9a-fA-F is encountered - ac_int x; - x.bit_fill_hex(str); - set_slc(0, x); - } - template - inline void bit_fill(const int (&ivec)[N], bool bigendian=true) { - // bit_fill from integer vector - // if W > N*32, missing most significant bits are zeroed - // if W < N*32, additional bits in ivec are ignored (no overflow checking) - // - // Example: - // ac_fixed<80,40,false> x; int vec[] = { 0xffffa987, 0x6543210f, 0xedcba987 }; - // x.bit_fill(vec); // vec[0] fill bits 79-64 - ac_int x; - x.bit_fill(ivec, bigendian); - set_slc(0, x); - } -}; - -namespace ac { - template - struct ac_fixed_represent { - enum { t_w = ac_private::c_type_params::W, t_i = t_w, t_s = ac_private::c_type_params::S }; - typedef ac_fixed type; - }; - template<> struct ac_fixed_represent {}; - template<> struct ac_fixed_represent {}; - template - struct ac_fixed_represent< ac_int > { - typedef ac_fixed type; - }; - template - struct ac_fixed_represent< ac_fixed > { - typedef ac_fixed type; - }; -} - -namespace ac_private { - // with T == ac_fixed - template - struct rt_ac_fixed_T< ac_fixed > { - typedef ac_fixed fx2_t; - template - struct op1 { - typedef ac_fixed fx_t; - typedef typename fx_t::template rt::mult mult; - typedef typename fx_t::template rt::plus plus; - typedef typename fx_t::template rt::minus minus; - typedef typename fx2_t::template rt::minus minus2; - typedef typename fx_t::template rt::logic logic; - typedef typename fx_t::template rt::div div; - typedef typename fx2_t::template rt::div div2; - }; - }; - // with T == ac_int - template - struct rt_ac_fixed_T< ac_int > { - typedef ac_fixed fx2_t; - template - struct op1 { - typedef ac_fixed fx_t; - typedef typename fx_t::template rt::mult mult; - typedef typename fx_t::template rt::plus plus; - typedef typename fx_t::template rt::minus minus; - typedef typename fx2_t::template rt::minus minus2; - typedef typename fx_t::template rt::logic logic; - typedef typename fx_t::template rt::div div; - typedef typename fx2_t::template rt::div div2; - }; - }; - - template - struct rt_ac_fixed_T< c_type > { - typedef typename ac::ac_fixed_represent::type fx2_t; - enum { W2 = fx2_t::width, I2 = W2, S2 = fx2_t::sign }; - template - struct op1 { - typedef ac_fixed fx_t; - typedef typename fx_t::template rt::mult mult; - typedef typename fx_t::template rt::plus plus; - typedef typename fx_t::template rt::minus minus; - typedef typename fx2_t::template rt::minus minus2; - typedef typename fx_t::template rt::logic logic; - typedef typename fx_t::template rt::div div; - typedef typename 
fx2_t::template rt<W,I,S>::div div2;
-    };
-  };
-}
-
-
-// Specializations for constructors on integers that bypass bit adjusting
-// and are therefore more efficient
-template<> inline ac_fixed<1,1,true,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b ? -1 : 0; }
-
-template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
-template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = b&1; }
-template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b&1; }
-template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed short b ) { v[0] = b&1; }
-template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned short b ) { v[0] = b&1; }
-template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed int b ) { v[0] = b&1; }
-template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned int b ) { v[0] = b&1; }
-template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed long b ) { v[0] = b&1; }
-template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned long b ) { v[0] = b&1; }
-template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b&1; }
-template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b&1; }
-
-template<> inline ac_fixed<8,8,true,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
-template<> inline ac_fixed<8,8,false,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
-template<> inline ac_fixed<8,8,true,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = b; }
-template<> inline ac_fixed<8,8,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b; }
-template<> inline ac_fixed<8,8,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = (signed char) b; }
-template<> inline ac_fixed<8,8,false,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = (unsigned char) b; }
-
-template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
-template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
-template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = b; }
-template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b; }
-template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b; }
-template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = (unsigned short) b; }
-template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( signed short b ) { v[0] = b; }
-template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned short b ) { v[0] = b; }
-template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned short b ) { v[0] = (signed short) b; }
-template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( signed short b ) { v[0] = (unsigned short) b; }
-
-template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( signed int b ) { v[0] = b; }
-template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned int b ) { v[0] = b; }
-template<> inline ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( signed int b ) { v[0] = b; v[1] = 0;}
-template<> inline ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned int b ) { v[0] = b; v[1] = 0;}
-
-template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; }
-template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; }
-template<> inline
ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; v[1] = 0;} -template<> inline ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; v[1] = 0;} - -template<> inline ac_fixed<64,64,true,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); } -template<> inline ac_fixed<64,64,true,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32);} -template<> inline ac_fixed<64,64,false,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; v[1] = (int) ((Ulong) b >> 32); v[2] = 0; } -template<> inline ac_fixed<64,64,false,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); v[2] = 0; } - - -// Stream -------------------------------------------------------------------- - -template -inline std::ostream& operator << (std::ostream &os, const ac_fixed &x) { -#ifndef __SYNTHESIS__ - if ((os.flags() & std::ios::hex) != 0) { - os << x.to_string(AC_HEX); - } else if ((os.flags() & std::ios::oct) != 0) { - os << x.to_string(AC_OCT); - } else { - os << x.to_string(AC_DEC); - } -#endif - return os; -} - - -// Macros for Binary Operators with C Integers -------------------------------------------- - -#define FX_BIN_OP_WITH_INT_2I(BIN_OP, C_TYPE, WI, SI) \ - template \ - inline ac_fixed operator BIN_OP ( const ac_fixed &op, C_TYPE i_op) { \ - return op.operator BIN_OP (ac_int(i_op)); \ - } - -#define FX_BIN_OP_WITH_INT(BIN_OP, C_TYPE, WI, SI, RTYPE) \ - template \ - inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( C_TYPE i_op, const ac_fixed &op) { \ - return ac_fixed(i_op).operator BIN_OP (op); \ - } \ - template \ - inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( const ac_fixed &op, C_TYPE i_op) { \ - return op.operator BIN_OP (ac_fixed(i_op)); \ - } - -#define FX_REL_OP_WITH_INT(REL_OP, C_TYPE, W2, S2) \ - template \ - inline bool operator REL_OP ( const ac_fixed &op, C_TYPE op2) { \ - return op.operator REL_OP (ac_fixed(op2)); \ - } \ - template \ - inline bool operator REL_OP ( C_TYPE op2, const ac_fixed &op) { \ - return ac_fixed(op2).operator REL_OP (op); \ - } - -#define FX_ASSIGN_OP_WITH_INT_2(ASSIGN_OP, C_TYPE, W2, S2) \ - template \ - inline ac_fixed &operator ASSIGN_OP ( ac_fixed &op, C_TYPE op2) { \ - return op.operator ASSIGN_OP (ac_fixed(op2)); \ - } - -#define FX_ASSIGN_OP_WITH_INT_2I(ASSIGN_OP, C_TYPE, W2, S2) \ - template \ - inline ac_fixed operator ASSIGN_OP ( ac_fixed &op, C_TYPE op2) { \ - return op.operator ASSIGN_OP (ac_int(op2)); \ - } - -#define FX_OPS_WITH_INT(C_TYPE, WI, SI) \ - FX_BIN_OP_WITH_INT(*, C_TYPE, WI, SI, mult) \ - FX_BIN_OP_WITH_INT(+, C_TYPE, WI, SI, plus) \ - FX_BIN_OP_WITH_INT(-, C_TYPE, WI, SI, minus) \ - FX_BIN_OP_WITH_INT(/, C_TYPE, WI, SI, div) \ - FX_BIN_OP_WITH_INT_2I(>>, C_TYPE, WI, SI) \ - FX_BIN_OP_WITH_INT_2I(<<, C_TYPE, WI, SI) \ - FX_BIN_OP_WITH_INT(&, C_TYPE, WI, SI, logic) \ - FX_BIN_OP_WITH_INT(|, C_TYPE, WI, SI, logic) \ - FX_BIN_OP_WITH_INT(^, C_TYPE, WI, SI, logic) \ - \ - FX_REL_OP_WITH_INT(==, C_TYPE, WI, SI) \ - FX_REL_OP_WITH_INT(!=, C_TYPE, WI, SI) \ - FX_REL_OP_WITH_INT(>, C_TYPE, WI, SI) \ - FX_REL_OP_WITH_INT(>=, C_TYPE, WI, SI) \ - FX_REL_OP_WITH_INT(<, C_TYPE, WI, SI) \ - FX_REL_OP_WITH_INT(<=, C_TYPE, WI, SI) \ - \ - FX_ASSIGN_OP_WITH_INT_2(+=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2(-=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2(*=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2(/=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2I(>>=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2I(<<=, C_TYPE, WI, 
SI) \ - FX_ASSIGN_OP_WITH_INT_2(&=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2(|=, C_TYPE, WI, SI) \ - FX_ASSIGN_OP_WITH_INT_2(^=, C_TYPE, WI, SI) - -// --------------------------------------- End of Macros for Binary Operators with C Integers - -#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS -namespace ac { - namespace ops_with_other_types { -#endif - // Binary Operators with C Integers -------------------------------------------- - FX_OPS_WITH_INT(bool, 1, false) - FX_OPS_WITH_INT(char, 8, true) - FX_OPS_WITH_INT(signed char, 8, true) - FX_OPS_WITH_INT(unsigned char, 8, false) - FX_OPS_WITH_INT(short, 16, true) - FX_OPS_WITH_INT(unsigned short, 16, false) - FX_OPS_WITH_INT(int, 32, true) - FX_OPS_WITH_INT(unsigned int, 32, false) - FX_OPS_WITH_INT(long, ac_private::long_w, true) - FX_OPS_WITH_INT(unsigned long, ac_private::long_w, false) - FX_OPS_WITH_INT(Slong, 64, true) - FX_OPS_WITH_INT(Ulong, 64, false) - // -------------------------------------- End of Binary Operators with Integers -#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS - } // ops_with_other_types namespace -} // ac namespace -#endif - - -// Macros for Binary Operators with ac_int -------------------------------------------- - -#define FX_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ - template \ - inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( const ac_int &i_op, const ac_fixed &op) { \ - return ac_fixed(i_op).operator BIN_OP (op); \ - } - -#define FX_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) \ - template \ - inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( const ac_fixed &op, const ac_int &i_op) { \ - return op.operator BIN_OP (ac_fixed(i_op)); \ - } - -#define FX_BIN_OP_WITH_AC_INT(BIN_OP, RTYPE) \ - FX_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ - FX_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) - -#define FX_REL_OP_WITH_AC_INT(REL_OP) \ - template \ - inline bool operator REL_OP ( const ac_fixed &op, const ac_int &op2) { \ - return op.operator REL_OP (ac_fixed(op2)); \ - } \ - template \ - inline bool operator REL_OP ( ac_int &op2, const ac_fixed &op) { \ - return ac_fixed(op2).operator REL_OP (op); \ - } - -#define FX_ASSIGN_OP_WITH_AC_INT(ASSIGN_OP) \ - template \ - inline ac_fixed &operator ASSIGN_OP ( ac_fixed &op, const ac_int &op2) { \ - return op.operator ASSIGN_OP (ac_fixed(op2)); \ - } \ - template \ - inline ac_int &operator ASSIGN_OP ( ac_int &op, const ac_fixed &op2) { \ - return op.operator ASSIGN_OP (op2.to_ac_int()); \ - } - -// -------------------------------------------- End of Macros for Binary Operators with ac_int - -#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS -namespace ac { - namespace ops_with_other_types { -#endif - // Binary Operators with ac_int -------------------------------------------- - FX_BIN_OP_WITH_AC_INT(*, mult) - FX_BIN_OP_WITH_AC_INT(+, plus) - FX_BIN_OP_WITH_AC_INT(-, minus) - FX_BIN_OP_WITH_AC_INT(/, div) - FX_BIN_OP_WITH_AC_INT(&, logic) - FX_BIN_OP_WITH_AC_INT(|, logic) - FX_BIN_OP_WITH_AC_INT(^, logic) - - FX_REL_OP_WITH_AC_INT(==) - FX_REL_OP_WITH_AC_INT(!=) - FX_REL_OP_WITH_AC_INT(>) - FX_REL_OP_WITH_AC_INT(>=) - FX_REL_OP_WITH_AC_INT(<) - FX_REL_OP_WITH_AC_INT(<=) - - FX_ASSIGN_OP_WITH_AC_INT(+=) - FX_ASSIGN_OP_WITH_AC_INT(-=) - FX_ASSIGN_OP_WITH_AC_INT(*=) - FX_ASSIGN_OP_WITH_AC_INT(/=) - FX_ASSIGN_OP_WITH_AC_INT(&=) - FX_ASSIGN_OP_WITH_AC_INT(|=) - FX_ASSIGN_OP_WITH_AC_INT(^=) - // -------------------------------------- End of Binary Operators with ac_int - - // Relational Operators with double -------------------------------------- - template - inline bool operator == ( double op, const 
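The FX_OPS_WITH_INT / FX_*_WITH_AC_INT instantiations above are what make mixed-operand expressions compile: the integer side is promoted to an ac_fixed of matching width and the member operator then runs, with result types following rt<> as before. A minimal sketch, assuming C++11 for auto (values illustrative):

    #include "ac_fixed.h"

    void mixed_ops_sketch() {
      ac_fixed<8,4,true> a = 1.5;
      auto sum = a + 2;        // int promoted via ac_fixed<32,32,true>, full-width plus type
      bool gt = (a > 1);       // relational overload from FX_REL_OP_WITH_INT
      ac_int<4,false> n = 3;
      auto prod = a * n;       // ac_int operand promoted by FX_BIN_OP_WITH_AC_INT
    }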
ac_fixed &op2) { - return op2.operator == (op); - } - template - inline bool operator != ( double op, const ac_fixed &op2) { - return op2.operator != (op); - } - template - inline bool operator > ( double op, const ac_fixed &op2) { - return op2.operator < (op); - } - template - inline bool operator < ( double op, const ac_fixed &op2) { - return op2.operator > (op); - } - template - inline bool operator <= ( double op, const ac_fixed &op2) { - return op2.operator >= (op); - } - template - inline bool operator >= ( double op, const ac_fixed &op2) { - return op2.operator <= (op); - } - // -------------------------------------- End of Relational Operators with double -#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS - } // ops_with_other_types namespace -} // ac namespace -using namespace ac::ops_with_other_types; -#endif - - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( disable: 4700 ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#endif -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wuninitialized" -#endif - -// Global templatized functions for easy initialization to special values -template -inline ac_fixed value(ac_fixed) { - ac_fixed r; - return r.template set_val(); -} - -namespace ac { -// PUBLIC FUNCTIONS -// function to initialize (or uninitialize) arrays - template - inline bool init_array(ac_fixed *a, int n) { - ac_fixed t; - t.template set_val(); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } - - inline ac_fixed<54,2,true> frexp_d(double d, ac_int<11,true> &exp) { - enum {Min_Exp = -1022, Max_Exp = 1023, Mant_W = 52, Denorm_Min_Exp = Min_Exp - Mant_W}; - if(!d) { - exp = 0; - return 0; - } - int exp_i; - double f0 = frexp(d, &exp_i); - AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard double-precision float exponent max (+1024). It is probably an extended double"); - AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard double-precision float exponent min (-1021). It is probably an extended double"); - exp_i--; - int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : (exp_i > Min_Exp && f0 < 0 && f0 >= -0.5) ? -1 : 0; - exp = exp_i + rshift; - ac_int f_i = f0 * ((Ulong) 1 << (Mant_W + 1 -rshift)); - ac_fixed r; - r.set_slc(0, f_i); - return r; - } - inline ac_fixed<25,2,true> frexp_f(float f, ac_int<8,true> &exp) { - enum {Min_Exp = -126, Max_Exp = 127, Mant_W = 23, Denorm_Min_Exp = Min_Exp - Mant_W}; - if(!f) { - exp = 0; - return 0; - } - int exp_i; - float f0 = frexpf(f, &exp_i); - AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard single-precision float exponent max (+128). It is probably an extended float"); - AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard single-precision float exponent min (-125). It is probably an extended float"); - exp_i--; - int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : (exp_i >= Min_Exp && f0 < 0 && f0 >= -0.5) ? -1 : 0; - exp = exp_i + rshift; - ac_int f_i = f0 * (1 << (Mant_W + 1 - rshift)); - ac_fixed r; - r.set_slc(0, f_i); - return r; - } - - inline ac_fixed<53,1,false> frexp_sm_d(double d, ac_int<11,true> &exp, bool &sign) { - enum {Min_Exp = -1022, Max_Exp = 1023, Mant_W = 52, Denorm_Min_Exp = Min_Exp - Mant_W}; - if(!d) { - exp = 0; - sign = false; - return 0; - } - int exp_i; - bool s = d < 0; - double f0 = frexp(s ? 
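The ac::frexp_d helper above (frexp_f and the frexp_sm_* sign-magnitude variants around it follow the same pattern) splits a double into a fixed-point mantissa m of type ac_fixed<54,2,true> and an exponent e such that d == m * 2^e, with the rshift logic absorbing denormals. A minimal usage sketch, values illustrative:

    #include "ac_fixed.h"

    void frexp_sketch() {
      ac_int<11,true> e;
      ac_fixed<54,2,true> m = ac::frexp_d(6.5, e);
      // 6.5 == m.to_double() * 2^e, i.e. m == 1.625 and e == 2
    }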
-d : d, &exp_i); - AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard double-precision float exponent max (+1024). It is probably an extended double"); - AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard double-precision float exponent min (-1021). It is probably an extended double"); - exp_i--; - int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : 0; - exp = exp_i + rshift; - ac_int f_i = f0 * ((Ulong) 1 << (Mant_W + 1 -rshift)); - ac_fixed r; - r.set_slc(0, f_i); - sign = s; - return r; - } - inline ac_fixed<24,1,false> frexp_sm_f(float f, ac_int<8,true> &exp, bool &sign) { - enum {Min_Exp = -126, Max_Exp = 127, Mant_W = 23, Denorm_Min_Exp = Min_Exp - Mant_W}; - if(!f) { - exp = 0; - sign = false; - return 0; - } - int exp_i; - bool s = f < 0; - float f0 = frexp(s ? -f : f, &exp_i); - AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard single-precision float exponent max (+128). It is probably an extended float"); - AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard single-precision float exponent min (-125). It is probably an extended float"); - exp_i--; - int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : 0; - exp = exp_i + rshift; - ac_int<24,false> f_i = f0 * (1 << (Mant_W + 1 - rshift)); - ac_fixed<24,1,false> r; - r.set_slc(0, f_i); - sign = s; - return r; - } - - template - const ac_fixed &basic_num_ovf_base::value() const { - return (const ac_fixed &) *this; - } - - template std::string basic_num_ovf_base::type_name() { - return ac_fixed::type_name(); - } -} - - -/////////////////////////////////////////////////////////////////////////////// - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( pop ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - -#ifdef __AC_NAMESPACE -} -#endif - -#endif // __AC_FIXED_H +/************************************************************************** + * * + * Algorithmic C (tm) Datatypes * + * * + * Software Version: 4.0 * + * * + * Release Date : Sat Jun 13 12:35:18 PDT 2020 * + * Release Type : Production Release * + * Release Build : 4.0.0 * + * * + * Copyright 2005-2020, Mentor Graphics Corporation, * + * * + * All Rights Reserved. * + * * + ************************************************************************** + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * + * implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + ************************************************************************** + * * + * The most recent version of this package is available at github. * + * * + *************************************************************************/ + +/* +// Source: ac_fixed.h +// Description: class for fixed point operation handling in C++ +// Author: Andres Takach, Ph.D. 
+*/ + +#ifndef __AC_FIXED_H +#define __AC_FIXED_H + +#include "ac_int.h" + +#if (defined(__GNUC__) && __GNUC__ < 3 && !defined(__EDG__)) +#error GCC version 3 or greater is required to include this header file +#endif + +#if (defined(_MSC_VER) && _MSC_VER < 1400 && !defined(__EDG__)) +#error Microsoft Visual Studio 8 or newer is required to include this header file +#endif + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( push ) +#pragma warning( disable: 4127 4308 4365 4514 4800 ) +#endif + +#ifndef __SYNTHESIS__ +#ifndef __AC_FIXED_UTILITY_BASE +#define __AC_FIXED_UTILITY_BASE +#endif + +#endif + +#ifdef __SYNTHESIS__ +#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE +#undef __AC_FIXED_NUMERICAL_ANALYSIS_BASE +#endif +#endif + +#ifdef __AC_NAMESPACE +namespace __AC_NAMESPACE { +#endif + +namespace ac_private { + template + struct rt_ac_fixed_T { + template + struct op1 { + typedef typename T::template rt_T< ac_fixed >::mult mult; + typedef typename T::template rt_T< ac_fixed >::plus plus; + typedef typename T::template rt_T< ac_fixed >::minus2 minus; + typedef typename T::template rt_T< ac_fixed >::minus minus2; + typedef typename T::template rt_T< ac_fixed >::logic logic; + typedef typename T::template rt_T< ac_fixed >::div2 div; + typedef typename T::template rt_T< ac_fixed >::div div2; + }; + }; + // specializations after definition of ac_fixed +} + +namespace ac { + template + class basic_num_ovf_base { + bool d_enable; + public: + basic_num_ovf_base() : d_enable(true) {} + void enable_ovf(bool a) { d_enable = a; } + bool is_enabled() const { return d_enable; } + template + void update(bool overflow, bool neg, const basic_num_ovf_base &op2) { +#ifndef __AC_OVERRIDE_OVF_UPDATE_BODY + if(d_enable) { + if(overflow) { + std::cerr << (neg ? "-" : "+") << "OVF: "; + std::cerr << type_name() << " ( " << basic_num_ovf_base::type_name(); + std::cerr << " ( " << op2.value().to_double() << " ) )" << std::endl; + } + } +#else + __AC_OVERRIDE_OVF_UPDATE_BODY +#endif + } + void update(bool overflow, bool neg, double op2) { +#ifndef __AC_OVERRIDE_OVF_UPDATE2_BODY + if(d_enable) { + if(overflow) { + std::cerr << (neg ? "-" : "+") << "OVF: "; + std::cerr << type_name() << " ( " << "double"; + std::cerr << " ( " << op2 << " ) )" << std::endl; + } + } +#else + __AC_OVERRIDE_OVF_UPDATE2_BODY +#endif + } + const ac_fixed &value() const; + static std::string type_name(); + }; +} + +////////////////////////////////////////////////////////////////////////////// +// ac_fixed +////////////////////////////////////////////////////////////////////////////// + +//enum ac_q_mode { AC_TRN, AC_RND, AC_TRN_ZERO, AC_RND_ZERO, AC_RND_INF, AC_RND_MIN_INF, AC_RND_CONV, AC_RND_CONV_ODD }; +//enum ac_o_mode { AC_WRAP, AC_SAT, AC_SAT_ZERO, AC_SAT_SYM }; + +template +class ac_fixed : private ac_private::iv<(W+31+!S)/32> +#ifndef __SYNTHESIS__ +__AC_FIXED_UTILITY_BASE +#endif +#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE +, public __AC_FIXED_NUMERICAL_ANALYSIS_BASE +#endif +{ +#if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) +#pragma builtin +#endif + + enum {N=(W+31+!S)/32}; + + template + struct rt_priv { + enum {w_shiftl = AC_MAX(W+W2,1) }; + typedef ac_fixed shiftl; + }; + + typedef ac_private::iv Base; + + inline void bit_adjust() { + const unsigned rem = (32-W)&31; + Base::v[N-1] = S ? ((signed) ((unsigned)Base::v[N-1] << rem) >> rem) : (rem ? 
+ ((unsigned) Base::v[N-1] << rem) >> rem : 0); + } + inline Base &base() { return *this; } + inline const Base &base() const { return *this; } + + inline void overflow_adjust(bool overflow, bool neg) { + if(O==AC_WRAP) { + bit_adjust(); + return; + } + else if(O==AC_SAT_ZERO) { + if(overflow) + ac_private::iv_extend(Base::v, 0); + else + bit_adjust(); + } + else if(S) { + if(overflow) { + if(!neg) { + ac_private::iv_extend(Base::v, ~0); + Base::v[N-1] = ~((unsigned)~0 << ((W-1)&31)); + } else { + ac_private::iv_extend(Base::v, 0); + Base::v[N-1] = ((unsigned)~0 << ((W-1)&31)); + if(O==AC_SAT_SYM) + Base::v[0] |= 1; + } + } else + bit_adjust(); + } + else { + if(overflow) { + if(!neg) { + ac_private::iv_extend(Base::v, ~0); + Base::v[N-1] = ~((unsigned)~0 << (W&31)); + } else + ac_private::iv_extend(Base::v, 0); + } else + bit_adjust(); + } + } + + inline bool quantization_adjust(bool qb, bool r, bool s) { + if(Q==AC_TRN) + return false; + if(Q==AC_RND_ZERO) + qb &= s || r; + else if(Q==AC_RND_MIN_INF) + qb &= r; + else if(Q==AC_RND_INF) + qb &= !s || r; + else if(Q==AC_RND_CONV) + qb &= (Base::v[0] & 1) || r; + else if(Q==AC_RND_CONV_ODD) + qb &= (!(Base::v[0] & 1)) || r; + else if(Q==AC_TRN_ZERO) + qb = s && ( qb || r ); + return ac_private::iv_uadd_carry(Base::v, qb, Base::v); + } + + inline bool is_neg() const { return S && Base::v[N-1] < 0; } + +public: + static const int width = W; + static const int i_width = I; + static const bool sign = S; + static const ac_o_mode o_mode = O; + static const ac_q_mode q_mode = Q; + static const int e_width = 0; +#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE + static const bool compute_overflow_for_wrap = true; +#else + static const bool compute_overflow_for_wrap = false; +#endif + + template + struct rt { + enum { + F=W-I, + F2=W2-I2, + mult_w = W+W2, + mult_i = I+I2, + mult_s = S||S2, + plus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), + plus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, + plus_s = S||S2, + minus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), + minus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, + minus_s = true, + div_w = W+AC_MAX(W2-I2,0)+S2, + div_i = I+(W2-I2)+S2, + div_s = S||S2, + logic_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+AC_MAX(F,F2), + logic_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2)), + logic_s = S||S2 + }; + typedef ac_fixed mult; + typedef ac_fixed plus; + typedef ac_fixed minus; + typedef ac_fixed logic; + typedef ac_fixed div; + typedef ac_fixed arg1; + }; + + template + struct rt_T { + typedef typename ac_private::map::t map_T; + typedef typename ac_private::rt_ac_fixed_T::template op1::mult mult; + typedef typename ac_private::rt_ac_fixed_T::template op1::plus plus; + typedef typename ac_private::rt_ac_fixed_T::template op1::minus minus; + typedef typename ac_private::rt_ac_fixed_T::template op1::minus2 minus2; + typedef typename ac_private::rt_ac_fixed_T::template op1::logic logic; + typedef typename ac_private::rt_ac_fixed_T::template op1::div div; + typedef typename ac_private::rt_ac_fixed_T::template op1::div2 div2; + typedef ac_fixed arg1; + }; + + struct rt_unary { + enum { + neg_w = W+1, + neg_i = I+1, + neg_s = true, + mag_sqr_w = 2*W-S, + mag_sqr_i = 2*I-S, + mag_sqr_s = false, + mag_w = W+S, + mag_i = I+S, + mag_s = false, + leading_sign_w = ac::log2_ceil::val, + leading_sign_s = false + }; + typedef ac_int leading_sign; + typedef ac_fixed neg; + typedef ac_fixed mag_sqr; + typedef ac_fixed mag; + template + struct set { + enum { sum_w = W + ac::log2_ceil::val, sum_i = (sum_w-W) + I, sum_s = S}; + typedef ac_fixed sum; + }; + 
}; + + ac_fixed(const ac_fixed &op): Base(op) { } + + template friend class ac_fixed; + ac_fixed() { +#if !defined(__SYNTHESIS__) && defined(AC_DEFAULT_IN_RANGE) + bit_adjust(); + if( O==AC_SAT_SYM && S && Base::v[N-1] < 0 && (W > 1 ? ac_private::iv_equal_zeros_to(Base::v) : true) ) + Base::v[0] |= 1; +#endif + } + template + inline ac_fixed (const ac_fixed &op) { + enum {N2=(W2+31+!S2)/32, F=W-I, F2=W2-I2, QUAN_INC = F2>F && !(Q==AC_TRN || (Q==AC_TRN_ZERO && !S2)) }; + bool carry = false; + // handle quantization + if(F2 == F) + Base::operator =(op); + else if(F2 > F) { + op.template const_shift_r(*this); +// ac_private::iv_const_shift_r(op.v, Base::v); + if(Q!=AC_TRN && !(Q==AC_TRN_ZERO && !S2)) { + bool qb = (F2-F > W2) ? (op.v[N2-1] < 0) : (bool) op[F2-F-1]; + bool r = (F2 > F+1) ? !ac_private::iv_equal_zeros_to(op.v) : false; + carry = quantization_adjust(qb, r, S2 && op.v[N2-1] < 0); + } + } + else // no quantization + op.template const_shift_l(*this); +// ac_private::iv_const_shift_l(op.v, Base::v); + // handle overflow + if((O!=AC_WRAP || compute_overflow_for_wrap) + && ((!S && S2) || I-S < I2-S2+(QUAN_INC || (S2 && O==AC_SAT_SYM && (O2 != AC_SAT_SYM || F2 > F) ))) + ) { // saturation + bool deleted_bits_zero = !(W&31)&S || !(Base::v[N-1] >> (W&31)); + bool deleted_bits_one = !(W&31)&S || !~(Base::v[N-1] >> (W&31)); + bool neg_src; + if(F2-F+32*N < W2) { + bool all_ones = ac_private::iv_equal_ones_from(op.v); + deleted_bits_zero = deleted_bits_zero && (carry ? all_ones : ac_private::iv_equal_zeros_from(op.v)); + deleted_bits_one = deleted_bits_one && (carry ? ac_private::iv_equal_ones_from<1+F2-F+32*N,N2>(op.v) && !op[F2-F+32*N] : all_ones); + neg_src = S2 && op.v[N2-1] < 0 && !(carry & all_ones); + } + else + neg_src = S2 && op.v[N2-1] < 0 && Base::v[N-1] < 0; + bool neg_trg = S && (bool) this->operator[](W-1); + bool overflow = !neg_src && (neg_trg || !deleted_bits_zero); + overflow |= neg_src && (!neg_trg || !deleted_bits_one); + if(O==AC_SAT_SYM && S && S2) + overflow |= neg_src && (W > 1 ? 
ac_private::iv_equal_zeros_to(Base::v) : true); + overflow_adjust(overflow, neg_src); +#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE + __AC_FIXED_NUMERICAL_ANALYSIS_BASE::update(overflow,neg_src,op); +#endif + } + else + bit_adjust(); + } + + template + inline ac_fixed (const ac_int &op) { + ac_fixed f_op; + f_op.base().operator =(op); + *this = f_op; + } + + template + typename rt_priv::shiftl shiftl() const { + typedef typename rt_priv::shiftl shiftl_t; + shiftl_t r; + Base::template const_shift_l(r); + return r; + } + + inline ac_fixed( bool b ) { *this = (ac_int<1,false>) b; } + inline ac_fixed( char b ) { *this = (ac_int<8,true>) b; } + inline ac_fixed( signed char b ) { *this = (ac_int<8,true>) b; } + inline ac_fixed( unsigned char b ) { *this = (ac_int<8,false>) b; } + inline ac_fixed( signed short b ) { *this = (ac_int<16,true>) b; } + inline ac_fixed( unsigned short b ) { *this = (ac_int<16,false>) b; } + inline ac_fixed( signed int b ) { *this = (ac_int<32,true>) b; } + inline ac_fixed( unsigned int b ) { *this = (ac_int<32,false>) b; } + inline ac_fixed( signed long b ) { *this = (ac_int) b; } + inline ac_fixed( unsigned long b ) { *this = (ac_int) b; } + inline ac_fixed( Slong b ) { *this = (ac_int<64,true>) b; } + inline ac_fixed( Ulong b ) { *this = (ac_int<64,false>) b; } + + inline ac_fixed( double d ) { + double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); + bool o, qb, r; + bool neg_src = d < 0; + Base::conv_from_fraction(di, &qb, &r, &o); + quantization_adjust(qb, r, neg_src); + // a neg number may become non neg (0) after quantization + neg_src &= o || Base::v[N-1] < 0; + + if(O!=AC_WRAP || compute_overflow_for_wrap) { // saturation + bool overflow; + bool neg_trg = S && (bool) this->operator[](W-1); + if(o) { + overflow = true; + } else { + bool deleted_bits_zero = !(W&31)&S || !(Base::v[N-1] >> (W&31)); + bool deleted_bits_one = !(W&31)&S || !~(Base::v[N-1] >> (W&31)); + overflow = !neg_src && (neg_trg || !deleted_bits_zero); + overflow |= neg_src && (!neg_trg || !deleted_bits_one); + } + if(O==AC_SAT_SYM && S) + overflow |= neg_src && (W > 1 ? 
ac_private::iv_equal_zeros_to(Base::v) : true); + overflow_adjust(overflow, neg_src); +#ifdef __AC_FIXED_NUMERICAL_ANALYSIS_BASE + __AC_FIXED_NUMERICAL_ANALYSIS_BASE::update(overflow,neg_src,d); +#endif + } else + bit_adjust(); + } + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( push ) +#pragma warning( disable: 4700 ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wuninitialized" +#endif + template + inline ac_fixed &set_val() { + if(V == AC_VAL_DC) { + ac_fixed r; + Base::operator =(r); + bit_adjust(); + } + else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + Base::operator =(0); + if(S && V == AC_VAL_MIN) { + const unsigned rem = (W-1)&31; + Base::v[N-1] = ((unsigned)~0 << rem); + if(O == AC_SAT_SYM) { + if(W == 1) + Base::v[0] = 0; + else + Base::v[0] |= 1; + } + } else if(V == AC_VAL_QUANTUM) + Base::v[0] = 1; + } + else { // AC_VAL_MAX + Base::operator =(-1); + const unsigned int rem = (32-W - (unsigned) !S )&31; + Base::v[N-1] = ((unsigned) (-1) >> 1) >> rem; + } + return *this; + } +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( pop ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + + // Explicit conversion functions to ac_int that captures all integer bits (bits are truncated) + inline ac_int to_ac_int() const { return ((ac_fixed) *this).template slc(0); } + + // Explicit conversion functions to C built-in types ------------- + inline int to_int() const { return ((I-W) >= 32) ? 0 : (signed int) to_ac_int(); } + inline unsigned to_uint() const { return ((I-W) >= 32) ? 0 : (unsigned int) to_ac_int(); } + inline long to_long() const { return ((I-W) >= ac_private::long_w) ? 0 : (signed long) to_ac_int(); } + inline unsigned long to_ulong() const { return ((I-W) >= ac_private::long_w) ? 0 : (unsigned long) to_ac_int(); } + inline Slong to_int64() const { return ((I-W) >= 64) ? 0 : (Slong) to_ac_int(); } + inline Ulong to_uint64() const { return ((I-W) >= 64) ? 0 : (Ulong) to_ac_int(); } + inline double to_double() const { return ac_private::ldexpr(Base::to_double()); } + + inline int length() const { return W; } + + inline std::string to_string(ac_base_mode base_rep, bool sign_mag = false) const { + // base_rep == AC_DEC => sign_mag == don't care (always print decimal in sign magnitude) + char r[(W-AC_MIN(AC_MIN(W-I,I),0)+31)/32*32+5] = {0}; + int i = 0; + if(sign_mag) + r[i++] = is_neg() ? '-' : '+'; + else if (base_rep == AC_DEC && is_neg()) + r[i++] = '-'; + if(base_rep != AC_DEC) { + r[i++] = '0'; + r[i++] = base_rep == AC_BIN ? 'b' : (base_rep == AC_OCT ? 
'o' : 'x'); + } + ac_fixed t; + if( (base_rep == AC_DEC || sign_mag) && is_neg() ) + t = operator -(); + else + t = *this; + ac_fixed i_part = t; + ac_fixed f_part = t; + i += ac_private::to_string(i_part.v, AC_MAX(I+1,1), sign_mag, base_rep, false, r+i); + if(W-I > 0) { + r[i++] = '.'; + if(!ac_private::to_string(f_part.v, W-I, false, base_rep, true, r+i)) + r[--i] = 0; + } + if(!i) { + r[0] = '0'; + r[1] = 0; + } + return std::string(r); + } + inline static std::string type_name() { + const char *tf[] = {"false", "true" }; + const char *q[] = {"AC_TRN", "AC_RND", "AC_TRN_ZERO", "AC_RND_ZERO", "AC_RND_INF", "AC_RND_MIN_INF", "AC_RND_CONV", "AC_RND_CONV_ODD" }; + const char *o[] = {"AC_WRAP", "AC_SAT", "AC_SAT_ZERO", "AC_SAT_SYM" }; + std::string r = "ac_fixed<"; + r += ac_int<32,true>(W).to_string(AC_DEC) + ','; + r += ac_int<32,true>(I).to_string(AC_DEC) + ','; + r += tf[S]; + r += ','; + r += q[Q]; + r += ','; + r += o[O]; + r += '>'; + return r; + } + + // Arithmetic : Binary ---------------------------------------------------- + template + typename rt::mult operator *( const ac_fixed &op2) const { + typename rt::mult r; + Base::mult(op2, r); + return r; + } + template + typename rt::plus operator +( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + typename rt::plus r; + if(F == F2) + Base::add(op2, r); + else if(F > F2) + Base::add(op2.template shiftl(), r); + else + shiftl().add(op2, r); + return r; + } + template + typename rt::minus operator -( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + typename rt::minus r; + if(F == F2) + Base::sub(op2, r); + else if(F > F2) + Base::sub(op2.template shiftl(), r); + else + shiftl().sub(op2, r); + return r; + } +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wenum-compare" +#endif + template + typename rt::div operator /( const ac_fixed &op2) const { + typename rt::div r; + enum { Num_w = W+AC_MAX(W2-I2,0), Num_i = I, Num_w_minus = Num_w+S, Num_i_minus = Num_i+S, + N1 = ac_fixed::N, N1minus = ac_fixed::N, + N2 = ac_fixed::N, N2minus = ac_fixed::N, + num_s = S + (N1minus > N1), den_s = S2 + (N2minus > N2), Nr = rt::div::N }; + ac_fixed t = *this; + t.template div(op2, r); + return r; + } +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif + // Arithmetic assign ------------------------------------------------------ + template + ac_fixed &operator *=( const ac_fixed &op2) { + *this = this->operator *(op2); + return *this; + } + template + ac_fixed &operator +=( const ac_fixed &op2) { + *this = this->operator +(op2); + return *this; + } + template + ac_fixed &operator -=( const ac_fixed &op2) { + *this = this->operator -(op2); + return *this; + } + template + ac_fixed &operator /=( const ac_fixed &op2) { + *this = this->operator /(op2); + return *this; + } + // increment/decrement by quantum (smallest difference that can be represented) + // Arithmetic prefix increment, decrement --------------------------------- + ac_fixed &operator ++() { + ac_fixed<1,I-W+1,false> q; + q.template set_val(); + operator += (q); + return *this; + } + ac_fixed &operator --() { + ac_fixed<1,I-W+1,false> q; + q.template set_val(); + operator -= (q); + return *this; + } + // Arithmetic postfix increment, decrement --------------------------------- + const ac_fixed operator ++(int) { + ac_fixed t = *this; + ac_fixed<1,I-W+1,false> q; + 
q.template set_val(); + operator += (q); + return t; + } + const ac_fixed operator --(int) { + ac_fixed t = *this; + ac_fixed<1,I-W+1,false> q; + q.template set_val(); + operator -= (q); + return t; + } + // Arithmetic Unary -------------------------------------------------------- + ac_fixed operator +() { + return *this; + } + typename rt_unary::neg operator -() const { + typename rt_unary::neg r; + Base::neg(r); + r.bit_adjust(); + return r; + } + // ! ------------------------------------------------------------------------ + bool operator ! () const { + return Base::equal_zero(); + } + + // Bitwise (arithmetic) unary: complement ----------------------------- + ac_fixed operator ~() const { + ac_fixed r; + Base::bitwise_complement(r); + return r; + } + // Bitwise (not arithmetic) bit complement ----------------------------- + ac_fixed bit_complement() const { + ac_fixed r; + Base::bitwise_complement(r); + r.bit_adjust(); + return r; + } + // Bitwise (not arithmetic): and, or, xor ---------------------------------- + template + typename rt::logic operator &( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + typename rt::logic r; + if(F == F2) + Base::bitwise_and(op2, r); + else if(F > F2) + Base::bitwise_and(op2.template shiftl(), r); + else + shiftl().bitwise_and(op2, r); + return r; + } + template + typename rt::logic operator |( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + typename rt::logic r; + if(F == F2) + Base::bitwise_or(op2, r); + else if(F > F2) + Base::bitwise_or(op2.template shiftl(), r); + else + shiftl().bitwise_or(op2, r); + return r; + } + template + typename rt::logic operator ^( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + typename rt::logic r; + if(F == F2) + Base::bitwise_xor(op2, r); + else if(F > F2) + Base::bitwise_xor(op2.template shiftl(), r); + else + shiftl().bitwise_xor(op2, r); + return r; + } + // Bitwise assign (not arithmetic): and, or, xor ---------------------------- + template + ac_fixed &operator &= ( const ac_fixed &op2 ) { + *this = this->operator &(op2); + return *this; + } + template + ac_fixed &operator |= ( const ac_fixed &op2 ) { + *this = this->operator |(op2); + return *this; + } + template + ac_fixed &operator ^= ( const ac_fixed &op2 ) { + *this = this->operator ^(op2); + return *this; + } + // Shift (result constrained by left operand) ------------------------------- + template + ac_fixed operator << ( const ac_int &op2 ) const { + // currently not written to overflow or quantize (neg shift) + ac_fixed r; + Base::shift_l2(op2.to_int(), r); + r.bit_adjust(); + return r; + } + template + ac_fixed operator << ( const ac_int &op2 ) const { + // currently not written to overflow + ac_fixed r; + Base::shift_l(op2.to_uint(), r); + r.bit_adjust(); + return r; + } + template + ac_fixed operator >> ( const ac_int &op2 ) const { + // currently not written to quantize or overflow (neg shift) + ac_fixed r; + Base::shift_r2(op2.to_int(), r); + r.bit_adjust(); + return r; + } + template + ac_fixed operator >> ( const ac_int &op2 ) const { + // currently not written to quantize + ac_fixed r; + Base::shift_r(op2.to_uint(), r); + r.bit_adjust(); + return r; + } + // Shift assign ------------------------------------------------------------ + template + ac_fixed operator <<= ( const ac_int &op2 ) { + // currently not written to overflow or quantize (neg shift) + Base r; + Base::shift_l2(op2.to_int(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_fixed operator <<= ( const ac_int &op2 ) { + // 
currently not written to overflow + Base r; + Base::shift_l(op2.to_uint(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_fixed operator >>= ( const ac_int &op2 ) { + // currently not written to quantize or overflow (neg shift) + Base r; + Base::shift_r2(op2.to_int(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_fixed operator >>= ( const ac_int &op2 ) { + // currently not written to quantize + Base r; + Base::shift_r(op2.to_uint(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + // Relational --------------------------------------------------------------- + template + bool operator == ( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + if(F == F2) + return Base::equal(op2); + else if(F > F2) + return Base::equal(op2.template shiftl()); + else + return shiftl().equal(op2); + } + template + bool operator != ( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + if(F == F2) + return ! Base::equal(op2); + else if(F > F2) + return ! Base::equal(op2.template shiftl()); + else + return ! shiftl().equal(op2); + } + template + bool operator < ( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + if(F == F2) + return Base::less_than(op2); + else if(F > F2) + return Base::less_than(op2.template shiftl()); + else + return shiftl().less_than(op2); + } + template + bool operator >= ( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + if(F == F2) + return ! Base::less_than(op2); + else if(F > F2) + return ! Base::less_than(op2.template shiftl()); + else + return ! shiftl().less_than(op2); + } + template + bool operator > ( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + if(F == F2) + return Base::greater_than(op2); + else if(F > F2) + return Base::greater_than(op2.template shiftl()); + else + return shiftl().greater_than(op2); + } + template + bool operator <= ( const ac_fixed &op2) const { + enum { F=W-I, F2=W2-I2 }; + if(F == F2) + return ! Base::greater_than(op2); + else if(F > F2) + return ! Base::greater_than(op2.template shiftl()); + else + return ! 
shiftl().greater_than(op2); + } + bool operator == ( double d) const { + if(is_neg() != (d < 0.0)) + return false; + double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); + bool overflow, qb, r; + ac_fixed t; + t.conv_from_fraction(di, &qb, &r, &overflow); + if(qb || r || overflow) + return false; + return operator == (t); + } + bool operator != ( double d) const { + return !operator == ( d ); + } + bool operator < ( double d) const { + if(is_neg() != (d < 0.0)) + return is_neg(); + double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); + bool overflow, qb, r; + ac_fixed t; + t.conv_from_fraction(di, &qb, &r, &overflow); + if(is_neg() && overflow) + return false; + return (!is_neg() && overflow) || ((qb || r) && operator <= (t)) || operator < (t); + } + bool operator >= ( double d) const { + return !operator < ( d ); + } + bool operator > ( double d) const { + if(is_neg() != (d < 0.0)) + return !is_neg(); + double di = ac_private::ldexpr<-(I+!S+((32-W-!S)&31))>(d); + bool overflow, qb, r; + ac_fixed t; + t.conv_from_fraction(di, &qb, &r, &overflow); + if(!is_neg() && overflow ) + return false; + return (is_neg() && overflow) || operator > (t); + } + bool operator <= ( double d) const { + return !operator > ( d ); + } + + // Bit and Slice Select ----------------------------------------------------- + template + inline const ac_int slc(const ac_int &index) const { + ac_int r; + AC_ASSERT(index.to_int() >= 0, "Attempting to read slc with negative indeces"); + unsigned uindex = ac_int(index).to_uint(); + Base::shift_r(uindex, r); + r.bit_adjust(); + return r; + } + + template + inline const ac_int slc(signed index) const { + ac_int r; + AC_ASSERT(index >= 0, "Attempting to read slc with negative indeces"); + unsigned uindex = index & ((unsigned)~0 >> 1); + Base::shift_r(uindex, r); + r.bit_adjust(); + return r; + } + template + inline const ac_int slc(unsigned uindex) const { + ac_int r; + Base::shift_r(uindex, r); + r.bit_adjust(); + return r; + } + + template + inline ac_fixed &set_slc(const ac_int lsb, const ac_int &slc) { + AC_ASSERT(lsb.to_int() + W2 <= W && lsb.to_int() >= 0, "Out of bounds set_slc"); + if(W == W2) + Base::operator =(slc); + else { + unsigned ulsb = ac_int(lsb).to_uint(); + Base::set_slc(ulsb, W2, (ac_int) slc); + } + bit_adjust(); // in case sign bit was assigned + return *this; + } + template + inline ac_fixed &set_slc(signed lsb, const ac_int &slc) { + AC_ASSERT(lsb + W2 <= W && lsb >= 0, "Out of bounds set_slc"); + if(W == W2) + Base::operator =(slc); + else { + unsigned ulsb = lsb & ((unsigned)~0 >> 1); + Base::set_slc(ulsb, W2, (ac_int) slc); + } + bit_adjust(); // in case sign bit was assigned + return *this; + } + template + inline ac_fixed &set_slc(unsigned ulsb, const ac_int &slc) { + AC_ASSERT(ulsb + W2 <= W, "Out of bounds set_slc"); + if(W == W2) + Base::operator =(slc); + else + Base::set_slc(ulsb, W2, (ac_int) slc); + bit_adjust(); // in case sign bit was assigned + return *this; + } + + template + inline ac::sliceref range() { + #if __cplusplus > 199711L + static_assert(Msb-Lsb+1 > 0, "Range length not positive: MSB < LSB"); + static_assert(Lsb >= 0, "LSB is negative"); + static_assert(Msb < W, "MSB >= W"); + #endif + return ac::sliceref(Base::v); + } + + class ac_bitref { +# if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) +# pragma builtin +# endif + ac_fixed &d_bv; + unsigned d_index; + public: + ac_bitref( ac_fixed *bv, unsigned index=0 ) : d_bv(*bv), d_index(index) {} + operator bool () const { return (d_index < W) ? 
(d_bv.v[d_index>>5]>>(d_index&31) & 1) : 0; } + + inline ac_bitref operator = ( int val ) { + // lsb of int (val&1) is written to bit + if(d_index < W) { + int *pval = &d_bv.v[d_index>>5]; + *pval ^= (*pval ^ ((unsigned) val << (d_index&31) )) & 1 << (d_index&31); + d_bv.bit_adjust(); // in case sign bit was assigned + } + return *this; + } + template + inline ac_bitref operator = ( const ac_int &val ) { + return operator =(val.to_int()); + } + inline ac_bitref operator = ( const ac_bitref &val ) { + return operator =((int) (bool) val); + } + }; + + ac_bitref operator [] ( unsigned int uindex) { + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + ac_bitref bvh( this, uindex ); + return bvh; + } + ac_bitref operator [] ( int index) { + AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); + unsigned uindex = index & ((unsigned)~0 >> 1); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + ac_bitref bvh( this, uindex ); + return bvh; + } + template + ac_bitref operator [] ( const ac_int &index) { + AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); + unsigned uindex = ac_int(index).to_uint(); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + ac_bitref bvh( this, uindex ); + return bvh; + } + + bool operator [] ( unsigned int uindex) const { + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; + } + bool operator [] ( int index) const { + AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); + unsigned uindex = index & ((unsigned)~0 >> 1); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; + } + template + bool operator [] ( const ac_int &index) const { + AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); + unsigned uindex = ac_int(index).to_uint(); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + return (uindex < W) ? 
(Base::v[uindex>>5]>>(uindex&31) & 1) : 0; + } + typename rt_unary::leading_sign leading_sign() const { + unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; + return ls; + } + typename rt_unary::leading_sign leading_sign(bool &all_sign) const { + unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; + all_sign = (ls == W-S); + return ls; + } + // returns false if number is denormal + template + bool normalize(ac_int &exp) { + ac_int m = this->template slc(0); + bool r = m.normalize(exp); + this->set_slc(0,m); + return r; + } + // returns false if number is denormal, minimum exponent is reserved (usually for encoding special values/errors) + template + bool normalize_RME(ac_int &exp) { + ac_int m = this->template slc(0); + bool r = m.normalize_RME(exp); + this->set_slc(0,m); + return r; + } + inline void bit_fill_hex(const char *str) { + // Zero Pads if str is too short, throws ms bits away if str is too long + // Asserts if anything other than 0-9a-fA-F is encountered + ac_int x; + x.bit_fill_hex(str); + set_slc(0, x); + } + template + inline void bit_fill(const int (&ivec)[N], bool bigendian=true) { + // bit_fill from integer vector + // if W > N*32, missing most significant bits are zeroed + // if W < N*32, additional bits in ivec are ignored (no overflow checking) + // + // Example: + // ac_fixed<80,40,false> x; int vec[] = { 0xffffa987, 0x6543210f, 0xedcba987 }; + // x.bit_fill(vec); // vec[0] fill bits 79-64 + ac_int x; + x.bit_fill(ivec, bigendian); + set_slc(0, x); + } +}; + +namespace ac { + template + struct ac_fixed_represent { + enum { t_w = ac_private::c_type_params::W, t_i = t_w, t_s = ac_private::c_type_params::S }; + typedef ac_fixed type; + }; + template<> struct ac_fixed_represent {}; + template<> struct ac_fixed_represent {}; + template + struct ac_fixed_represent< ac_int > { + typedef ac_fixed type; + }; + template + struct ac_fixed_represent< ac_fixed > { + typedef ac_fixed type; + }; +} + +namespace ac_private { + // with T == ac_fixed + template + struct rt_ac_fixed_T< ac_fixed > { + typedef ac_fixed fx2_t; + template + struct op1 { + typedef ac_fixed fx_t; + typedef typename fx_t::template rt::mult mult; + typedef typename fx_t::template rt::plus plus; + typedef typename fx_t::template rt::minus minus; + typedef typename fx2_t::template rt::minus minus2; + typedef typename fx_t::template rt::logic logic; + typedef typename fx_t::template rt::div div; + typedef typename fx2_t::template rt::div div2; + }; + }; + // with T == ac_int + template + struct rt_ac_fixed_T< ac_int > { + typedef ac_fixed fx2_t; + template + struct op1 { + typedef ac_fixed fx_t; + typedef typename fx_t::template rt::mult mult; + typedef typename fx_t::template rt::plus plus; + typedef typename fx_t::template rt::minus minus; + typedef typename fx2_t::template rt::minus minus2; + typedef typename fx_t::template rt::logic logic; + typedef typename fx_t::template rt::div div; + typedef typename fx2_t::template rt::div div2; + }; + }; + + template + struct rt_ac_fixed_T< c_type > { + typedef typename ac::ac_fixed_represent::type fx2_t; + enum { W2 = fx2_t::width, I2 = W2, S2 = fx2_t::sign }; + template + struct op1 { + typedef ac_fixed fx_t; + typedef typename fx_t::template rt::mult mult; + typedef typename fx_t::template rt::plus plus; + typedef typename fx_t::template rt::minus minus; + typedef typename fx2_t::template rt::minus minus2; + typedef typename fx_t::template rt::logic logic; + typedef typename fx_t::template rt::div div; + typedef typename 
fx2_t::template rt<W,I,S>::div div2;
+ };
+ };
+}
+
+
+// Specializations for constructors on integers that bypass bit adjusting
+// and are therefore more efficient
+template<> inline ac_fixed<1,1,true,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b ? -1 : 0; }
+
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed short b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned short b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed int b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned int b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( signed long b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned long b ) { v[0] = b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b&1; }
+template<> inline ac_fixed<1,1,false,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b&1; }
+
+template<> inline ac_fixed<8,8,true,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
+template<> inline ac_fixed<8,8,false,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
+template<> inline ac_fixed<8,8,true,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = b; }
+template<> inline ac_fixed<8,8,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b; }
+template<> inline ac_fixed<8,8,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = (signed char) b; }
+template<> inline ac_fixed<8,8,false,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = (unsigned char) b; }
+
+template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( bool b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned char b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( signed char b ) { v[0] = (unsigned short) b; }
+template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( signed short b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned short b ) { v[0] = b; }
+template<> inline ac_fixed<16,16,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned short b ) { v[0] = (signed short) b; }
+template<> inline ac_fixed<16,16,false,AC_TRN,AC_WRAP>::ac_fixed( signed short b ) { v[0] = (unsigned short) b; }
+
+template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( signed int b ) { v[0] = b; }
+template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( unsigned int b ) { v[0] = b; }
+template<> inline ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( signed int b ) { v[0] = b; v[1] = 0;}
+template<> inline ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( unsigned int b ) { v[0] = b; v[1] = 0;}
+
+template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; }
+template<> inline ac_fixed<32,32,true,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; }
+template<> inline 
ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; v[1] = 0;} +template<> inline ac_fixed<32,32,false,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; v[1] = 0;} + +template<> inline ac_fixed<64,64,true,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); } +template<> inline ac_fixed<64,64,true,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32);} +template<> inline ac_fixed<64,64,false,AC_TRN,AC_WRAP>::ac_fixed( Slong b ) { v[0] = (int) b; v[1] = (int) ((Ulong) b >> 32); v[2] = 0; } +template<> inline ac_fixed<64,64,false,AC_TRN,AC_WRAP>::ac_fixed( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); v[2] = 0; } + + +// Stream -------------------------------------------------------------------- + +template +inline std::ostream& operator << (std::ostream &os, const ac_fixed &x) { +#ifndef __SYNTHESIS__ + if ((os.flags() & std::ios::hex) != 0) { + os << x.to_string(AC_HEX); + } else if ((os.flags() & std::ios::oct) != 0) { + os << x.to_string(AC_OCT); + } else { + os << x.to_string(AC_DEC); + } +#endif + return os; +} + + +// Macros for Binary Operators with C Integers -------------------------------------------- + +#define FX_BIN_OP_WITH_INT_2I(BIN_OP, C_TYPE, WI, SI) \ + template \ + inline ac_fixed operator BIN_OP ( const ac_fixed &op, C_TYPE i_op) { \ + return op.operator BIN_OP (ac_int(i_op)); \ + } + +#define FX_BIN_OP_WITH_INT(BIN_OP, C_TYPE, WI, SI, RTYPE) \ + template \ + inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( C_TYPE i_op, const ac_fixed &op) { \ + return ac_fixed(i_op).operator BIN_OP (op); \ + } \ + template \ + inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( const ac_fixed &op, C_TYPE i_op) { \ + return op.operator BIN_OP (ac_fixed(i_op)); \ + } + +#define FX_REL_OP_WITH_INT(REL_OP, C_TYPE, W2, S2) \ + template \ + inline bool operator REL_OP ( const ac_fixed &op, C_TYPE op2) { \ + return op.operator REL_OP (ac_fixed(op2)); \ + } \ + template \ + inline bool operator REL_OP ( C_TYPE op2, const ac_fixed &op) { \ + return ac_fixed(op2).operator REL_OP (op); \ + } + +#define FX_ASSIGN_OP_WITH_INT_2(ASSIGN_OP, C_TYPE, W2, S2) \ + template \ + inline ac_fixed &operator ASSIGN_OP ( ac_fixed &op, C_TYPE op2) { \ + return op.operator ASSIGN_OP (ac_fixed(op2)); \ + } + +#define FX_ASSIGN_OP_WITH_INT_2I(ASSIGN_OP, C_TYPE, W2, S2) \ + template \ + inline ac_fixed operator ASSIGN_OP ( ac_fixed &op, C_TYPE op2) { \ + return op.operator ASSIGN_OP (ac_int(op2)); \ + } + +#define FX_OPS_WITH_INT(C_TYPE, WI, SI) \ + FX_BIN_OP_WITH_INT(*, C_TYPE, WI, SI, mult) \ + FX_BIN_OP_WITH_INT(+, C_TYPE, WI, SI, plus) \ + FX_BIN_OP_WITH_INT(-, C_TYPE, WI, SI, minus) \ + FX_BIN_OP_WITH_INT(/, C_TYPE, WI, SI, div) \ + FX_BIN_OP_WITH_INT_2I(>>, C_TYPE, WI, SI) \ + FX_BIN_OP_WITH_INT_2I(<<, C_TYPE, WI, SI) \ + FX_BIN_OP_WITH_INT(&, C_TYPE, WI, SI, logic) \ + FX_BIN_OP_WITH_INT(|, C_TYPE, WI, SI, logic) \ + FX_BIN_OP_WITH_INT(^, C_TYPE, WI, SI, logic) \ + \ + FX_REL_OP_WITH_INT(==, C_TYPE, WI, SI) \ + FX_REL_OP_WITH_INT(!=, C_TYPE, WI, SI) \ + FX_REL_OP_WITH_INT(>, C_TYPE, WI, SI) \ + FX_REL_OP_WITH_INT(>=, C_TYPE, WI, SI) \ + FX_REL_OP_WITH_INT(<, C_TYPE, WI, SI) \ + FX_REL_OP_WITH_INT(<=, C_TYPE, WI, SI) \ + \ + FX_ASSIGN_OP_WITH_INT_2(+=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2(-=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2(*=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2(/=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2I(>>=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2I(<<=, C_TYPE, WI, 
SI) \ + FX_ASSIGN_OP_WITH_INT_2(&=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2(|=, C_TYPE, WI, SI) \ + FX_ASSIGN_OP_WITH_INT_2(^=, C_TYPE, WI, SI) + +// --------------------------------------- End of Macros for Binary Operators with C Integers + +#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS +namespace ac { + namespace ops_with_other_types { +#endif + // Binary Operators with C Integers -------------------------------------------- + FX_OPS_WITH_INT(bool, 1, false) + FX_OPS_WITH_INT(char, 8, true) + FX_OPS_WITH_INT(signed char, 8, true) + FX_OPS_WITH_INT(unsigned char, 8, false) + FX_OPS_WITH_INT(short, 16, true) + FX_OPS_WITH_INT(unsigned short, 16, false) + FX_OPS_WITH_INT(int, 32, true) + FX_OPS_WITH_INT(unsigned int, 32, false) + FX_OPS_WITH_INT(long, ac_private::long_w, true) + FX_OPS_WITH_INT(unsigned long, ac_private::long_w, false) + FX_OPS_WITH_INT(Slong, 64, true) + FX_OPS_WITH_INT(Ulong, 64, false) + // -------------------------------------- End of Binary Operators with Integers +#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS + } // ops_with_other_types namespace +} // ac namespace +#endif + + +// Macros for Binary Operators with ac_int -------------------------------------------- + +#define FX_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ + template \ + inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( const ac_int &i_op, const ac_fixed &op) { \ + return ac_fixed(i_op).operator BIN_OP (op); \ + } + +#define FX_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) \ + template \ + inline typename ac_fixed::template rt::RTYPE operator BIN_OP ( const ac_fixed &op, const ac_int &i_op) { \ + return op.operator BIN_OP (ac_fixed(i_op)); \ + } + +#define FX_BIN_OP_WITH_AC_INT(BIN_OP, RTYPE) \ + FX_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ + FX_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) + +#define FX_REL_OP_WITH_AC_INT(REL_OP) \ + template \ + inline bool operator REL_OP ( const ac_fixed &op, const ac_int &op2) { \ + return op.operator REL_OP (ac_fixed(op2)); \ + } \ + template \ + inline bool operator REL_OP ( ac_int &op2, const ac_fixed &op) { \ + return ac_fixed(op2).operator REL_OP (op); \ + } + +#define FX_ASSIGN_OP_WITH_AC_INT(ASSIGN_OP) \ + template \ + inline ac_fixed &operator ASSIGN_OP ( ac_fixed &op, const ac_int &op2) { \ + return op.operator ASSIGN_OP (ac_fixed(op2)); \ + } \ + template \ + inline ac_int &operator ASSIGN_OP ( ac_int &op, const ac_fixed &op2) { \ + return op.operator ASSIGN_OP (op2.to_ac_int()); \ + } + +// -------------------------------------------- End of Macros for Binary Operators with ac_int + +#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS +namespace ac { + namespace ops_with_other_types { +#endif + // Binary Operators with ac_int -------------------------------------------- + FX_BIN_OP_WITH_AC_INT(*, mult) + FX_BIN_OP_WITH_AC_INT(+, plus) + FX_BIN_OP_WITH_AC_INT(-, minus) + FX_BIN_OP_WITH_AC_INT(/, div) + FX_BIN_OP_WITH_AC_INT(&, logic) + FX_BIN_OP_WITH_AC_INT(|, logic) + FX_BIN_OP_WITH_AC_INT(^, logic) + + FX_REL_OP_WITH_AC_INT(==) + FX_REL_OP_WITH_AC_INT(!=) + FX_REL_OP_WITH_AC_INT(>) + FX_REL_OP_WITH_AC_INT(>=) + FX_REL_OP_WITH_AC_INT(<) + FX_REL_OP_WITH_AC_INT(<=) + + FX_ASSIGN_OP_WITH_AC_INT(+=) + FX_ASSIGN_OP_WITH_AC_INT(-=) + FX_ASSIGN_OP_WITH_AC_INT(*=) + FX_ASSIGN_OP_WITH_AC_INT(/=) + FX_ASSIGN_OP_WITH_AC_INT(&=) + FX_ASSIGN_OP_WITH_AC_INT(|=) + FX_ASSIGN_OP_WITH_AC_INT(^=) + // -------------------------------------- End of Binary Operators with ac_int + + // Relational Operators with double -------------------------------------- + template + inline bool operator == ( double op, const 
ac_fixed &op2) { + return op2.operator == (op); + } + template + inline bool operator != ( double op, const ac_fixed &op2) { + return op2.operator != (op); + } + template + inline bool operator > ( double op, const ac_fixed &op2) { + return op2.operator < (op); + } + template + inline bool operator < ( double op, const ac_fixed &op2) { + return op2.operator > (op); + } + template + inline bool operator <= ( double op, const ac_fixed &op2) { + return op2.operator >= (op); + } + template + inline bool operator >= ( double op, const ac_fixed &op2) { + return op2.operator <= (op); + } + // -------------------------------------- End of Relational Operators with double +#ifdef AC_FIXED_NS_FOR_MIXED_OPERATORS + } // ops_with_other_types namespace +} // ac namespace +using namespace ac::ops_with_other_types; +#endif + + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( disable: 4700 ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wuninitialized" +#endif + +// Global templatized functions for easy initialization to special values +template +inline ac_fixed value(ac_fixed) { + ac_fixed r; + return r.template set_val(); +} + +namespace ac { +// PUBLIC FUNCTIONS +// function to initialize (or uninitialize) arrays + template + inline bool init_array(ac_fixed *a, int n) { + ac_fixed t; + t.template set_val(); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } + + inline ac_fixed<54,2,true> frexp_d(double d, ac_int<11,true> &exp) { + enum {Min_Exp = -1022, Max_Exp = 1023, Mant_W = 52, Denorm_Min_Exp = Min_Exp - Mant_W}; + if(!d) { + exp = 0; + return 0; + } + int exp_i; + double f0 = frexp(d, &exp_i); + AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard double-precision float exponent max (+1024). It is probably an extended double"); + AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard double-precision float exponent min (-1021). It is probably an extended double"); + exp_i--; + int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : (exp_i > Min_Exp && f0 < 0 && f0 >= -0.5) ? -1 : 0; + exp = exp_i + rshift; + ac_int f_i = f0 * ((Ulong) 1 << (Mant_W + 1 -rshift)); + ac_fixed r; + r.set_slc(0, f_i); + return r; + } + inline ac_fixed<25,2,true> frexp_f(float f, ac_int<8,true> &exp) { + enum {Min_Exp = -126, Max_Exp = 127, Mant_W = 23, Denorm_Min_Exp = Min_Exp - Mant_W}; + if(!f) { + exp = 0; + return 0; + } + int exp_i; + float f0 = frexpf(f, &exp_i); + AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard single-precision float exponent max (+128). It is probably an extended float"); + AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard single-precision float exponent min (-125). It is probably an extended float"); + exp_i--; + int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : (exp_i >= Min_Exp && f0 < 0 && f0 >= -0.5) ? -1 : 0; + exp = exp_i + rshift; + ac_int f_i = f0 * (1 << (Mant_W + 1 - rshift)); + ac_fixed r; + r.set_slc(0, f_i); + return r; + } + + inline ac_fixed<53,1,false> frexp_sm_d(double d, ac_int<11,true> &exp, bool &sign) { + enum {Min_Exp = -1022, Max_Exp = 1023, Mant_W = 52, Denorm_Min_Exp = Min_Exp - Mant_W}; + if(!d) { + exp = 0; + sign = false; + return 0; + } + int exp_i; + bool s = d < 0; + double f0 = frexp(s ? 
-d : d, &exp_i); + AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard double-precision float exponent max (+1024). It is probably an extended double"); + AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard double-precision float exponent min (-1021). It is probably an extended double"); + exp_i--; + int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : 0; + exp = exp_i + rshift; + ac_int f_i = f0 * ((Ulong) 1 << (Mant_W + 1 -rshift)); + ac_fixed r; + r.set_slc(0, f_i); + sign = s; + return r; + } + inline ac_fixed<24,1,false> frexp_sm_f(float f, ac_int<8,true> &exp, bool &sign) { + enum {Min_Exp = -126, Max_Exp = 127, Mant_W = 23, Denorm_Min_Exp = Min_Exp - Mant_W}; + if(!f) { + exp = 0; + sign = false; + return 0; + } + int exp_i; + bool s = f < 0; + float f0 = frexp(s ? -f : f, &exp_i); + AC_ASSERT(exp_i <= Max_Exp+1, "Exponent greater than standard single-precision float exponent max (+128). It is probably an extended float"); + AC_ASSERT(exp_i >= Denorm_Min_Exp+1, "Exponent less than standard single-precision float exponent min (-125). It is probably an extended float"); + exp_i--; + int rshift = exp_i < Min_Exp ? Min_Exp - exp_i : 0; + exp = exp_i + rshift; + ac_int<24,false> f_i = f0 * (1 << (Mant_W + 1 - rshift)); + ac_fixed<24,1,false> r; + r.set_slc(0, f_i); + sign = s; + return r; + } + + template + const ac_fixed &basic_num_ovf_base::value() const { + return (const ac_fixed &) *this; + } + + template std::string basic_num_ovf_base::type_name() { + return ac_fixed::type_name(); + } +} + + +/////////////////////////////////////////////////////////////////////////////// + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( pop ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +#ifdef __AC_NAMESPACE +} +#endif + +#endif // __AC_FIXED_H diff --git a/hls4ml/templates/quartus/ac_types/ac_float.h b/hls4ml/templates/quartus/ac_types/ac_float.h index 9229b54702..6174528d73 100644 --- a/hls4ml/templates/quartus/ac_types/ac_float.h +++ b/hls4ml/templates/quartus/ac_types/ac_float.h @@ -1,1196 +1,1196 @@ -/************************************************************************** - * * - * Algorithmic C (tm) Datatypes * - * * - * Software Version: 4.0 * - * * - * Release Date : Sat Jun 13 12:35:18 PDT 2020 * - * Release Type : Production Release * - * Release Build : 4.0.0 * - * * - * Copyright 2013-2019, Mentor Graphics Corporation, * - * * - * All Rights Reserved. * - * * - ************************************************************************** - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * - * implied. * - * See the License for the specific language governing permissions and * - * limitations under the License. * - ************************************************************************** - * * - * The most recent version of this package is available at github. 
* - * * - *************************************************************************/ - -// Source: ac_float.h -// Description: class for floating point operation handling in C++ -// Author: Andres Takach, Ph.D. - -#ifndef __AC_FLOAT_H -#define __AC_FLOAT_H - -#include - -#ifndef __SYNTHESIS__ -#include -#endif - -#if (defined(__GNUC__) && __GNUC__ < 3 && !defined(__EDG__)) -#error GCC version 3 or greater is required to include this header file -#endif - -#if (defined(_MSC_VER) && _MSC_VER < 1400 && !defined(__EDG__)) -#error Microsoft Visual Studio 8 or newer is required to include this header file -#endif - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( push ) -#pragma warning( disable: 4003 4127 4308 4365 4514 4800 ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wparentheses" -#endif -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wparentheses" -#pragma clang diagnostic ignored "-Wlogical-op-parentheses" -#pragma clang diagnostic ignored "-Wbitwise-op-parentheses" -#endif - -// for safety -#if (defined(E) || defined(WF) || defined(IF) || defined(SF)) -#error One or more of the following is defined: E, WF, IF, SF. Definition conflicts with their usage as template parameters. -#error DO NOT use defines before including third party header files. -#endif - -#define AC_FL(v) ac_float -#define AC_FL0(v) ac_float -#define AC_FL_T(v) int W##v, int I##v, int E##v, ac_q_mode Q##v -#define AC_FL_TV(v) W##v, I##v, E##v, Q##v -#define AC_FL_T0(v) int W##v, int I##v, int E##v -#define AC_FL_TV0(v) W##v, I##v, E##v - -#ifdef __AC_NAMESPACE -namespace __AC_NAMESPACE { -#endif - -template class ac_float; - -namespace ac_private { - - typedef ac_float<54,2,11> ac_float_cdouble_t; - typedef ac_float<25,2,8> ac_float_cfloat_t; - - template - struct rt_ac_float_T { - template< AC_FL_T0() > - struct op1 { - typedef AC_FL0() fl_t; - typedef typename T::template rt_T::mult mult; - typedef typename T::template rt_T::plus plus; - typedef typename T::template rt_T::minus2 minus; - typedef typename T::template rt_T::minus minus2; - typedef typename T::template rt_T::logic logic; - typedef typename T::template rt_T::div2 div; - typedef typename T::template rt_T::div div2; - }; - }; - // specializations after definition of ac_float - - inline ac_float_cdouble_t double_to_ac_float(double d); - inline ac_float_cfloat_t float_to_ac_float(float f); -} - -////////////////////////////////////////////////////////////////////////////// -// ac_float -////////////////////////////////////////////////////////////////////////////// - -template< AC_FL_T() > -class ac_float { - enum { NO_UN = true, S = true, S2 = true, SR = true }; -public: - typedef ac_fixed mant_t; - typedef ac_int exp_t; - mant_t m; - exp_t e; - - void set_mantissa(const ac_fixed &man) { m = man; } - void set_exp(const ac_int &exp) { if(E) e = exp; } - -private: - inline bool is_neg() const { return m < 0; } // is_neg would be more efficient - - enum {NZ_E = !!E, MIN_EXP = -(NZ_E << (E-NZ_E)), MAX_EXP = (1 << (E-NZ_E))-1}; - -public: - static const int width = W; - static const int i_width = I; - static const int e_width = E; - static const bool sign = S; - static const ac_q_mode q_mode = Q; - static const ac_o_mode o_mode = AC_SAT; - - template< AC_FL_T0(2) > - struct rt { - enum { - // need to validate - F=W-I, - F2=W2-I2, - mult_w = W+W2, - mult_i = I+I2, - mult_e = 
AC_MAX(E,E2)+1, - mult_s = S||S2, - plus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), - plus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, - plus_e = AC_MAX(E,E2), - plus_s = S||S2, - minus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), - minus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, - minus_e = AC_MAX(E,E2), - minus_s = true, - div_w = W+AC_MAX(W2-I2,0)+S2, - div_i = I+(W2-I2)+S2, - div_e = AC_MAX(E,E2)+1, - div_s = S||S2, - logic_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+AC_MAX(F,F2), - logic_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2)), - logic_s = S||S2, - logic_e = AC_MAX(E,E2) - }; - typedef ac_float mult; - typedef ac_float plus; - typedef ac_float minus; - typedef ac_float logic; - typedef ac_float div; - typedef ac_float arg1; - - }; - - template - struct rt_i { - enum { - lshift_w = W, - lshift_i = I, - lshift_s = S, - lshift_e_0 = exp_t::template rt::plus::width, - lshift_e = AC_MIN(lshift_e_0, 24), - rshift_w = W, - rshift_i = I, - rshift_s = S, - rshift_e_0 = exp_t::template rt::minus::width, - rshift_e = AC_MIN(rshift_e_0, 24) - }; - typedef ac_float lshift; - typedef ac_float rshift; - }; - - template - struct rt_T { - typedef typename ac_private::map::t map_T; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::mult mult; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::plus plus; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus minus; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus2 minus2; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::logic logic; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div div; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div2 div2; - typedef ac_float arg1; - }; - - template - struct rt_T2 { - typedef typename ac_private::map::t map_T; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::mult mult; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::plus plus; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus2 minus; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus minus2; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::logic logic; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div2 div; - typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div div2; - typedef ac_float arg1; - }; - - struct rt_unary { - enum { - neg_w = W+1, - neg_i = I+1, - neg_e = E, - neg_s = true, - mag_sqr_w = 2*W-S + NO_UN, - mag_sqr_i = 2*I-S + NO_UN, - mag_sqr_e = E, - mag_sqr_s = false | NO_UN, - mag_w = W+S + NO_UN, - mag_i = I+S + NO_UN, - mag_e = E, - mag_s = false | NO_UN, - to_fx_i = I + MAX_EXP, - to_fx_w = W + MAX_EXP - MIN_EXP, - to_fx_s = S, - to_i_w = AC_MAX(to_fx_i,1), - to_i_s = S - }; - typedef ac_float neg; - typedef ac_float mag_sqr; - typedef ac_float mag; - template - struct set { - enum { sum_w = W + ac::log2_ceil::val, sum_i = (sum_w-W) + I, sum_e = E, sum_s = S}; - typedef ac_float sum; - }; - typedef ac_fixed to_ac_fixed_t; - typedef ac_int to_ac_int_t; - }; - - template friend class ac_float; - - ac_float() { -#if defined(AC_DEFAULT_IN_RANGE) -#endif - } - ac_float(const ac_float &op) { - m = op.m; - e = op.e; - } - -private: - template - bool round(const ac_fixed &op2, bool assert_on_rounding=false) { - const bool rnd = Q!=AC_TRN && Q!=AC_TRN_ZERO && W2 > W; - bool rnd_ovfl = false; - m 
= 0; - if(rnd) { - ac_fixed m_1 = op2; - // overflow because of rounding would lead to go from 001111 to 01000 (extra bit prevents it) - // change from 01000 to 00100 and store 0100 in m - rnd_ovfl = !m_1[W] & m_1[W-1]; - m_1[W-1] = m_1[W-1] & !rnd_ovfl; - m_1[W-2] = m_1[W-2] | rnd_ovfl; - m.set_slc(0, m_1.template slc(0)); - if(assert_on_rounding) - AC_ASSERT(m == op2, "Loss of precision due to Rounding"); - return rnd_ovfl; - } else { - ac_fixed m_0 = op2; - m.set_slc(0, m_0.template slc(0)); - return false; - } - } - - template - void assign_from(const ac_fixed &m2, int e2, bool sticky_bit, bool normalize, bool assert_on_rounding=false) { - const bool rnd = Q!=AC_TRN & Q!=AC_TRN_ZERO & W2 > W; - const bool need_rnd_bit = Q != AC_TRN; - const bool need_rem_bits = need_rnd_bit && Q != AC_RND; - - const int msb_min_power = I-1 + MIN_EXP; - const int msb_min_power2 = I2-1 + min_exp2; - const int msb_min_power_dif = msb_min_power - msb_min_power2; - // if > 0: target has additional negative exponent range - // subnormal maybe be further normalized (done even if normalize==false) - // if < 0: target has less negative exponent range - // mantissa may need to be shifted right - // in either case if source is unnormalized - // normalization could take place - - const int msb_max_power = I-1 + MAX_EXP; - const int msb_max_power2 = I2-1 + max_exp2 + rnd; - const int msb_max_power_dif = msb_max_power - msb_max_power2; - - const bool may_shift_right = msb_min_power_dif > 0; - const int max_right_shift = may_shift_right ? msb_min_power_dif : 0; - const int t_width = W2 + (W >= W2 ? AC_MIN(W-W2+may_shift_right, max_right_shift) : 0); - - int e_t = e2; - e_t += I2-I; - typedef ac_fixed op2_t; - op2_t op2 = m2; - int ls = 0; - bool r_zero; - if(normalize) { - bool all_sign; - ls = m2.leading_sign(all_sign); - r_zero = all_sign & !m2[0]; - } else if(msb_min_power_dif < 0 || msb_max_power_dif < 0 || W2 > W) { - // msb_min_power_dif < 0: src exponent less negative than trg exp represents - // oportunity to further normalize value in trg representation - // msb_max_power_dif < 0: max target exp is less than max src exp - // if un-normalized exp may overflow resulting in incorrect saturation - // normalization is needed for correctness - // W2 > W - // if un-normalized, extra bits may be incorrectly quantized away - const int msb_range_dif = AC_MAX(-msb_min_power_dif, -msb_max_power_dif); - const int msb_range_dif_norm_w = AC_MIN(msb_range_dif,W2-1); - const int extra_bits = AC_MAX(W2-W,0); - const int norm_w = AC_MAX(msb_range_dif_norm_w, extra_bits) + 1; - bool all_sign; - ls = m2.template slc(W2-norm_w).leading_sign(all_sign); - r_zero = all_sign & !m2[W2-1] & !(m2 << norm_w); - } else { - r_zero = !m2; - } - int actual_max_shift_left = (1 << (E-1)) + e_t; - if(may_shift_right && actual_max_shift_left < 0) { - const int shift_r_w = ac::nbits::val; - ac_int shift_r = -actual_max_shift_left; - if((1 << (E-1)) + min_exp2 + I2-I < 0 && need_rem_bits) { - op2_t shifted_out_bits = op2; - shifted_out_bits &= ~((~op2_t(0)) << shift_r); - sticky_bit |= !!shifted_out_bits; - } - op2 >>= shift_r; - e_t += shift_r; - } else { - bool shift_exponent_limited = ls >= actual_max_shift_left; - int shift_l = shift_exponent_limited ? actual_max_shift_left : (int) ls; - op2 <<= shift_l; - e_t = shift_exponent_limited ? MIN_EXP : e_t - ls; - } - ac_fixed r_pre_rnd = 0; - r_pre_rnd.set_slc(need_rem_bits, op2.template slc(0)); - if(need_rem_bits) - r_pre_rnd[0] = sticky_bit; - - bool shift_r1 = round(r_pre_rnd); - e_t = r_zero ? 
0 : e_t + shift_r1; - if(!(e_t < 0) & !!(e_t >> E-1)) { - e = MAX_EXP; - m = m < 0 ? value(m) : value(m); - } else { - e = e_t; - } - } - -public: - template - ac_float(const AC_FL(2) &op, bool assert_on_overflow=false, bool assert_on_rounding=false) { - typedef AC_FL(2) fl2_t; - const int min_exp2 = fl2_t::MIN_EXP; - const int max_exp2 = fl2_t::MAX_EXP; - assign_from(op.m, op.e, false, false); - } - - ac_float(const ac_fixed &m2, const ac_int &e2, bool normalize=true) { - m = m2; - e = e2; - if(normalize) - this->normalize(); - else - e &= ac_int<1,true>(!!m); - } - - template - ac_float(const ac_fixed &m2, const ac_int &e2, bool normalize=true) { - enum { WF2 = WFX+!SFX, IF2 = IFX+!SFX }; - ac_float f(ac_fixed(m2), e2, normalize); - *this = f; - } - - template - ac_float(const ac_fixed &op) { - assign_from<0,0>(ac_fixed(op), 0, false, true); - } - - template - ac_float(const ac_int &op) { - *this = ac_fixed(op); - } - - inline ac_float( bool b ) { *this = (ac_int<1,false>) b; } - inline ac_float( char b ) { *this = (ac_int<8,true>) b; } - inline ac_float( signed char b ) { *this = (ac_int<8,true>) b; } - inline ac_float( unsigned char b ) { *this = (ac_int<8,false>) b; } - inline ac_float( signed short b ) { *this = (ac_int<16,true>) b; } - inline ac_float( unsigned short b ) { *this = (ac_int<16,false>) b; } - inline ac_float( signed int b ) { *this = (ac_int<32,true>) b; } - inline ac_float( unsigned int b ) { *this = (ac_int<32,false>) b; } - inline ac_float( signed long b ) { *this = (ac_int) b; } - inline ac_float( unsigned long b ) { *this = (ac_int) b; } - inline ac_float( Slong b ) { *this = (ac_int<64,true>) b; } - inline ac_float( Ulong b ) { *this = (ac_int<64,false>) b; } - - // Explicit conversion functions to ac_int and ac_fixed - inline typename rt_unary::to_ac_fixed_t to_ac_fixed() const { - typename rt_unary::to_ac_fixed_t r = m; - r <<= e; - return r; - } - inline typename rt_unary::to_ac_int_t to_ac_int() const { - return to_ac_fixed().to_ac_int(); - } - - // Explicit conversion functions to C built-in types ------------- - inline int to_int() const { return to_ac_int().to_int(); } - inline unsigned to_uint() const { return to_ac_int().to_uint(); } - inline long to_long() const { return (signed long) to_ac_int().to_int64(); } - inline unsigned long to_ulong() const { return (unsigned long) to_ac_int().to_uint64(); } - inline Slong to_int64() const { return to_ac_int().to_int64(); } - inline Ulong to_uint64() const { return to_ac_int().to_uint64(); } - inline float to_float() const { return ldexpf(m.to_double(), exp()); } - inline double to_double() const { return ldexp(m.to_double(), exp()); } - - const ac_fixed mantissa() const { return m; } - const ac_int exp() const { return e; } - bool normalize() { - bool all_sign; - int ls = m.leading_sign(all_sign); - bool m_zero = all_sign & !m[0]; - const int max_shift_left = (1 << (E-1)) + e; - bool normal = ls <= max_shift_left; - int shift_l = normal ? 
ls : max_shift_left; - m <<= shift_l; - e = ac_int<1,true>(!m_zero) & (e - shift_l); - return normal; - } - - ac_float( double d, bool assert_on_overflow=false, bool assert_on_rounding=false ) { - enum { I_EXT = AC_MAX(I,1), W_EXT = ac_private::ac_float_cdouble_t::width + I_EXT - 1, }; - ac_private::ac_float_cdouble_t t = ac_private::double_to_ac_float(d); - ac_float r(t, assert_on_overflow, assert_on_rounding); - *this = r; - } - - ac_float( float f, bool assert_on_overflow=false, bool assert_on_rounding=false ) { - enum { I_EXT = AC_MAX(I,1), W_EXT = ac_private::ac_float_cfloat_t::width + I_EXT - 1, }; - ac_private::ac_float_cfloat_t t = ac_private::float_to_ac_float(f); - ac_float r(t, assert_on_overflow, assert_on_rounding); - *this = r; - } - - template - bool compare(const AC_FL(2) &op2, bool *gt) const { - typedef ac_fixed fx2_t; - typedef typename ac_fixed::template rt_T< fx2_t >::logic fx_t; - typedef ac_fixed fxu_t; - - fx2_t op2_m_0; - op2_m_0.set_slc(0, op2.m.template slc(0)); - - fx_t op1_m = m; - fx_t op2_m = op2_m_0; - int e_dif = exp() - op2.exp() + I - I2; - bool op2_m_neg = op2_m[fx_t::width-1]; - fx_t out_bits = op2_m ^ ((op2_m_neg & e_dif < 0) ? ~fx_t(0) : fx_t(0)); - out_bits &= ~(fxu_t(~fxu_t(0)) << e_dif); - op2_m >>= e_dif; - bool overflow = e_dif < 0 & !!out_bits | op2_m_neg ^ op2_m[fx_t::width-1]; - - *gt = overflow & op2_m_neg | !overflow & op1_m > op2_m; - bool eq = op1_m == op2_m & !overflow & !out_bits; - return eq; - } - - template - void plus_minus(const AC_FL(2) &op2, AC_FL(R) &r, bool sub=false) const { - typedef AC_FL(2) op2_t; - enum { IT = AC_MAX(I,I2) }; - typedef ac_fixed fx1_t; - typedef ac_fixed fx2_t; - // covers fx1_t and r mantissas (adds additional LSBs if WR > W) - typedef typename fx1_t::template rt_T< ac_fixed >::logic fx1r_t; - // covers fx2_t and r mantissas (adds additional LSBs if WR > W2) - typedef typename fx2_t::template rt_T< ac_fixed >::logic fx2r_t; - // mt_t adds one integer bit for the plus - // op1_m, op2_m, op_sl, sticky_bits - typedef typename fx1r_t::template rt_T::plus mt_t; - - const bool round_bit_needed = QR != AC_TRN; - const bool remaining_bits_needed = !(QR == AC_TRN || QR == AC_RND); - - const int w_r_with_round_bits = WR + round_bit_needed; - - // naming: sn = subnormal, n = normal, wc = worst case - // worst case (wc) normalize is when one operand has smallest subnormal - // and other operand is shifted right so that its MSB lines up with LSB of subnormal - const int power_smallest_sn1 = I - W - (1 << (E-1)); - const int power_smallest_sn2 = I2 - W2 - (1 << (E2-1)); - const int power_smallest_sn_dif1 = AC_MAX(0,power_smallest_sn2 - power_smallest_sn1); - const int power_smallest_sn_dif2 = AC_MAX(0,power_smallest_sn1 - power_smallest_sn2); - const int wc_norm_shift1 = W2-1 + AC_MIN(power_smallest_sn_dif1, W-1); - const int wc_norm_shift2 = W-1 + AC_MIN(power_smallest_sn_dif2, W2-1); - const int wc_sn_norm_shift = AC_MAX(wc_norm_shift1, wc_norm_shift2); - const int w_sn_overlap = wc_sn_norm_shift + 1; - - // cases when one operand is subnormal and other is shifted right and does not overlap bits - // subnormal op could be normalized by width-1 bits - const int w_sn_no_overlap1 = W + AC_MIN(w_r_with_round_bits, power_smallest_sn_dif2); - const int w_sn_no_overlap2 = W2 + AC_MIN(w_r_with_round_bits, power_smallest_sn_dif1); - const int w_sn_no_overlap = AC_MAX(w_sn_no_overlap1, w_sn_no_overlap2); - - const int w_sn = AC_MAX(w_sn_overlap, w_sn_no_overlap); - - // For example 0100 + (1000 0001 >> 1) = 0000 0000 1, 
wc_n_norm_shift = max(4,8) - const int msb0h1 = I-1 + (int) MAX_EXP; - const int msb1h1 = msb0h1-1; - const int msb0l1 = I-1 + (int) MIN_EXP; - const int msb1l1 = msb0h1-1; - const int msb0h2 = I2-1 + (int) op2_t::MAX_EXP; - const int msb1h2 = msb0h2-1; - const int msb0l2 = I2-1 + (int) op2_t::MIN_EXP; - const int msb1l2 = msb0h2-1; - // bit W-1 overlap with bit W2-2 - const bool msb_overlap1 = msb1h2 >= msb0h1 && msb0h1 <= msb1l2 - || msb1h2 >= msb0l1 && msb0l1 <= msb1l2 - || msb0h1 >= msb1h2 && msb1h2 >= msb0l1; - // bit W2-1 overlap with bit W1-2 - const bool msb_overlap2 = msb1h1 >= msb0h2 && msb0h2 <= msb1l1 - || msb1h1 >= msb0l2 && msb0l2 <= msb1l1 - || msb0h2 >= msb1h1 && msb1h1 >= msb0l2; - const bool msb_overlap = msb_overlap1 || msb_overlap2; - const int wc_n_norm_shift = AC_MAX(W,W2); - const int w_n_msb_overlap = msb_overlap ? wc_n_norm_shift + 1 : 0; - // addition of two numbers of different sign can result in a normalization by 1 (therefore + 1) - const int w_n_no_msb_overlap = w_r_with_round_bits + 1; - const int w_n = AC_MAX(w_n_msb_overlap, w_n_no_msb_overlap); - - // +1 is to prevent overflow during addition - const int tr_t_width = AC_MAX(w_n, w_sn) + 1; - typedef ac_fixed add_t; - - const int min_E = (int) MIN_EXP + I-IT; - const int min_E2 = (int) AC_FL(2)::MIN_EXP + I2-IT; - const int min_ET = AC_MIN(min_E, min_E2); - - const int max_E = (int) MAX_EXP + I-IT; - const int max_E2 = (int) AC_FL(2)::MAX_EXP + I2-IT; - const int max_ET = AC_MAX(max_E, max_E2); - - ac_fixed op1_m_0 = m; - mt_t op1_m = 0; - op1_m.set_slc(0, op1_m_0.template slc(0)); - int op1_e = exp() + I-IT; - - ac_fixed op2_m_0 = op2.m; - mt_t op2_m = 0; - op2_m.set_slc(0, op2_m_0.template slc(0)); - if(sub) - op2_m = -op2_m; - int op2_e = op2.exp() + I2-IT; - - bool op1_zero = operator !(); - bool op2_zero = !op2; - int e_dif = op1_e - op2_e; - bool e1_lt_e2 = e_dif < 0; - e_dif = (op1_zero | op2_zero) ? 0 : e1_lt_e2 ? -e_dif : e_dif; - - add_t op_lshift = e1_lt_e2 ? op1_m : op2_m; - mt_t op_no_shift = e1_lt_e2 ? op2_m : op1_m; - - bool sticky_bit = false; - if(remaining_bits_needed) { - mt_t shifted_out_bits = op_lshift; - // bits that are shifted out of a add_t (does not include potential 3 spare bits) - shifted_out_bits &= ~((~add_t(0)) << e_dif); - sticky_bit = !!shifted_out_bits; - } - op_lshift >>= e_dif; - - add_t add_r = op_lshift + op_no_shift; - int e_t = (e1_lt_e2 & !op2_zero | op1_zero ? op2_e : op1_e); - - r.template assign_from(add_r, e_t, sticky_bit, true); - } - - template - ac_float add(const AC_FL(1) &op1, const AC_FL(2) &op2) { - op1.plus_minus(op2, *this); - return *this; - } - - template - ac_float sub(const AC_FL(1) &op1, const AC_FL(2) &op2) { - op1.plus_minus(op2, *this, true); - return *this; - } - - typename rt_unary::neg abs() const { - typedef typename rt_unary::neg r_t; - r_t r; - r.m = is_neg() ? -m : r_t::mant_t(m); - r.e = e; - return r; - } - -#ifdef __AC_FLOAT_ENABLE_ALPHA - // These will be changed!!! 
For now only enable to explore integration with ac_complex - template - typename rt< AC_FL_TV0(2) >::plus operator +(const AC_FL(2) &op2) const { - typename rt< AC_FL_TV0(2) >::plus r; - plus_minus(op2, r); - return r; - } - template - typename rt< AC_FL_TV0(2) >::minus operator -(const AC_FL(2) &op2) const { - typename rt< AC_FL_TV0(2) >::minus r; - plus_minus(op2, r, true); - return r; - } -#endif - - template - typename rt< AC_FL_TV0(2) >::mult operator *(const AC_FL(2) &op2) const { - typedef typename rt< AC_FL_TV0(2) >::mult r_t; - r_t r(m*op2.m, exp()+op2.exp(), false); - return r; - } - - template - typename rt< AC_FL_TV0(2) >::div operator /(const AC_FL(2) &op2) const { - typename rt< AC_FL_TV0(2) >::div r(m/op2.m, exp()-op2.exp()); - return r; - } - template - ac_float &operator +=(const AC_FL(2) &op2) { - ac_float r; - plus_minus(op2, r); - *this = r; - return *this; - } - template - ac_float &operator -=(const AC_FL(2) &op2) { - ac_float r; - plus_minus(op2, r, true); - *this = r; - return *this; - } - template - ac_float &operator *=(const AC_FL(2) &op2) { - *this = *this * op2; - return *this; - } - template - ac_float &operator /=(const AC_FL(2) &op2) { - *this = *this / op2; - return *this; - } - ac_float operator + () const { - return *this; - } - typename rt_unary::neg operator - () const { - typename rt_unary::neg r; - r.m = -m; - r.e = e; - return r; - } - bool operator ! () const { - return !m; - } - - // Shift -------------------------------------------------------------------- - template - typename rt_i::lshift operator << ( const ac_int &op2 ) const { - typename rt_i::lshift r; - r.m = m; - r.e = e + op2; - return r; - } - template - typename rt_i::rshift operator >> ( const ac_int &op2 ) const { - typename rt_i::rshift r; - r.m = m; - r.e = e - op2; - return r; - } - // Shift assign ------------------------------------------------------------- - template - ac_float &operator <<= ( const ac_int &op2 ) { - *this = operator << (op2); - return *this; - } - template - ac_float &operator >>= ( const ac_int &op2 ) { - *this = operator >> (op2); - return *this; - } - - template - bool operator == (const AC_FL(2) &f) const { - bool gt; - return compare(f, >); - } - template - bool operator != (const AC_FL(2) &f) const { - return !operator == (f); - } - template - bool operator < (const AC_FL(2) &f) const { - bool gt; - bool eq = compare(f, >); - return !(eq | gt); - } - template - bool operator >= (const AC_FL(2) &f) const { - return !operator < (f); - } - template - bool operator > (const AC_FL(2) &f) const { - bool gt; - compare(f, >); - return gt; - } - template - bool operator <= (const AC_FL(2) &f) const { - return !operator > (f); - } - - inline std::string to_string(ac_base_mode base_rep, bool sign_mag = false, bool hw=true) const { - // TODO: printing decimal with exponent - if(!hw) { - ac_fixed mantissa; - mantissa.set_slc(0, m.template slc(0)); - std::string r = mantissa.to_string(base_rep, sign_mag); - r += "e2"; - r += (e + I).to_string(base_rep, sign_mag | base_rep == AC_DEC); - return r; - } else { - std::string r = m.to_string(base_rep, sign_mag); - if(base_rep != AC_DEC) - r += "_"; - r += "e2"; - if(base_rep != AC_DEC) - r += "_"; - if(E) - r += e.to_string(base_rep, sign_mag | base_rep == AC_DEC); - else - r += "0"; - return r; - } - } - - inline static std::string type_name() { - const char *tf[] = {"false", "true" }; - const char *q[] = {"AC_TRN", "AC_RND", "AC_TRN_ZERO", "AC_RND_ZERO", "AC_RND_INF", "AC_RND_MIN_INF", "AC_RND_CONV" }; - std::string r = 
"ac_float<"; - r += ac_int<32,true>(W).to_string(AC_DEC) + ','; - r += ac_int<32,true>(I).to_string(AC_DEC) + ','; - r += ac_int<32,true>(E).to_string(AC_DEC) + ','; - r += tf[S]; - r += ','; - r += q[Q]; - r += '>'; - return r; - } - - template - inline ac_float &set_val() { - m.template set_val(); - if(V == AC_VAL_MIN) - e.template set_val(); - else if(V == AC_VAL_QUANTUM) - e.template set_val(); - else - e.template set_val(); - return *this; - } -}; - -namespace ac_private { - template - bool ac_fpclassify(T x, bool &inf) { - bool nan = !(x==x); - if(!nan) { - T d = x - x; - inf = !(d==d); - } - return nan; - } - - inline ac_float_cdouble_t double_to_ac_float(double d) { - typedef ac_float_cdouble_t r_t; -#ifndef __SYNTHESIS__ - bool inf; - bool nan = ac_fpclassify(d, inf); - if(nan) - AC_ASSERT(0, "In conversion from double to ac_float: double is NaN"); - else if(inf) - AC_ASSERT(0, "In conversion from double to ac_float: double is Infinite"); -#endif - r_t::exp_t exp; - r_t::mant_t mant = ac::frexp_d(d, exp); - return r_t(mant, exp, false); - } - - inline ac_float_cfloat_t float_to_ac_float(float f) { - typedef ac_float_cfloat_t r_t; -#ifndef __SYNTHESIS__ - bool inf; - bool nan = ac_fpclassify(f, inf); - if(nan) - AC_ASSERT(0, "In conversion from float to ac_float: float is NaN"); - else if(inf) - AC_ASSERT(0, "In conversion from float to ac_float: float is Infinite"); -#endif - r_t::exp_t exp; - r_t::mant_t mant = ac::frexp_f(f, exp); - return r_t(mant, exp, false); - } -}; - -namespace ac { - template - struct ac_float_represent { - typedef typename ac_fixed_represent::type fx_t; - typedef ac_float type; - }; - template<> struct ac_float_represent { - typedef ac_private::ac_float_cfloat_t type; - }; - template<> struct ac_float_represent { - typedef ac_private::ac_float_cdouble_t type; - }; -} - -namespace ac_private { - // with T == ac_float - template< AC_FL_T0(2) > - struct rt_ac_float_T< AC_FL0(2) > { - typedef AC_FL0(2) fl2_t; - template< AC_FL_T0() > - struct op1 { - typedef AC_FL0() fl_t; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::plus plus; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; - typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; - typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; - }; - }; - // with T == ac_fixed - template - struct rt_ac_float_T< ac_fixed > { - // For now E2 > 0 - enum { E2 = 1, S2 = true, W2 = WFX + !SFX, I2 = IFX + !SFX }; - typedef AC_FL0(2) fl2_t; - template< AC_FL_T0() > - struct op1 { - typedef AC_FL0() fl_t; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::plus plus; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; - typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; - typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; - }; - }; - // with T == ac_int - template - struct rt_ac_float_T< ac_int > { - // For now E2 > 0 - enum { E2 = 1, S2 = true, I2 = WI + !SI, W2 = I2 }; - typedef AC_FL0(2) fl2_t; - template< AC_FL_T0() > - struct op1 { - typedef AC_FL0() fl_t; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; - typedef typename 
fl_t::template rt< AC_FL_TV0(2) >::plus plus; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; - typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; - typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; - }; - }; - - // Multiplication is optimizable, general operator +/- is not yet supported - template - struct rt_ac_float_T< c_type > { - // For now E2 > 0 - enum { SCT = c_type_params::S, S2 = true, W2 = c_type_params::W + !SCT, I2 = c_type_params::I + !SCT, E2 = AC_MAX(1, c_type_params::E) }; - typedef AC_FL0(2) fl2_t; - template< AC_FL_T0() > - struct op1 { - typedef AC_FL0() fl_t; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::plus plus; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; - typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; - typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; - typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; - }; - }; -} - -// Stream -------------------------------------------------------------------- - -#ifndef __SYNTHESIS__ -template -inline std::ostream& operator << (std::ostream &os, const AC_FL() &x) { - os << x.to_string(AC_DEC); - return os; -} -#endif - -#define FL_BIN_OP_WITH_CTYPE(BIN_OP, C_TYPE, RTYPE) \ - template< AC_FL_T() > \ - inline typename AC_FL()::template rt_T2::RTYPE operator BIN_OP ( C_TYPE c_op, const AC_FL() &op) { \ - typedef typename ac::template ac_float_represent::type fl2_t; \ - return fl2_t(c_op).operator BIN_OP (op); \ - } \ - template< AC_FL_T() > \ - inline typename AC_FL()::template rt_T::RTYPE operator BIN_OP ( const AC_FL() &op, C_TYPE c_op) { \ - typedef typename ac::template ac_float_represent::type fl2_t; \ - return op.operator BIN_OP (fl2_t(c_op)); \ - } - -#define FL_REL_OP_WITH_CTYPE(REL_OP, C_TYPE) \ - template< AC_FL_T() > \ - inline bool operator REL_OP ( const AC_FL() &op, C_TYPE op2) { \ - typedef typename ac::template ac_float_represent::type fl2_t; \ - return op.operator REL_OP (fl2_t(op2)); \ - } \ - template< AC_FL_T() > \ - inline bool operator REL_OP ( C_TYPE op2, const AC_FL() &op) { \ - typedef typename ac::template ac_float_represent::type fl2_t; \ - return fl2_t(op2).operator REL_OP (op); \ - } - -#define FL_ASSIGN_OP_WITH_CTYPE_2(ASSIGN_OP, C_TYPE) \ - template< AC_FL_T() > \ - inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, C_TYPE op2) { \ - typedef typename ac::template ac_float_represent::type fl2_t; \ - return op.operator ASSIGN_OP (fl2_t(op2)); \ - } - -#ifdef __AC_FLOAT_ENABLE_ALPHA -#define FL_BIN_OP_WITH_CTYPE_ALPHA(C_TYPE) \ - FL_BIN_OP_WITH_CTYPE(+, C_TYPE, plus) \ - FL_BIN_OP_WITH_CTYPE(-, C_TYPE, minus) -#else -#define FL_BIN_OP_WITH_CTYPE_ALPHA(C_TYPE) -#endif - -#define FL_OPS_WITH_CTYPE(C_TYPE) \ - FL_BIN_OP_WITH_CTYPE_ALPHA(C_TYPE) \ - FL_BIN_OP_WITH_CTYPE(*, C_TYPE, mult) \ - FL_BIN_OP_WITH_CTYPE(/, C_TYPE, div) \ - \ - FL_REL_OP_WITH_CTYPE(==, C_TYPE) \ - FL_REL_OP_WITH_CTYPE(!=, C_TYPE) \ - FL_REL_OP_WITH_CTYPE(>, C_TYPE) \ - FL_REL_OP_WITH_CTYPE(>=, C_TYPE) \ - FL_REL_OP_WITH_CTYPE(<, C_TYPE) \ - FL_REL_OP_WITH_CTYPE(<=, C_TYPE) \ - \ - FL_ASSIGN_OP_WITH_CTYPE_2(+=, C_TYPE) \ - FL_ASSIGN_OP_WITH_CTYPE_2(-=, C_TYPE) \ - FL_ASSIGN_OP_WITH_CTYPE_2(*=, C_TYPE) \ - FL_ASSIGN_OP_WITH_CTYPE_2(/=, C_TYPE) - -#define 
FL_SHIFT_OP_WITH_INT_CTYPE(BIN_OP, C_TYPE, RTYPE) \ - template< AC_FL_T() > \ - inline typename AC_FL()::template rt_i< ac_private::c_type_params::W, ac_private::c_type_params::S >::RTYPE operator BIN_OP ( const AC_FL() &op, C_TYPE i_op) { \ - typedef typename ac::template ac_int_represent::type i_t; \ - return op.operator BIN_OP (i_t(i_op)); \ - } - -#define FL_SHIFT_ASSIGN_OP_WITH_INT_CTYPE(ASSIGN_OP, C_TYPE) \ - template< AC_FL_T() > \ - inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, C_TYPE i_op) { \ - typedef typename ac::template ac_int_represent::type i_t; \ - return op.operator ASSIGN_OP (i_t(i_op)); \ - } - -#define FL_SHIFT_OPS_WITH_INT_CTYPE(C_TYPE) \ - FL_SHIFT_OP_WITH_INT_CTYPE(>>, C_TYPE, rshift) \ - FL_SHIFT_OP_WITH_INT_CTYPE(<<, C_TYPE, lshift) \ - FL_SHIFT_ASSIGN_OP_WITH_INT_CTYPE(>>=, C_TYPE) \ - FL_SHIFT_ASSIGN_OP_WITH_INT_CTYPE(<<=, C_TYPE) - -#define FL_OPS_WITH_INT_CTYPE(C_TYPE) \ - FL_OPS_WITH_CTYPE(C_TYPE) \ - FL_SHIFT_OPS_WITH_INT_CTYPE(C_TYPE) - -// --------------------------------------- End of Macros for Binary Operators with C Floats - - // Binary Operators with C Floats -------------------------------------------- - FL_OPS_WITH_CTYPE(float) - FL_OPS_WITH_CTYPE(double) - FL_OPS_WITH_INT_CTYPE(bool) - FL_OPS_WITH_INT_CTYPE(char) - FL_OPS_WITH_INT_CTYPE(signed char) - FL_OPS_WITH_INT_CTYPE(unsigned char) - FL_OPS_WITH_INT_CTYPE(short) - FL_OPS_WITH_INT_CTYPE(unsigned short) - FL_OPS_WITH_INT_CTYPE(int) - FL_OPS_WITH_INT_CTYPE(unsigned int) - FL_OPS_WITH_INT_CTYPE(long) - FL_OPS_WITH_INT_CTYPE(unsigned long) - FL_OPS_WITH_INT_CTYPE(Slong) - FL_OPS_WITH_INT_CTYPE(Ulong) - // -------------------------------------- End of Binary Operators with C Floats - -// Macros for Binary Operators with ac_int -------------------------------------------- - -#define FL_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ - template< AC_FL_T(), int WI, bool SI> \ - inline typename AC_FL()::template rt_T2< ac_int >::RTYPE operator BIN_OP ( const ac_int &i_op, const AC_FL() &op) { \ - typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ - return fl2_t(i_op).operator BIN_OP (op); \ - } - -#define FL_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) \ - template< AC_FL_T(), int WI, bool SI> \ - inline typename AC_FL()::template rt_T2< ac_int >::RTYPE operator BIN_OP ( const AC_FL() &op, const ac_int &i_op) { \ - typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ - return op.operator BIN_OP (fl2_t(i_op)); \ - } - -#define FL_BIN_OP_WITH_AC_INT(BIN_OP, RTYPE) \ - FL_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ - FL_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) - -#define FL_REL_OP_WITH_AC_INT(REL_OP) \ - template< AC_FL_T(), int WI, bool SI> \ - inline bool operator REL_OP ( const AC_FL() &op, const ac_int &op2) { \ - typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ - return op.operator REL_OP (fl2_t(op2)); \ - } \ - template< AC_FL_T(), int WI, bool SI> \ - inline bool operator REL_OP ( ac_int &op2, const AC_FL() &op) { \ - typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ - return fl2_t(op2).operator REL_OP (op); \ - } - -#define FL_ASSIGN_OP_WITH_AC_INT(ASSIGN_OP) \ - template< AC_FL_T(), int WI, bool SI> \ - inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, const ac_int &op2) { \ - typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ - return op.operator ASSIGN_OP (fl2_t(op2)); \ - } - -// -------------------------------------------- End of Macros for Binary Operators with ac_int - - // Binary Operators with ac_int 
-------------------------------------------- -#ifdef __AC_FLOAT_ENABLE_ALPHA - FL_BIN_OP_WITH_AC_INT(+, plus) - FL_BIN_OP_WITH_AC_INT(-, minus) -#endif - FL_BIN_OP_WITH_AC_INT(*, mult) - FL_BIN_OP_WITH_AC_INT(/, div) - - FL_REL_OP_WITH_AC_INT(==) - FL_REL_OP_WITH_AC_INT(!=) - FL_REL_OP_WITH_AC_INT(>) - FL_REL_OP_WITH_AC_INT(>=) - FL_REL_OP_WITH_AC_INT(<) - FL_REL_OP_WITH_AC_INT(<=) - - FL_ASSIGN_OP_WITH_AC_INT(+=) - FL_ASSIGN_OP_WITH_AC_INT(-=) - FL_ASSIGN_OP_WITH_AC_INT(*=) - FL_ASSIGN_OP_WITH_AC_INT(/=) - FL_ASSIGN_OP_WITH_AC_INT(%=) - // -------------------------------------- End of Binary Operators with ac_int - -// Macros for Binary Operators with ac_fixed -------------------------------------------- - -#define FL_BIN_OP_WITH_AC_FIXED_1(BIN_OP, RTYPE) \ - template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ - inline typename AC_FL()::template rt_T2< ac_fixed >::RTYPE operator BIN_OP ( const ac_fixed &f_op, const AC_FL() &op) { \ - typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ - return fl2_t(f_op).operator BIN_OP (op); \ - } - -#define FL_BIN_OP_WITH_AC_FIXED_2(BIN_OP, RTYPE) \ - template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ - inline typename AC_FL()::template rt_T2< ac_fixed >::RTYPE operator BIN_OP ( const AC_FL() &op, const ac_fixed &f_op) { \ - typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ - return op.operator BIN_OP (fl2_t(f_op)); \ - } - -#define FL_BIN_OP_WITH_AC_FIXED(BIN_OP, RTYPE) \ - FL_BIN_OP_WITH_AC_FIXED_1(BIN_OP, RTYPE) \ - FL_BIN_OP_WITH_AC_FIXED_2(BIN_OP, RTYPE) - -#define FL_REL_OP_WITH_AC_FIXED(REL_OP) \ - template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ - inline bool operator REL_OP ( const AC_FL() &op, const ac_fixed &op2) { \ - typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ - return op.operator REL_OP (fl2_t(op2)); \ - } \ - template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ - inline bool operator REL_OP ( ac_fixed &op2, const AC_FL() &op) { \ - typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ - return fl2_t(op2).operator REL_OP (op); \ - } - -#define FL_ASSIGN_OP_WITH_AC_FIXED(ASSIGN_OP) \ - template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ - inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, const ac_fixed &op2) { \ - typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ - return op.operator ASSIGN_OP (fl2_t(op2)); \ - } - -// -------------------------------------------- End of Macros for Binary Operators with ac_fixed - - // Binary Operators with ac_fixed -------------------------------------------- -#ifdef __AC_FLOAT_ENABLE_ALPHA - FL_BIN_OP_WITH_AC_FIXED(+, plus) - FL_BIN_OP_WITH_AC_FIXED(-, minus) -#endif - FL_BIN_OP_WITH_AC_FIXED(*, mult) - FL_BIN_OP_WITH_AC_FIXED(/, div) - - FL_REL_OP_WITH_AC_FIXED(==) - FL_REL_OP_WITH_AC_FIXED(!=) - FL_REL_OP_WITH_AC_FIXED(>) - FL_REL_OP_WITH_AC_FIXED(>=) - FL_REL_OP_WITH_AC_FIXED(<) - FL_REL_OP_WITH_AC_FIXED(<=) - - FL_ASSIGN_OP_WITH_AC_FIXED(+=) - FL_ASSIGN_OP_WITH_AC_FIXED(-=) - FL_ASSIGN_OP_WITH_AC_FIXED(*=) - FL_ASSIGN_OP_WITH_AC_FIXED(/=) - // -------------------------------------- End of Binary Operators with ac_fixed - -// Global templatized functions for easy initialization to special values -template -inline AC_FL() value( AC_FL() ) { - AC_FL() r; - return r.template set_val(); -} - -namespace ac { -// function to initialize (or uninitialize) arrays 
-  template<ac_special_val V, AC_FL_T() >
-  inline bool init_array( AC_FL() *a, int n) {
-    AC_FL0() t;
-    t.template set_val<V>();
-    for(int i=0; i < n; i++)
-      a[i] = t;
-    return true;
-  }
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-#if (defined(_MSC_VER) && !defined(__EDG__))
-#pragma warning( pop )
-#endif
-#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__))
-#pragma GCC diagnostic pop
-#endif
-#if defined(__clang__)
-#pragma clang diagnostic pop
-#endif
-
-#ifdef __AC_NAMESPACE
-}
-#endif
-
-#endif // __AC_FLOAT_H
+/**************************************************************************
+ *                                                                        *
+ *  Algorithmic C (tm) Datatypes                                          *
+ *                                                                        *
+ *  Software Version: 4.0                                                 *
+ *                                                                        *
+ *  Release Date    : Sat Jun 13 12:35:18 PDT 2020                        *
+ *  Release Type    : Production Release                                  *
+ *  Release Build   : 4.0.0                                               *
+ *                                                                        *
+ *  Copyright 2013-2019, Mentor Graphics Corporation,                     *
+ *                                                                        *
+ *  All Rights Reserved.                                                  *
+ *                                                                        *
+ **************************************************************************
+ *  Licensed under the Apache License, Version 2.0 (the "License");       *
+ *  you may not use this file except in compliance with the License.      *
+ *  You may obtain a copy of the License at                               *
+ *                                                                        *
+ *      http://www.apache.org/licenses/LICENSE-2.0                        *
+ *                                                                        *
+ *  Unless required by applicable law or agreed to in writing, software   *
+ *  distributed under the License is distributed on an "AS IS" BASIS,     *
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or       *
+ *  implied.                                                              *
+ *  See the License for the specific language governing permissions and   *
+ *  limitations under the License.                                        *
+ **************************************************************************
+ *                                                                        *
+ *  The most recent version of this package is available at github.      *
+ *                                                                        *
+ *************************************************************************/
+
+// Source: ac_float.h
+// Description: class for floating point operation handling in C++
+// Author: Andres Takach, Ph.D.
+
+#ifndef __AC_FLOAT_H
+#define __AC_FLOAT_H
+
+#include <ac_fixed.h>
+
+#ifndef __SYNTHESIS__
+#include <cmath>
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ < 3 && !defined(__EDG__))
+#error GCC version 3 or greater is required to include this header file
+#endif
+
+#if (defined(_MSC_VER) && _MSC_VER < 1400 && !defined(__EDG__))
+#error Microsoft Visual Studio 8 or newer is required to include this header file
+#endif
+
+#if (defined(_MSC_VER) && !defined(__EDG__))
+#pragma warning( push )
+#pragma warning( disable: 4003 4127 4308 4365 4514 4800 )
+#endif
+#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__))
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wparentheses"
+#endif
+#if defined(__clang__)
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wparentheses"
+#pragma clang diagnostic ignored "-Wlogical-op-parentheses"
+#pragma clang diagnostic ignored "-Wbitwise-op-parentheses"
+#endif
+
+// for safety
+#if (defined(E) || defined(WF) || defined(IF) || defined(SF))
+#error One or more of the following is defined: E, WF, IF, SF. Definition conflicts with their usage as template parameters.
+#error DO NOT use defines before including third party header files.
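// For orientation while reading the re-added header: an ac_float<W,I,E,Q>
// pairs a signed W-bit fixed-point mantissa (I integer bits) with a signed
// E-bit exponent, so the represented value is mantissa * 2^exponent. A
// minimal, hedged usage sketch, assuming only that ac_float.h is on the
// include path; the shapes and variable names are illustrative, not part of
// this patch:

    #include <ac_float.h>

    int main() {
      ac_float<25,2,8> x = 1.5;    // 25-bit mantissa, 2 integer bits, 8-bit exponent
      ac_float<25,2,8> y = 0.25;
      ac_float<25,2,8> p = x * y;  // mult: mantissas multiply, exponents add
      return p.to_double() == 0.375 ? 0 : 1;
    }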
+#endif + +#define AC_FL(v) ac_float +#define AC_FL0(v) ac_float +#define AC_FL_T(v) int W##v, int I##v, int E##v, ac_q_mode Q##v +#define AC_FL_TV(v) W##v, I##v, E##v, Q##v +#define AC_FL_T0(v) int W##v, int I##v, int E##v +#define AC_FL_TV0(v) W##v, I##v, E##v + +#ifdef __AC_NAMESPACE +namespace __AC_NAMESPACE { +#endif + +template class ac_float; + +namespace ac_private { + + typedef ac_float<54,2,11> ac_float_cdouble_t; + typedef ac_float<25,2,8> ac_float_cfloat_t; + + template + struct rt_ac_float_T { + template< AC_FL_T0() > + struct op1 { + typedef AC_FL0() fl_t; + typedef typename T::template rt_T::mult mult; + typedef typename T::template rt_T::plus plus; + typedef typename T::template rt_T::minus2 minus; + typedef typename T::template rt_T::minus minus2; + typedef typename T::template rt_T::logic logic; + typedef typename T::template rt_T::div2 div; + typedef typename T::template rt_T::div div2; + }; + }; + // specializations after definition of ac_float + + inline ac_float_cdouble_t double_to_ac_float(double d); + inline ac_float_cfloat_t float_to_ac_float(float f); +} + +////////////////////////////////////////////////////////////////////////////// +// ac_float +////////////////////////////////////////////////////////////////////////////// + +template< AC_FL_T() > +class ac_float { + enum { NO_UN = true, S = true, S2 = true, SR = true }; +public: + typedef ac_fixed mant_t; + typedef ac_int exp_t; + mant_t m; + exp_t e; + + void set_mantissa(const ac_fixed &man) { m = man; } + void set_exp(const ac_int &exp) { if(E) e = exp; } + +private: + inline bool is_neg() const { return m < 0; } // is_neg would be more efficient + + enum {NZ_E = !!E, MIN_EXP = -(NZ_E << (E-NZ_E)), MAX_EXP = (1 << (E-NZ_E))-1}; + +public: + static const int width = W; + static const int i_width = I; + static const int e_width = E; + static const bool sign = S; + static const ac_q_mode q_mode = Q; + static const ac_o_mode o_mode = AC_SAT; + + template< AC_FL_T0(2) > + struct rt { + enum { + // need to validate + F=W-I, + F2=W2-I2, + mult_w = W+W2, + mult_i = I+I2, + mult_e = AC_MAX(E,E2)+1, + mult_s = S||S2, + plus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), + plus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, + plus_e = AC_MAX(E,E2), + plus_s = S||S2, + minus_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1+AC_MAX(F,F2), + minus_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+1, + minus_e = AC_MAX(E,E2), + minus_s = true, + div_w = W+AC_MAX(W2-I2,0)+S2, + div_i = I+(W2-I2)+S2, + div_e = AC_MAX(E,E2)+1, + div_s = S||S2, + logic_w = AC_MAX(I+(S2&&!S),I2+(S&&!S2))+AC_MAX(F,F2), + logic_i = AC_MAX(I+(S2&&!S),I2+(S&&!S2)), + logic_s = S||S2, + logic_e = AC_MAX(E,E2) + }; + typedef ac_float mult; + typedef ac_float plus; + typedef ac_float minus; + typedef ac_float logic; + typedef ac_float div; + typedef ac_float arg1; + + }; + + template + struct rt_i { + enum { + lshift_w = W, + lshift_i = I, + lshift_s = S, + lshift_e_0 = exp_t::template rt::plus::width, + lshift_e = AC_MIN(lshift_e_0, 24), + rshift_w = W, + rshift_i = I, + rshift_s = S, + rshift_e_0 = exp_t::template rt::minus::width, + rshift_e = AC_MIN(rshift_e_0, 24) + }; + typedef ac_float lshift; + typedef ac_float rshift; + }; + + template + struct rt_T { + typedef typename ac_private::map::t map_T; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::mult mult; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::plus plus; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus minus; + typedef typename 
ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus2 minus2; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::logic logic; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div div; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div2 div2; + typedef ac_float arg1; + }; + + template + struct rt_T2 { + typedef typename ac_private::map::t map_T; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::mult mult; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::plus plus; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus2 minus; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::minus minus2; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::logic logic; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div2 div; + typedef typename ac_private::rt_ac_float_T::template op1< AC_FL_TV0() >::div div2; + typedef ac_float arg1; + }; + + struct rt_unary { + enum { + neg_w = W+1, + neg_i = I+1, + neg_e = E, + neg_s = true, + mag_sqr_w = 2*W-S + NO_UN, + mag_sqr_i = 2*I-S + NO_UN, + mag_sqr_e = E, + mag_sqr_s = false | NO_UN, + mag_w = W+S + NO_UN, + mag_i = I+S + NO_UN, + mag_e = E, + mag_s = false | NO_UN, + to_fx_i = I + MAX_EXP, + to_fx_w = W + MAX_EXP - MIN_EXP, + to_fx_s = S, + to_i_w = AC_MAX(to_fx_i,1), + to_i_s = S + }; + typedef ac_float neg; + typedef ac_float mag_sqr; + typedef ac_float mag; + template + struct set { + enum { sum_w = W + ac::log2_ceil::val, sum_i = (sum_w-W) + I, sum_e = E, sum_s = S}; + typedef ac_float sum; + }; + typedef ac_fixed to_ac_fixed_t; + typedef ac_int to_ac_int_t; + }; + + template friend class ac_float; + + ac_float() { +#if defined(AC_DEFAULT_IN_RANGE) +#endif + } + ac_float(const ac_float &op) { + m = op.m; + e = op.e; + } + +private: + template + bool round(const ac_fixed &op2, bool assert_on_rounding=false) { + const bool rnd = Q!=AC_TRN && Q!=AC_TRN_ZERO && W2 > W; + bool rnd_ovfl = false; + m = 0; + if(rnd) { + ac_fixed m_1 = op2; + // overflow because of rounding would lead to go from 001111 to 01000 (extra bit prevents it) + // change from 01000 to 00100 and store 0100 in m + rnd_ovfl = !m_1[W] & m_1[W-1]; + m_1[W-1] = m_1[W-1] & !rnd_ovfl; + m_1[W-2] = m_1[W-2] | rnd_ovfl; + m.set_slc(0, m_1.template slc(0)); + if(assert_on_rounding) + AC_ASSERT(m == op2, "Loss of precision due to Rounding"); + return rnd_ovfl; + } else { + ac_fixed m_0 = op2; + m.set_slc(0, m_0.template slc(0)); + return false; + } + } + + template + void assign_from(const ac_fixed &m2, int e2, bool sticky_bit, bool normalize, bool assert_on_rounding=false) { + const bool rnd = Q!=AC_TRN & Q!=AC_TRN_ZERO & W2 > W; + const bool need_rnd_bit = Q != AC_TRN; + const bool need_rem_bits = need_rnd_bit && Q != AC_RND; + + const int msb_min_power = I-1 + MIN_EXP; + const int msb_min_power2 = I2-1 + min_exp2; + const int msb_min_power_dif = msb_min_power - msb_min_power2; + // if > 0: target has additional negative exponent range + // subnormal maybe be further normalized (done even if normalize==false) + // if < 0: target has less negative exponent range + // mantissa may need to be shifted right + // in either case if source is unnormalized + // normalization could take place + + const int msb_max_power = I-1 + MAX_EXP; + const int msb_max_power2 = I2-1 + max_exp2 + rnd; + const int msb_max_power_dif = msb_max_power - msb_max_power2; + + const bool 
may_shift_right = msb_min_power_dif > 0; + const int max_right_shift = may_shift_right ? msb_min_power_dif : 0; + const int t_width = W2 + (W >= W2 ? AC_MIN(W-W2+may_shift_right, max_right_shift) : 0); + + int e_t = e2; + e_t += I2-I; + typedef ac_fixed op2_t; + op2_t op2 = m2; + int ls = 0; + bool r_zero; + if(normalize) { + bool all_sign; + ls = m2.leading_sign(all_sign); + r_zero = all_sign & !m2[0]; + } else if(msb_min_power_dif < 0 || msb_max_power_dif < 0 || W2 > W) { + // msb_min_power_dif < 0: src exponent less negative than trg exp represents + // oportunity to further normalize value in trg representation + // msb_max_power_dif < 0: max target exp is less than max src exp + // if un-normalized exp may overflow resulting in incorrect saturation + // normalization is needed for correctness + // W2 > W + // if un-normalized, extra bits may be incorrectly quantized away + const int msb_range_dif = AC_MAX(-msb_min_power_dif, -msb_max_power_dif); + const int msb_range_dif_norm_w = AC_MIN(msb_range_dif,W2-1); + const int extra_bits = AC_MAX(W2-W,0); + const int norm_w = AC_MAX(msb_range_dif_norm_w, extra_bits) + 1; + bool all_sign; + ls = m2.template slc(W2-norm_w).leading_sign(all_sign); + r_zero = all_sign & !m2[W2-1] & !(m2 << norm_w); + } else { + r_zero = !m2; + } + int actual_max_shift_left = (1 << (E-1)) + e_t; + if(may_shift_right && actual_max_shift_left < 0) { + const int shift_r_w = ac::nbits::val; + ac_int shift_r = -actual_max_shift_left; + if((1 << (E-1)) + min_exp2 + I2-I < 0 && need_rem_bits) { + op2_t shifted_out_bits = op2; + shifted_out_bits &= ~((~op2_t(0)) << shift_r); + sticky_bit |= !!shifted_out_bits; + } + op2 >>= shift_r; + e_t += shift_r; + } else { + bool shift_exponent_limited = ls >= actual_max_shift_left; + int shift_l = shift_exponent_limited ? actual_max_shift_left : (int) ls; + op2 <<= shift_l; + e_t = shift_exponent_limited ? MIN_EXP : e_t - ls; + } + ac_fixed r_pre_rnd = 0; + r_pre_rnd.set_slc(need_rem_bits, op2.template slc(0)); + if(need_rem_bits) + r_pre_rnd[0] = sticky_bit; + + bool shift_r1 = round(r_pre_rnd); + e_t = r_zero ? 0 : e_t + shift_r1; + if(!(e_t < 0) & !!(e_t >> E-1)) { + e = MAX_EXP; + m = m < 0 ? 
value(m) : value(m); + } else { + e = e_t; + } + } + +public: + template + ac_float(const AC_FL(2) &op, bool assert_on_overflow=false, bool assert_on_rounding=false) { + typedef AC_FL(2) fl2_t; + const int min_exp2 = fl2_t::MIN_EXP; + const int max_exp2 = fl2_t::MAX_EXP; + assign_from(op.m, op.e, false, false); + } + + ac_float(const ac_fixed &m2, const ac_int &e2, bool normalize=true) { + m = m2; + e = e2; + if(normalize) + this->normalize(); + else + e &= ac_int<1,true>(!!m); + } + + template + ac_float(const ac_fixed &m2, const ac_int &e2, bool normalize=true) { + enum { WF2 = WFX+!SFX, IF2 = IFX+!SFX }; + ac_float f(ac_fixed(m2), e2, normalize); + *this = f; + } + + template + ac_float(const ac_fixed &op) { + assign_from<0,0>(ac_fixed(op), 0, false, true); + } + + template + ac_float(const ac_int &op) { + *this = ac_fixed(op); + } + + inline ac_float( bool b ) { *this = (ac_int<1,false>) b; } + inline ac_float( char b ) { *this = (ac_int<8,true>) b; } + inline ac_float( signed char b ) { *this = (ac_int<8,true>) b; } + inline ac_float( unsigned char b ) { *this = (ac_int<8,false>) b; } + inline ac_float( signed short b ) { *this = (ac_int<16,true>) b; } + inline ac_float( unsigned short b ) { *this = (ac_int<16,false>) b; } + inline ac_float( signed int b ) { *this = (ac_int<32,true>) b; } + inline ac_float( unsigned int b ) { *this = (ac_int<32,false>) b; } + inline ac_float( signed long b ) { *this = (ac_int) b; } + inline ac_float( unsigned long b ) { *this = (ac_int) b; } + inline ac_float( Slong b ) { *this = (ac_int<64,true>) b; } + inline ac_float( Ulong b ) { *this = (ac_int<64,false>) b; } + + // Explicit conversion functions to ac_int and ac_fixed + inline typename rt_unary::to_ac_fixed_t to_ac_fixed() const { + typename rt_unary::to_ac_fixed_t r = m; + r <<= e; + return r; + } + inline typename rt_unary::to_ac_int_t to_ac_int() const { + return to_ac_fixed().to_ac_int(); + } + + // Explicit conversion functions to C built-in types ------------- + inline int to_int() const { return to_ac_int().to_int(); } + inline unsigned to_uint() const { return to_ac_int().to_uint(); } + inline long to_long() const { return (signed long) to_ac_int().to_int64(); } + inline unsigned long to_ulong() const { return (unsigned long) to_ac_int().to_uint64(); } + inline Slong to_int64() const { return to_ac_int().to_int64(); } + inline Ulong to_uint64() const { return to_ac_int().to_uint64(); } + inline float to_float() const { return ldexpf(m.to_double(), exp()); } + inline double to_double() const { return ldexp(m.to_double(), exp()); } + + const ac_fixed mantissa() const { return m; } + const ac_int exp() const { return e; } + bool normalize() { + bool all_sign; + int ls = m.leading_sign(all_sign); + bool m_zero = all_sign & !m[0]; + const int max_shift_left = (1 << (E-1)) + e; + bool normal = ls <= max_shift_left; + int shift_l = normal ? 
ls : max_shift_left; + m <<= shift_l; + e = ac_int<1,true>(!m_zero) & (e - shift_l); + return normal; + } + + ac_float( double d, bool assert_on_overflow=false, bool assert_on_rounding=false ) { + enum { I_EXT = AC_MAX(I,1), W_EXT = ac_private::ac_float_cdouble_t::width + I_EXT - 1, }; + ac_private::ac_float_cdouble_t t = ac_private::double_to_ac_float(d); + ac_float r(t, assert_on_overflow, assert_on_rounding); + *this = r; + } + + ac_float( float f, bool assert_on_overflow=false, bool assert_on_rounding=false ) { + enum { I_EXT = AC_MAX(I,1), W_EXT = ac_private::ac_float_cfloat_t::width + I_EXT - 1, }; + ac_private::ac_float_cfloat_t t = ac_private::float_to_ac_float(f); + ac_float r(t, assert_on_overflow, assert_on_rounding); + *this = r; + } + + template + bool compare(const AC_FL(2) &op2, bool *gt) const { + typedef ac_fixed fx2_t; + typedef typename ac_fixed::template rt_T< fx2_t >::logic fx_t; + typedef ac_fixed fxu_t; + + fx2_t op2_m_0; + op2_m_0.set_slc(0, op2.m.template slc(0)); + + fx_t op1_m = m; + fx_t op2_m = op2_m_0; + int e_dif = exp() - op2.exp() + I - I2; + bool op2_m_neg = op2_m[fx_t::width-1]; + fx_t out_bits = op2_m ^ ((op2_m_neg & e_dif < 0) ? ~fx_t(0) : fx_t(0)); + out_bits &= ~(fxu_t(~fxu_t(0)) << e_dif); + op2_m >>= e_dif; + bool overflow = e_dif < 0 & !!out_bits | op2_m_neg ^ op2_m[fx_t::width-1]; + + *gt = overflow & op2_m_neg | !overflow & op1_m > op2_m; + bool eq = op1_m == op2_m & !overflow & !out_bits; + return eq; + } + + template + void plus_minus(const AC_FL(2) &op2, AC_FL(R) &r, bool sub=false) const { + typedef AC_FL(2) op2_t; + enum { IT = AC_MAX(I,I2) }; + typedef ac_fixed fx1_t; + typedef ac_fixed fx2_t; + // covers fx1_t and r mantissas (adds additional LSBs if WR > W) + typedef typename fx1_t::template rt_T< ac_fixed >::logic fx1r_t; + // covers fx2_t and r mantissas (adds additional LSBs if WR > W2) + typedef typename fx2_t::template rt_T< ac_fixed >::logic fx2r_t; + // mt_t adds one integer bit for the plus + // op1_m, op2_m, op_sl, sticky_bits + typedef typename fx1r_t::template rt_T::plus mt_t; + + const bool round_bit_needed = QR != AC_TRN; + const bool remaining_bits_needed = !(QR == AC_TRN || QR == AC_RND); + + const int w_r_with_round_bits = WR + round_bit_needed; + + // naming: sn = subnormal, n = normal, wc = worst case + // worst case (wc) normalize is when one operand has smallest subnormal + // and other operand is shifted right so that its MSB lines up with LSB of subnormal + const int power_smallest_sn1 = I - W - (1 << (E-1)); + const int power_smallest_sn2 = I2 - W2 - (1 << (E2-1)); + const int power_smallest_sn_dif1 = AC_MAX(0,power_smallest_sn2 - power_smallest_sn1); + const int power_smallest_sn_dif2 = AC_MAX(0,power_smallest_sn1 - power_smallest_sn2); + const int wc_norm_shift1 = W2-1 + AC_MIN(power_smallest_sn_dif1, W-1); + const int wc_norm_shift2 = W-1 + AC_MIN(power_smallest_sn_dif2, W2-1); + const int wc_sn_norm_shift = AC_MAX(wc_norm_shift1, wc_norm_shift2); + const int w_sn_overlap = wc_sn_norm_shift + 1; + + // cases when one operand is subnormal and other is shifted right and does not overlap bits + // subnormal op could be normalized by width-1 bits + const int w_sn_no_overlap1 = W + AC_MIN(w_r_with_round_bits, power_smallest_sn_dif2); + const int w_sn_no_overlap2 = W2 + AC_MIN(w_r_with_round_bits, power_smallest_sn_dif1); + const int w_sn_no_overlap = AC_MAX(w_sn_no_overlap1, w_sn_no_overlap2); + + const int w_sn = AC_MAX(w_sn_overlap, w_sn_no_overlap); + + // For example 0100 + (1000 0001 >> 1) = 0000 0000 1, 
wc_n_norm_shift = max(4,8) + const int msb0h1 = I-1 + (int) MAX_EXP; + const int msb1h1 = msb0h1-1; + const int msb0l1 = I-1 + (int) MIN_EXP; + const int msb1l1 = msb0h1-1; + const int msb0h2 = I2-1 + (int) op2_t::MAX_EXP; + const int msb1h2 = msb0h2-1; + const int msb0l2 = I2-1 + (int) op2_t::MIN_EXP; + const int msb1l2 = msb0h2-1; + // bit W-1 overlap with bit W2-2 + const bool msb_overlap1 = msb1h2 >= msb0h1 && msb0h1 <= msb1l2 + || msb1h2 >= msb0l1 && msb0l1 <= msb1l2 + || msb0h1 >= msb1h2 && msb1h2 >= msb0l1; + // bit W2-1 overlap with bit W1-2 + const bool msb_overlap2 = msb1h1 >= msb0h2 && msb0h2 <= msb1l1 + || msb1h1 >= msb0l2 && msb0l2 <= msb1l1 + || msb0h2 >= msb1h1 && msb1h1 >= msb0l2; + const bool msb_overlap = msb_overlap1 || msb_overlap2; + const int wc_n_norm_shift = AC_MAX(W,W2); + const int w_n_msb_overlap = msb_overlap ? wc_n_norm_shift + 1 : 0; + // addition of two numbers of different sign can result in a normalization by 1 (therefore + 1) + const int w_n_no_msb_overlap = w_r_with_round_bits + 1; + const int w_n = AC_MAX(w_n_msb_overlap, w_n_no_msb_overlap); + + // +1 is to prevent overflow during addition + const int tr_t_width = AC_MAX(w_n, w_sn) + 1; + typedef ac_fixed add_t; + + const int min_E = (int) MIN_EXP + I-IT; + const int min_E2 = (int) AC_FL(2)::MIN_EXP + I2-IT; + const int min_ET = AC_MIN(min_E, min_E2); + + const int max_E = (int) MAX_EXP + I-IT; + const int max_E2 = (int) AC_FL(2)::MAX_EXP + I2-IT; + const int max_ET = AC_MAX(max_E, max_E2); + + ac_fixed op1_m_0 = m; + mt_t op1_m = 0; + op1_m.set_slc(0, op1_m_0.template slc(0)); + int op1_e = exp() + I-IT; + + ac_fixed op2_m_0 = op2.m; + mt_t op2_m = 0; + op2_m.set_slc(0, op2_m_0.template slc(0)); + if(sub) + op2_m = -op2_m; + int op2_e = op2.exp() + I2-IT; + + bool op1_zero = operator !(); + bool op2_zero = !op2; + int e_dif = op1_e - op2_e; + bool e1_lt_e2 = e_dif < 0; + e_dif = (op1_zero | op2_zero) ? 0 : e1_lt_e2 ? -e_dif : e_dif; + + add_t op_lshift = e1_lt_e2 ? op1_m : op2_m; + mt_t op_no_shift = e1_lt_e2 ? op2_m : op1_m; + + bool sticky_bit = false; + if(remaining_bits_needed) { + mt_t shifted_out_bits = op_lshift; + // bits that are shifted out of a add_t (does not include potential 3 spare bits) + shifted_out_bits &= ~((~add_t(0)) << e_dif); + sticky_bit = !!shifted_out_bits; + } + op_lshift >>= e_dif; + + add_t add_r = op_lshift + op_no_shift; + int e_t = (e1_lt_e2 & !op2_zero | op1_zero ? op2_e : op1_e); + + r.template assign_from(add_r, e_t, sticky_bit, true); + } + + template + ac_float add(const AC_FL(1) &op1, const AC_FL(2) &op2) { + op1.plus_minus(op2, *this); + return *this; + } + + template + ac_float sub(const AC_FL(1) &op1, const AC_FL(2) &op2) { + op1.plus_minus(op2, *this, true); + return *this; + } + + typename rt_unary::neg abs() const { + typedef typename rt_unary::neg r_t; + r_t r; + r.m = is_neg() ? -m : r_t::mant_t(m); + r.e = e; + return r; + } + +#ifdef __AC_FLOAT_ENABLE_ALPHA + // These will be changed!!! 
For now only enable to explore integration with ac_complex
+  template< AC_FL_T(2) >
+  typename rt< AC_FL_TV0(2) >::plus operator +(const AC_FL(2) &op2) const {
+    typename rt< AC_FL_TV0(2) >::plus r;
+    plus_minus(op2, r);
+    return r;
+  }
+  template< AC_FL_T(2) >
+  typename rt< AC_FL_TV0(2) >::minus operator -(const AC_FL(2) &op2) const {
+    typename rt< AC_FL_TV0(2) >::minus r;
+    plus_minus(op2, r, true);
+    return r;
+  }
+#endif
+
+  template< AC_FL_T(2) >
+  typename rt< AC_FL_TV0(2) >::mult operator *(const AC_FL(2) &op2) const {
+    typedef typename rt< AC_FL_TV0(2) >::mult r_t;
+    r_t r(m*op2.m, exp()+op2.exp(), false);
+    return r;
+  }
+
+  template< AC_FL_T(2) >
+  typename rt< AC_FL_TV0(2) >::div operator /(const AC_FL(2) &op2) const {
+    typename rt< AC_FL_TV0(2) >::div r(m/op2.m, exp()-op2.exp());
+    return r;
+  }
+  template< AC_FL_T(2) >
+  ac_float &operator +=(const AC_FL(2) &op2) {
+    ac_float r;
+    plus_minus(op2, r);
+    *this = r;
+    return *this;
+  }
+  template< AC_FL_T(2) >
+  ac_float &operator -=(const AC_FL(2) &op2) {
+    ac_float r;
+    plus_minus(op2, r, true);
+    *this = r;
+    return *this;
+  }
+  template< AC_FL_T(2) >
+  ac_float &operator *=(const AC_FL(2) &op2) {
+    *this = *this * op2;
+    return *this;
+  }
+  template< AC_FL_T(2) >
+  ac_float &operator /=(const AC_FL(2) &op2) {
+    *this = *this / op2;
+    return *this;
+  }
+  ac_float operator + () const {
+    return *this;
+  }
+  typename rt_unary::neg operator - () const {
+    typename rt_unary::neg r;
+    r.m = -m;
+    r.e = e;
+    return r;
+  }
+  bool operator ! () const {
+    return !m;
+  }
+
+  // Shift --------------------------------------------------------------------
+  template<int W2, bool S2>
+  typename rt_i<W2,S2>::lshift operator << ( const ac_int<W2,S2> &op2 ) const {
+    typename rt_i<W2,S2>::lshift r;
+    r.m = m;
+    r.e = e + op2;
+    return r;
+  }
+  template<int W2, bool S2>
+  typename rt_i<W2,S2>::rshift operator >> ( const ac_int<W2,S2> &op2 ) const {
+    typename rt_i<W2,S2>::rshift r;
+    r.m = m;
+    r.e = e - op2;
+    return r;
+  }
+  // Shift assign -------------------------------------------------------------
+  template<int W2, bool S2>
+  ac_float &operator <<= ( const ac_int<W2,S2> &op2 ) {
+    *this = operator << (op2);
+    return *this;
+  }
+  template<int W2, bool S2>
+  ac_float &operator >>= ( const ac_int<W2,S2> &op2 ) {
+    *this = operator >> (op2);
+    return *this;
+  }
+
+  template< AC_FL_T(2) >
+  bool operator == (const AC_FL(2) &f) const {
+    bool gt;
+    return compare(f, &gt);
+  }
+  template< AC_FL_T(2) >
+  bool operator != (const AC_FL(2) &f) const {
+    return !operator == (f);
+  }
+  template< AC_FL_T(2) >
+  bool operator < (const AC_FL(2) &f) const {
+    bool gt;
+    bool eq = compare(f, &gt);
+    return !(eq | gt);
+  }
+  template< AC_FL_T(2) >
+  bool operator >= (const AC_FL(2) &f) const {
+    return !operator < (f);
+  }
+  template< AC_FL_T(2) >
+  bool operator > (const AC_FL(2) &f) const {
+    bool gt;
+    compare(f, &gt);
+    return gt;
+  }
+  template< AC_FL_T(2) >
+  bool operator <= (const AC_FL(2) &f) const {
+    return !operator > (f);
+  }
+
+  inline std::string to_string(ac_base_mode base_rep, bool sign_mag = false, bool hw=true) const {
+    // TODO: printing decimal with exponent
+    if(!hw) {
+      ac_fixed<W,0,S> mantissa;
+      mantissa.set_slc(0, m.template slc<W>(0));
+      std::string r = mantissa.to_string(base_rep, sign_mag);
+      r += "e2";
+      r += (e + I).to_string(base_rep, sign_mag | base_rep == AC_DEC);
+      return r;
+    } else {
+      std::string r = m.to_string(base_rep, sign_mag);
+      if(base_rep != AC_DEC)
+        r += "_";
+      r += "e2";
+      if(base_rep != AC_DEC)
+        r += "_";
+      if(E)
+        r += e.to_string(base_rep, sign_mag | base_rep == AC_DEC);
+      else
+        r += "0";
+      return r;
+    }
+  }
+
+  inline static std::string type_name() {
+    const char *tf[] = {"false", "true" };
+    const char *q[] = {"AC_TRN", "AC_RND", "AC_TRN_ZERO", "AC_RND_ZERO", "AC_RND_INF", "AC_RND_MIN_INF", "AC_RND_CONV" };
+    std::string r =
"ac_float<"; + r += ac_int<32,true>(W).to_string(AC_DEC) + ','; + r += ac_int<32,true>(I).to_string(AC_DEC) + ','; + r += ac_int<32,true>(E).to_string(AC_DEC) + ','; + r += tf[S]; + r += ','; + r += q[Q]; + r += '>'; + return r; + } + + template + inline ac_float &set_val() { + m.template set_val(); + if(V == AC_VAL_MIN) + e.template set_val(); + else if(V == AC_VAL_QUANTUM) + e.template set_val(); + else + e.template set_val(); + return *this; + } +}; + +namespace ac_private { + template + bool ac_fpclassify(T x, bool &inf) { + bool nan = !(x==x); + if(!nan) { + T d = x - x; + inf = !(d==d); + } + return nan; + } + + inline ac_float_cdouble_t double_to_ac_float(double d) { + typedef ac_float_cdouble_t r_t; +#ifndef __SYNTHESIS__ + bool inf; + bool nan = ac_fpclassify(d, inf); + if(nan) + AC_ASSERT(0, "In conversion from double to ac_float: double is NaN"); + else if(inf) + AC_ASSERT(0, "In conversion from double to ac_float: double is Infinite"); +#endif + r_t::exp_t exp; + r_t::mant_t mant = ac::frexp_d(d, exp); + return r_t(mant, exp, false); + } + + inline ac_float_cfloat_t float_to_ac_float(float f) { + typedef ac_float_cfloat_t r_t; +#ifndef __SYNTHESIS__ + bool inf; + bool nan = ac_fpclassify(f, inf); + if(nan) + AC_ASSERT(0, "In conversion from float to ac_float: float is NaN"); + else if(inf) + AC_ASSERT(0, "In conversion from float to ac_float: float is Infinite"); +#endif + r_t::exp_t exp; + r_t::mant_t mant = ac::frexp_f(f, exp); + return r_t(mant, exp, false); + } +}; + +namespace ac { + template + struct ac_float_represent { + typedef typename ac_fixed_represent::type fx_t; + typedef ac_float type; + }; + template<> struct ac_float_represent { + typedef ac_private::ac_float_cfloat_t type; + }; + template<> struct ac_float_represent { + typedef ac_private::ac_float_cdouble_t type; + }; +} + +namespace ac_private { + // with T == ac_float + template< AC_FL_T0(2) > + struct rt_ac_float_T< AC_FL0(2) > { + typedef AC_FL0(2) fl2_t; + template< AC_FL_T0() > + struct op1 { + typedef AC_FL0() fl_t; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::plus plus; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; + typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; + typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; + }; + }; + // with T == ac_fixed + template + struct rt_ac_float_T< ac_fixed > { + // For now E2 > 0 + enum { E2 = 1, S2 = true, W2 = WFX + !SFX, I2 = IFX + !SFX }; + typedef AC_FL0(2) fl2_t; + template< AC_FL_T0() > + struct op1 { + typedef AC_FL0() fl_t; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::plus plus; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; + typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; + typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; + }; + }; + // with T == ac_int + template + struct rt_ac_float_T< ac_int > { + // For now E2 > 0 + enum { E2 = 1, S2 = true, I2 = WI + !SI, W2 = I2 }; + typedef AC_FL0(2) fl2_t; + template< AC_FL_T0() > + struct op1 { + typedef AC_FL0() fl_t; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; + typedef typename 
fl_t::template rt< AC_FL_TV0(2) >::plus plus; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; + typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; + typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; + }; + }; + + // Multiplication is optimizable, general operator +/- is not yet supported + template + struct rt_ac_float_T< c_type > { + // For now E2 > 0 + enum { SCT = c_type_params::S, S2 = true, W2 = c_type_params::W + !SCT, I2 = c_type_params::I + !SCT, E2 = AC_MAX(1, c_type_params::E) }; + typedef AC_FL0(2) fl2_t; + template< AC_FL_T0() > + struct op1 { + typedef AC_FL0() fl_t; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::mult mult; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::plus plus; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::minus minus; + typedef typename fl2_t::template rt< AC_FL_TV0() >::minus minus2; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::logic logic; + typedef typename fl_t::template rt< AC_FL_TV0(2) >::div div; + typedef typename fl2_t::template rt< AC_FL_TV0() >::div div2; + }; + }; +} + +// Stream -------------------------------------------------------------------- + +#ifndef __SYNTHESIS__ +template +inline std::ostream& operator << (std::ostream &os, const AC_FL() &x) { + os << x.to_string(AC_DEC); + return os; +} +#endif + +#define FL_BIN_OP_WITH_CTYPE(BIN_OP, C_TYPE, RTYPE) \ + template< AC_FL_T() > \ + inline typename AC_FL()::template rt_T2::RTYPE operator BIN_OP ( C_TYPE c_op, const AC_FL() &op) { \ + typedef typename ac::template ac_float_represent::type fl2_t; \ + return fl2_t(c_op).operator BIN_OP (op); \ + } \ + template< AC_FL_T() > \ + inline typename AC_FL()::template rt_T::RTYPE operator BIN_OP ( const AC_FL() &op, C_TYPE c_op) { \ + typedef typename ac::template ac_float_represent::type fl2_t; \ + return op.operator BIN_OP (fl2_t(c_op)); \ + } + +#define FL_REL_OP_WITH_CTYPE(REL_OP, C_TYPE) \ + template< AC_FL_T() > \ + inline bool operator REL_OP ( const AC_FL() &op, C_TYPE op2) { \ + typedef typename ac::template ac_float_represent::type fl2_t; \ + return op.operator REL_OP (fl2_t(op2)); \ + } \ + template< AC_FL_T() > \ + inline bool operator REL_OP ( C_TYPE op2, const AC_FL() &op) { \ + typedef typename ac::template ac_float_represent::type fl2_t; \ + return fl2_t(op2).operator REL_OP (op); \ + } + +#define FL_ASSIGN_OP_WITH_CTYPE_2(ASSIGN_OP, C_TYPE) \ + template< AC_FL_T() > \ + inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, C_TYPE op2) { \ + typedef typename ac::template ac_float_represent::type fl2_t; \ + return op.operator ASSIGN_OP (fl2_t(op2)); \ + } + +#ifdef __AC_FLOAT_ENABLE_ALPHA +#define FL_BIN_OP_WITH_CTYPE_ALPHA(C_TYPE) \ + FL_BIN_OP_WITH_CTYPE(+, C_TYPE, plus) \ + FL_BIN_OP_WITH_CTYPE(-, C_TYPE, minus) +#else +#define FL_BIN_OP_WITH_CTYPE_ALPHA(C_TYPE) +#endif + +#define FL_OPS_WITH_CTYPE(C_TYPE) \ + FL_BIN_OP_WITH_CTYPE_ALPHA(C_TYPE) \ + FL_BIN_OP_WITH_CTYPE(*, C_TYPE, mult) \ + FL_BIN_OP_WITH_CTYPE(/, C_TYPE, div) \ + \ + FL_REL_OP_WITH_CTYPE(==, C_TYPE) \ + FL_REL_OP_WITH_CTYPE(!=, C_TYPE) \ + FL_REL_OP_WITH_CTYPE(>, C_TYPE) \ + FL_REL_OP_WITH_CTYPE(>=, C_TYPE) \ + FL_REL_OP_WITH_CTYPE(<, C_TYPE) \ + FL_REL_OP_WITH_CTYPE(<=, C_TYPE) \ + \ + FL_ASSIGN_OP_WITH_CTYPE_2(+=, C_TYPE) \ + FL_ASSIGN_OP_WITH_CTYPE_2(-=, C_TYPE) \ + FL_ASSIGN_OP_WITH_CTYPE_2(*=, C_TYPE) \ + FL_ASSIGN_OP_WITH_CTYPE_2(/=, C_TYPE) + +#define 
FL_SHIFT_OP_WITH_INT_CTYPE(BIN_OP, C_TYPE, RTYPE) \ + template< AC_FL_T() > \ + inline typename AC_FL()::template rt_i< ac_private::c_type_params::W, ac_private::c_type_params::S >::RTYPE operator BIN_OP ( const AC_FL() &op, C_TYPE i_op) { \ + typedef typename ac::template ac_int_represent::type i_t; \ + return op.operator BIN_OP (i_t(i_op)); \ + } + +#define FL_SHIFT_ASSIGN_OP_WITH_INT_CTYPE(ASSIGN_OP, C_TYPE) \ + template< AC_FL_T() > \ + inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, C_TYPE i_op) { \ + typedef typename ac::template ac_int_represent::type i_t; \ + return op.operator ASSIGN_OP (i_t(i_op)); \ + } + +#define FL_SHIFT_OPS_WITH_INT_CTYPE(C_TYPE) \ + FL_SHIFT_OP_WITH_INT_CTYPE(>>, C_TYPE, rshift) \ + FL_SHIFT_OP_WITH_INT_CTYPE(<<, C_TYPE, lshift) \ + FL_SHIFT_ASSIGN_OP_WITH_INT_CTYPE(>>=, C_TYPE) \ + FL_SHIFT_ASSIGN_OP_WITH_INT_CTYPE(<<=, C_TYPE) + +#define FL_OPS_WITH_INT_CTYPE(C_TYPE) \ + FL_OPS_WITH_CTYPE(C_TYPE) \ + FL_SHIFT_OPS_WITH_INT_CTYPE(C_TYPE) + +// --------------------------------------- End of Macros for Binary Operators with C Floats + + // Binary Operators with C Floats -------------------------------------------- + FL_OPS_WITH_CTYPE(float) + FL_OPS_WITH_CTYPE(double) + FL_OPS_WITH_INT_CTYPE(bool) + FL_OPS_WITH_INT_CTYPE(char) + FL_OPS_WITH_INT_CTYPE(signed char) + FL_OPS_WITH_INT_CTYPE(unsigned char) + FL_OPS_WITH_INT_CTYPE(short) + FL_OPS_WITH_INT_CTYPE(unsigned short) + FL_OPS_WITH_INT_CTYPE(int) + FL_OPS_WITH_INT_CTYPE(unsigned int) + FL_OPS_WITH_INT_CTYPE(long) + FL_OPS_WITH_INT_CTYPE(unsigned long) + FL_OPS_WITH_INT_CTYPE(Slong) + FL_OPS_WITH_INT_CTYPE(Ulong) + // -------------------------------------- End of Binary Operators with C Floats + +// Macros for Binary Operators with ac_int -------------------------------------------- + +#define FL_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ + template< AC_FL_T(), int WI, bool SI> \ + inline typename AC_FL()::template rt_T2< ac_int >::RTYPE operator BIN_OP ( const ac_int &i_op, const AC_FL() &op) { \ + typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ + return fl2_t(i_op).operator BIN_OP (op); \ + } + +#define FL_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) \ + template< AC_FL_T(), int WI, bool SI> \ + inline typename AC_FL()::template rt_T2< ac_int >::RTYPE operator BIN_OP ( const AC_FL() &op, const ac_int &i_op) { \ + typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ + return op.operator BIN_OP (fl2_t(i_op)); \ + } + +#define FL_BIN_OP_WITH_AC_INT(BIN_OP, RTYPE) \ + FL_BIN_OP_WITH_AC_INT_1(BIN_OP, RTYPE) \ + FL_BIN_OP_WITH_AC_INT_2(BIN_OP, RTYPE) + +#define FL_REL_OP_WITH_AC_INT(REL_OP) \ + template< AC_FL_T(), int WI, bool SI> \ + inline bool operator REL_OP ( const AC_FL() &op, const ac_int &op2) { \ + typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ + return op.operator REL_OP (fl2_t(op2)); \ + } \ + template< AC_FL_T(), int WI, bool SI> \ + inline bool operator REL_OP ( ac_int &op2, const AC_FL() &op) { \ + typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ + return fl2_t(op2).operator REL_OP (op); \ + } + +#define FL_ASSIGN_OP_WITH_AC_INT(ASSIGN_OP) \ + template< AC_FL_T(), int WI, bool SI> \ + inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, const ac_int &op2) { \ + typedef typename ac::template ac_float_represent< ac_int >::type fl2_t; \ + return op.operator ASSIGN_OP (fl2_t(op2)); \ + } + +// -------------------------------------------- End of Macros for Binary Operators with ac_int + + // Binary Operators with ac_int 
-------------------------------------------- +#ifdef __AC_FLOAT_ENABLE_ALPHA + FL_BIN_OP_WITH_AC_INT(+, plus) + FL_BIN_OP_WITH_AC_INT(-, minus) +#endif + FL_BIN_OP_WITH_AC_INT(*, mult) + FL_BIN_OP_WITH_AC_INT(/, div) + + FL_REL_OP_WITH_AC_INT(==) + FL_REL_OP_WITH_AC_INT(!=) + FL_REL_OP_WITH_AC_INT(>) + FL_REL_OP_WITH_AC_INT(>=) + FL_REL_OP_WITH_AC_INT(<) + FL_REL_OP_WITH_AC_INT(<=) + + FL_ASSIGN_OP_WITH_AC_INT(+=) + FL_ASSIGN_OP_WITH_AC_INT(-=) + FL_ASSIGN_OP_WITH_AC_INT(*=) + FL_ASSIGN_OP_WITH_AC_INT(/=) + FL_ASSIGN_OP_WITH_AC_INT(%=) + // -------------------------------------- End of Binary Operators with ac_int + +// Macros for Binary Operators with ac_fixed -------------------------------------------- + +#define FL_BIN_OP_WITH_AC_FIXED_1(BIN_OP, RTYPE) \ + template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ + inline typename AC_FL()::template rt_T2< ac_fixed >::RTYPE operator BIN_OP ( const ac_fixed &f_op, const AC_FL() &op) { \ + typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ + return fl2_t(f_op).operator BIN_OP (op); \ + } + +#define FL_BIN_OP_WITH_AC_FIXED_2(BIN_OP, RTYPE) \ + template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ + inline typename AC_FL()::template rt_T2< ac_fixed >::RTYPE operator BIN_OP ( const AC_FL() &op, const ac_fixed &f_op) { \ + typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ + return op.operator BIN_OP (fl2_t(f_op)); \ + } + +#define FL_BIN_OP_WITH_AC_FIXED(BIN_OP, RTYPE) \ + FL_BIN_OP_WITH_AC_FIXED_1(BIN_OP, RTYPE) \ + FL_BIN_OP_WITH_AC_FIXED_2(BIN_OP, RTYPE) + +#define FL_REL_OP_WITH_AC_FIXED(REL_OP) \ + template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ + inline bool operator REL_OP ( const AC_FL() &op, const ac_fixed &op2) { \ + typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ + return op.operator REL_OP (fl2_t(op2)); \ + } \ + template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ + inline bool operator REL_OP ( ac_fixed &op2, const AC_FL() &op) { \ + typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ + return fl2_t(op2).operator REL_OP (op); \ + } + +#define FL_ASSIGN_OP_WITH_AC_FIXED(ASSIGN_OP) \ + template< AC_FL_T(), int WF, int IF, bool SF, ac_q_mode QF, ac_o_mode OF> \ + inline AC_FL() &operator ASSIGN_OP ( AC_FL() &op, const ac_fixed &op2) { \ + typedef typename ac::template ac_float_represent< ac_fixed >::type fl2_t; \ + return op.operator ASSIGN_OP (fl2_t(op2)); \ + } + +// -------------------------------------------- End of Macros for Binary Operators with ac_fixed + + // Binary Operators with ac_fixed -------------------------------------------- +#ifdef __AC_FLOAT_ENABLE_ALPHA + FL_BIN_OP_WITH_AC_FIXED(+, plus) + FL_BIN_OP_WITH_AC_FIXED(-, minus) +#endif + FL_BIN_OP_WITH_AC_FIXED(*, mult) + FL_BIN_OP_WITH_AC_FIXED(/, div) + + FL_REL_OP_WITH_AC_FIXED(==) + FL_REL_OP_WITH_AC_FIXED(!=) + FL_REL_OP_WITH_AC_FIXED(>) + FL_REL_OP_WITH_AC_FIXED(>=) + FL_REL_OP_WITH_AC_FIXED(<) + FL_REL_OP_WITH_AC_FIXED(<=) + + FL_ASSIGN_OP_WITH_AC_FIXED(+=) + FL_ASSIGN_OP_WITH_AC_FIXED(-=) + FL_ASSIGN_OP_WITH_AC_FIXED(*=) + FL_ASSIGN_OP_WITH_AC_FIXED(/=) + // -------------------------------------- End of Binary Operators with ac_fixed + +// Global templatized functions for easy initialization to special values +template +inline AC_FL() value( AC_FL() ) { + AC_FL() r; + return r.template set_val(); +} + +namespace ac { +// function to initialize (or uninitialize) arrays 
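// The init_array helper defined just below fills an array with one of the
// ac_special_val constants through set_val<V>(). A hedged usage sketch (the
// special value is the one explicit template argument; the float shape is
// deduced from the array; names are illustrative):

    ac_float<16,2,5> buf[8];
    ac::init_array<AC_VAL_0>(buf, 8);  // zero mantissa and exponent in each element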
+ template + inline bool init_array( AC_FL() *a, int n) { + AC_FL0() t; + t.template set_val(); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } +} + +/////////////////////////////////////////////////////////////////////////////// + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( pop ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +#ifdef __AC_NAMESPACE +} +#endif + +#endif // __AC_FLOAT_H diff --git a/hls4ml/templates/quartus/ac_types/ac_int.h b/hls4ml/templates/quartus/ac_types/ac_int.h index bb9542642e..4651339169 100644 --- a/hls4ml/templates/quartus/ac_types/ac_int.h +++ b/hls4ml/templates/quartus/ac_types/ac_int.h @@ -1,3099 +1,3099 @@ -/************************************************************************** - * * - * Algorithmic C (tm) Datatypes * - * * - * Software Version: 4.0 * - * * - * Release Date : Sat Jun 13 12:35:18 PDT 2020 * - * Release Type : Production Release * - * Release Build : 4.0.0 * - * * - * Copyright 2004-2020, Mentor Graphics Corporation, * - * * - * All Rights Reserved. * - * * - ************************************************************************** - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * - * implied. * - * See the License for the specific language governing permissions and * - * limitations under the License. * - ************************************************************************** - * * - * The most recent version of this package is available at github. * - * * - *************************************************************************/ - -/* -// Source: ac_int.h -// Description: fast arbitrary-length bit-accurate integer types: -// - unsigned integer of length W: ac_int -// - signed integer of length W: ac_int -// Author: Andres Takach, Ph.D. -// Notes: -// - C++ Runtime: important to use optimization flag (for example -O3) -// -// - Compiler support: recent GNU compilers are required for correct -// template compilation -// -// - Most frequent migration issues: -// - need to cast to common type when using question mark operator: -// (a < 0) ? -a : a; // a is ac_int -// change to: -// (a < 0) ? -a : (ac_int) a; -// or -// (a < 0) ? (ac_int) -a : (ac_int) a; -// -// - left shift is not arithmetic ("a< b = a << 1; // a is ac_int -// is not equivalent to b=2*a. 
In order to get 2*a behavior change to:
-//        ac_int<W+1,false> b = (ac_int<W+1,false>)a << 1;
-//
-//   - only static length read/write slices are supported:
-//     - read:  x.slc<4>(k) => returns ac_int for 4-bit slice x(4+k-1 DOWNTO k)
-//     - write: x.set_slc(k,y) = writes bits of y to x starting at index k
-*/
-
-#ifndef __AC_INT_H
-#define __AC_INT_H
-
-#define AC_VERSION 3
-#define AC_VERSION_MINOR 9
-
-#ifndef __cplusplus
-#error C++ is required to include this header file
-#endif
-
-#if (defined(__GNUC__) && __GNUC__ < 3 && !defined(__EDG__))
-#error GCC version 3 or greater is required to include this header file
-#endif
-
-#if (defined(_MSC_VER) && _MSC_VER < 1400 && !defined(__EDG__))
-#error Microsoft Visual Studio 8 or newer is required to include this header file
-#endif
-
-#if (defined(_MSC_VER) && !defined(__EDG__))
-#pragma warning( push )
-#pragma warning( disable: 4127 4100 4244 4307 4310 4365 4514 4554 4706 4800 )
-#endif
-
-// for safety
-#if (defined(N) || defined(N2))
-#error One or more of the following is defined: N, N2. Definition conflicts with their usage as template parameters.
-#error DO NOT use defines before including third party header files.
-#endif
-
-// for safety
-#if (defined(W) || defined(I) || defined(S) || defined(W2) || defined(I2) || defined(S2))
-#error One or more of the following is defined: W, I, S, W2, I2, S2. Definition conflicts with their usage as template parameters.
-#error DO NOT use defines before including third party header files.
-#endif
-
-#if defined(true)
-#warning The C++ keyword true is defined which may result in subtle compilation problems. Undefining it.
-#undef true
-#endif
-#if defined(false)
-#warning The C++ keyword false is defined which may result in subtle compilation problems. Undefining it.
-#undef false
-#endif
-
-#ifndef __ASSERT_H__
-#define __ASSERT_H__
-#include <assert.h>
-#endif
-#include <limits>
-#ifndef AC_USER_DEFINED_ASSERT
-#include <iostream>
-#else
-#include <ostream>
-#endif
-#include <math.h>
-#include <string>
-
-#ifndef __SYNTHESIS__
-#ifndef __AC_INT_UTILITY_BASE
-#define __AC_INT_UTILITY_BASE
-#endif
-
-#endif
-
-#ifdef __AC_NAMESPACE
-namespace __AC_NAMESPACE {
-#endif
-
-#define AC_MAX(a,b) ((a) > (b) ? (a) : (b))
-#define AC_MIN(a,b) ((a) < (b) ? (a) : (b))
-#define AC_ABS(a) ((a) < 0 ?
-(a) : (a)) - -#if defined(_MSC_VER) -typedef unsigned __int64 Ulong; -typedef signed __int64 Slong; -#else -typedef unsigned long long Ulong; -typedef signed long long Slong; -#endif - -enum ac_base_mode { AC_BIN=2, AC_OCT=8, AC_DEC=10, AC_HEX=16 }; -enum ac_special_val {AC_VAL_DC, AC_VAL_0, AC_VAL_MIN, AC_VAL_MAX, AC_VAL_QUANTUM}; - -template class ac_int; - -namespace ac_private { -#if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) -#pragma builtin -#endif - - enum {long_w = std::numeric_limits::digits}; - const unsigned int all_ones = (unsigned) ~0; - - // PRIVATE FUNCTIONS in namespace: for implementing ac_int/ac_fixed - -#ifndef __SYNTHESIS__ - inline double mgc_floor(double d) { return floor(d); } -#else - inline double mgc_floor(double d) { return 0.0; } -#endif - - #define AC_ASSERT(cond, msg) ac_private::ac_assert(cond, __FILE__, __LINE__, msg) - inline void ac_assert(bool condition, const char *file=0, int line=0, const char *msg=0) { - #ifndef __SYNTHESIS__ - #ifndef AC_USER_DEFINED_ASSERT - if(!condition) { - std::cerr << "Assert"; - if(file) - std::cerr << " in file " << file << ":" << line; - if(msg) - std::cerr << " " << msg; - std::cerr << std::endl; - assert(0); - } - #else - AC_USER_DEFINED_ASSERT(condition, file, line, msg); - #endif - #endif - } - - // helper structs for statically computing log2 like functions (nbits, log2_floor, log2_ceil) - // using recursive templates - template - struct s_N { - template - struct s_X { - enum { - X2 = X >> N, - N_div_2 = N >> 1, - nbits = X ? (X2 ? N + (int) s_N::template s_X::nbits : (int) s_N::template s_X::nbits) : 0 - }; - }; - }; - template<> struct s_N<0> { - template - struct s_X { - enum {nbits = !!X }; - }; - }; - - template - inline double ldexpr32(double d) { - double d2 = d; - if(N < 0) - for(int i=0; i < -N; i++) - d2 /= (Ulong) 1 << 32; - else - for(int i=0; i < N; i++) - d2 *= (Ulong) 1 << 32; - return d2; - } - template<> inline double ldexpr32<0>(double d) { return d; } - template<> inline double ldexpr32<1>(double d) { return d * ((Ulong) 1 << 32); } - template<> inline double ldexpr32<-1>(double d) { return d / ((Ulong) 1 << 32); } - template<> inline double ldexpr32<2>(double d) { return (d * ((Ulong) 1 << 32)) * ((Ulong) 1 << 32); } - template<> inline double ldexpr32<-2>(double d) { return (d / ((Ulong) 1 << 32)) / ((Ulong) 1 << 32); } - - template - inline double ldexpr(double d) { - return ldexpr32( N < 0 ? 
d/( (unsigned) 1 << (-N & 31)) : d * ( (unsigned) 1 << (N & 31))); - } - - template - inline void iv_copy(const int *op, int *r) { - for(int i=0; i < N; i++) - r[i] = op[i]; - } - template<> inline void iv_copy<1>(const int *op, int *r) { - r[0] = op[0]; - } - template<> inline void iv_copy<2>(const int *op, int *r) { - r[0] = op[0]; - r[1] = op[1]; - } - - template - inline bool iv_equal_zero(const int *op){ - for(int i=0; i < N; i++) - if(op[i]) - return false; - return true; - } - template<> inline bool iv_equal_zero<0>(const int * /*op*/) { return true; } - template<> inline bool iv_equal_zero<1>(const int *op) { - return !op[0]; - } - template<> inline bool iv_equal_zero<2>(const int *op) { - return !(op[0] || op[1]); - } - - template - inline bool iv_equal_ones(const int *op){ - for(int i=0; i < N; i++) - if(~op[i]) - return false; - return true; - } - template<> inline bool iv_equal_ones<0>(const int * /*op*/) { return true; } - template<> inline bool iv_equal_ones<1>(const int *op) { - return !~op[0]; - } - template<> inline bool iv_equal_ones<2>(const int *op) { - return !(~op[0] || ~op[1]); - } - - template - inline bool iv_equal(const int *op1, const int *op2){ - const int M1 = AC_MAX(N1,N2); - const int M2 = AC_MIN(N1,N2); - const int *OP1 = N1 >= N2 ? op1 : op2; - const int *OP2 = N1 >= N2 ? op2 : op1; - for(int i=0; i < M2; i++) - if(OP1[i] != OP2[i]) - return false; - int ext = OP2[M2-1] < 0 ? ~0 : 0; - for(int i=M2; i < M1; i++) - if(OP1[i] != ext) - return false; - return true; - } - template<> inline bool iv_equal<1,1>(const int *op1, const int *op2) { - return op1[0] == op2[0]; - } - - template - inline bool iv_equal_ones_from(const int *op){ - if((B >= 32*N && op[N-1] >= 0) || (B&31 && ~(op[B/32] >> (B&31)))) - return false; - return iv_equal_ones(&op[(B+31)/32]); - } - template<> inline bool iv_equal_ones_from<0,1>(const int *op){ - return iv_equal_ones<1>(op); - } - template<> inline bool iv_equal_ones_from<0,2>(const int *op){ - return iv_equal_ones<2>(op); - } - - template - inline bool iv_equal_zeros_from(const int *op){ - if((B >= 32*N && op[N-1] < 0) || (B&31 && (op[B/32] >> (B&31)))) - return false; - return iv_equal_zero(&op[(B+31)/32]); - } - template<> inline bool iv_equal_zeros_from<0,1>(const int *op){ - return iv_equal_zero<1>(op); - } - template<> inline bool iv_equal_zeros_from<0,2>(const int *op){ - return iv_equal_zero<2>(op); - } - - template - inline bool iv_equal_ones_to(const int *op){ - if((B >= 32*N && op[N-1] >= 0) || (B&31 && ~(op[B/32] | (all_ones << (B&31))))) - return false; - return iv_equal_ones(op); - } - template<> inline bool iv_equal_ones_to<0,1>(const int *op){ - return iv_equal_ones<1>(op); - } - template<> inline bool iv_equal_ones_to<0,2>(const int *op){ - return iv_equal_ones<2>(op); - } - - template - inline bool iv_equal_zeros_to(const int *op){ - if((B >= 32*N && op[N-1] < 0) || (B&31 && (op[B/32] & ~(all_ones << (B&31))))) - return false; - return iv_equal_zero(op); - } - template<> inline bool iv_equal_zeros_to<0,1>(const int *op){ - return iv_equal_zero<1>(op); - } - template<> inline bool iv_equal_zeros_to<0,2>(const int *op){ - return iv_equal_zero<2>(op); - } - - template - inline bool iv_compare(const int *op1, const int *op2){ - const int M1 = AC_MAX(N1,N2); - const int M2 = AC_MIN(N1,N2); - const int *OP1 = N1 >= N2 ? op1 : op2; - const int *OP2 = N1 >= N2 ? op2 : op1; - const bool b = (N1 >= N2) == greater; - int ext = OP2[M2-1] < 0 ? ~0 : 0; - int i2 = M1 > M2 ? 
ext : OP2[M1-1]; - if(OP1[M1-1] != i2) - return b ^ (OP1[M1-1] < i2); - for(int i=M1-2; i >= M2; i--) { - if((unsigned) OP1[i] != (unsigned) ext) - return b ^ ((unsigned) OP1[i] < (unsigned) ext); - } - for(int i=M2-1; i >= 0; i--) { - if((unsigned) OP1[i] != (unsigned) OP2[i]) - return b ^ ((unsigned) OP1[i] < (unsigned) OP2[i]); - } - return false; - } - template<> inline bool iv_compare<1,1,true>(const int *op1, const int *op2) { - return op1[0] > op2[0]; - } - template<> inline bool iv_compare<1,1,false>(const int *op1, const int *op2) { - return op1[0] < op2[0]; - } - - template - inline void iv_extend(int *r, int ext) { - for(int i=0; i < N; i++) - r[i] = ext; - } - template<> inline void iv_extend<-2>(int * /*r*/, int /*ext*/) { } - template<> inline void iv_extend<-1>(int * /*r*/, int /*ext*/) { } - template<> inline void iv_extend<0>(int * /*r*/, int /*ext*/) { } - template<> inline void iv_extend<1>(int *r, int ext) { - r[0] = ext; - } - template<> inline void iv_extend<2>(int *r, int ext) { - r[0] = ext; - r[1] = ext; - } - - template - inline void iv_assign_int64(int *r, Slong l) { - r[0] = (int) l; - if(Nr > 1) { - r[1] = (int) (l >> 32); - iv_extend(r+2, (r[1] < 0) ? ~0 : 0); - } - } - template<> inline void iv_assign_int64<1>(int *r, Slong l) { - r[0] = (int) l; - } - template<> inline void iv_assign_int64<2>(int *r, Slong l) { - r[0] = (int) l; - r[1] = (int) (l >> 32); - } - - template - inline void iv_assign_uint64(int *r, Ulong l) { - r[0] = (int) l; - if(Nr > 1) { - r[1] = (int) (l >> 32); - iv_extend(r+2, 0); - } - } - template<> inline void iv_assign_uint64<1>(int *r, Ulong l) { - r[0] = (int) l; - } - template<> inline void iv_assign_uint64<2>(int *r, Ulong l) { - r[0] = (int) l; - r[1] = (int) (l >> 32); - } - - inline Ulong mult_u_u(int a, int b) { - return (Ulong) (unsigned) a * (Ulong) (unsigned) b; - } - inline Slong mult_u_s(int a, int b) { - return (Ulong) (unsigned) a * (Slong) (signed) b; - } - inline Slong mult_s_u(int a, int b) { - return (Slong) (signed) a * (Ulong) (unsigned) b; - } - inline Slong mult_s_s(int a, int b) { - return (Slong) (signed) a * (Slong) (signed) b; - } - inline void accumulate(Ulong a, Ulong &l1, Slong &l2) { - l1 += (Ulong) (unsigned) a; - l2 += a >> 32; - } - inline void accumulate(Slong a, Ulong &l1, Slong &l2) { - l1 += (Ulong) (unsigned) a; - l2 += a >> 32; - } - - template - inline void iv_mult(const int *op1, const int *op2, int *r) { - if(Nr==1) - r[0] = op1[0] * op2[0]; - else if(N1==1 && N2==1) - iv_assign_int64(r, ((Slong) op1[0]) * ((Slong) op2[0])); - else { - const int M1 = AC_MAX(N1,N2); - const int M2 = AC_MIN(N1,N2); - const int *OP1 = N1 >= N2 ? op1 : op2; - const int *OP2 = N1 >= N2 ? 
op2 : op1; - const int T1 = AC_MIN(M2-1,Nr); - const int T2 = AC_MIN(M1-1,Nr); - const int T3 = AC_MIN(M1+M2-2,Nr); - - Ulong l1 = 0; - Slong l2 = 0; - for(int k=0; k < T1; k++) { - for(int i=0; i < k+1; i++) - accumulate(mult_u_u(OP1[k-i], OP2[i]), l1, l2); - l2 += (Ulong) (unsigned) (l1 >> 32); - r[k] = (int) l1; - l1 = (unsigned) l2; - l2 >>= 32; - } - for(int k=T1; k < T2; k++) { - accumulate(mult_u_s(OP1[k-M2+1], OP2[M2-1]), l1, l2); - for(int i=0; i < M2-1; i++) - accumulate(mult_u_u(OP1[k-i], OP2[i]), l1, l2); - l2 += (Ulong) (unsigned) (l1 >> 32); - r[k] = (int) l1; - l1 = (unsigned) l2; - l2 >>= 32; - } - for(int k=T2; k < T3; k++) { - accumulate(mult_u_s(OP1[k-M2+1], OP2[M2-1]), l1, l2); - for(int i=k-T2+1; i < M2-1; i++) - accumulate(mult_u_u(OP1[k-i], OP2[i]), l1, l2); - accumulate(mult_s_u(OP1[M1-1], OP2[k-M1+1]), l1, l2); - l2 += (Ulong) (unsigned) (l1 >> 32); - r[k] = (int) l1; - l1 = (unsigned) l2; - l2 >>= 32; - } - if(Nr >= M1+M2-1) { - accumulate(mult_s_s(OP1[M1-1], OP2[M2-1]), l1, l2); - r[M1+M2-2] = (int) l1; - if(Nr >= M1+M2) { - l2 += (Ulong) (unsigned) (l1 >> 32); - r[M1+M2-1] = (int) l2; - iv_extend(r+M1+M2, (r[M1+M2-1] < 0) ? ~0 : 0); - } - } - } - } - template<> inline void iv_mult<1,1,1>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] * op2[0]; - } - template<> inline void iv_mult<1,1,2>(const int *op1, const int *op2, int *r) { - iv_assign_int64<2>(r, ((Slong) op1[0]) * ((Slong) op2[0])); - } - - template - inline bool iv_uadd_carry(const int *op1, bool carry, int *r) { - Slong l = carry; - for(int i=0; i < N; i++) { - l += (Ulong) (unsigned) op1[i]; - r[i] = (int) l; - l >>= 32; - } - return l != 0; - } - template<> inline bool iv_uadd_carry<0>(const int * /*op1*/, bool carry, int * /*r*/) { return carry; } - template<> inline bool iv_uadd_carry<1>(const int *op1, bool carry, int *r) { - Ulong l = carry + (Ulong) (unsigned) op1[0]; - r[0] = (int) l; - return (l >> 32) & 1; - } - - template - inline bool iv_add_int_carry(const int *op1, int op2, bool carry, int *r) { - if(N==0) - return carry; - if(N==1) { - Ulong l = carry + (Slong) op1[0] + (Slong) op2; - r[0] = (int) l; - return (l >> 32) & 1; - } - Slong l = carry + (Ulong) (unsigned) op1[0] + (Slong) op2; - r[0] = (int) l; - l >>= 32; - for(int i=1; i < N-1; i++) { - l += (Ulong) (unsigned) op1[i]; - r[i] = (int) l; - l >>= 32; - } - l += (Slong) op1[N-1]; - r[N-1] = (int) l; - return (l >> 32) & 1; - } - template<> inline bool iv_add_int_carry<0>(const int * /*op1*/, int /*op2*/, bool carry, int * /*r*/) { return carry; } - template<> inline bool iv_add_int_carry<1>(const int *op1, int op2, bool carry, int *r) { - Ulong l = carry + (Slong) op1[0] + (Slong) op2; - r[0] = (int) l; - return (l >> 32) & 1; - } - - template - inline bool iv_uadd_n(const int *op1, const int *op2, int *r) { - Ulong l = 0; - for(int i=0; i < N; i++) { - l += (Ulong)(unsigned) op1[i] + (Ulong)(unsigned) op2[i]; - r[i] = (int) l; - l >>= 32; - } - return l & 1; - } - template<> inline bool iv_uadd_n<0>(const int * /*op1*/, const int * /*op2*/, int * /*r*/) { return false; } - template<> inline bool iv_uadd_n<1>(const int *op1, const int *op2, int *r) { - Ulong l = (Ulong) (unsigned) op1[0] + (Ulong) (unsigned) op2[0]; - r[0] = (int) l; - return (l >> 32) & 1; - } - template<> inline bool iv_uadd_n<2>(const int *op1, const int *op2, int *r) { - Ulong l = (Ulong) (unsigned) op1[0] + (Ulong) (unsigned) op2[0]; - r[0] = (int) l; - l >>= 32; - l += (Ulong) (unsigned) op1[1] + (Ulong) (unsigned) op2[1]; - r[1] = (int) l; - 
return (l >> 32) & 1; - } - - template - inline void iv_add(const int *op1, const int *op2, int *r) { - if(Nr==1) - r[0] = op1[0] + op2[0]; - else { - const int M1 = AC_MAX(N1,N2); - const int M2 = AC_MIN(N1,N2); - const int *OP1 = N1 >= N2 ? op1 : op2; - const int *OP2 = N1 >= N2 ? op2 : op1; - const int T1 = AC_MIN(M2-1,Nr); - const int T2 = AC_MIN(M1,Nr); - - bool carry = iv_uadd_n(OP1, OP2, r); - carry = iv_add_int_carry(OP1+T1, OP2[T1], carry, r+T1); - iv_extend(r+T2, carry ? ~0 : 0); - } - } - template<> inline void iv_add<1,1,1>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] + op2[0]; - } - template<> inline void iv_add<1,1,2>(const int *op1, const int *op2, int *r) { - iv_assign_int64<2>(r, (Slong) op1[0] + (Slong) op2[0]); - } - - template - inline bool iv_sub_int_borrow(const int *op1, int op2, bool borrow, int *r) { - if(N==1) { - Ulong l = (Slong) op1[0] - (Slong) op2 - borrow; - r[0] = (int) l; - return (l >> 32) & 1; - } - Slong l = (Ulong) (unsigned) op1[0] - (Slong) op2 - borrow; - r[0] = (int) l; - l >>= 32; - for(int i=1; i < N-1; i++) { - l += (Ulong) (unsigned) op1[i]; - r[i] = (int) l; - l >>= 32; - } - l += (Slong) op1[N-1]; - r[N-1] = (int) l; - return (l >> 32) & 1; - } - template<> inline bool iv_sub_int_borrow<0>(const int * /*op1*/, int /*op2*/, bool borrow, int * /*r*/) { return borrow; } - template<> inline bool iv_sub_int_borrow<1>(const int *op1, int op2, bool borrow, int *r) { - Ulong l = (Slong) op1[0] - (Slong) op2 - borrow; - r[0] = (int) l; - return (l >> 32) & 1; - } - - template - inline bool iv_sub_int_borrow(int op1, const int *op2, bool borrow, int *r) { - if(N==1) { - Ulong l = (Slong) op1 - (Slong) op2[0] - borrow; - r[0] = (int) l; - return (l >> 32) & 1; - } - Slong l = (Slong) op1 - (Ulong) (unsigned) op2[0] - borrow; - r[0] = (int) l; - l >>= 32; - for(int i=1; i < N-1; i++) { - l -= (Ulong) (unsigned) op2[i]; - r[i] = (int) l; - l >>= 32; - } - l -= (Slong) op2[N-1]; - r[N-1] = (int) l; - return (l >> 32) & 1; - } - template<> inline bool iv_sub_int_borrow<0>(int /*op1*/, const int * /*op2*/, bool borrow, int * /*r*/) { return borrow; } - template<> inline bool iv_sub_int_borrow<1>(int op1, const int *op2, bool borrow, int *r) { - Ulong l = (Slong) op1 - (Slong) op2[0] - borrow; - r[0] = (int) l; - return (l >> 32) & 1; - } - - template - inline bool iv_usub_n(const int *op1, const int *op2, int *r) { - Slong l = 0; - for(int i=0; i < N; i++) { - l += (Ulong)(unsigned) op1[i] - (Ulong)(unsigned) op2[i]; - r[i] = (int) l; - l >>= 32; - } - return l & 1; - } - template<> inline bool iv_usub_n<1>(const int *op1, const int *op2, int *r) { - Ulong l = (Ulong) (unsigned) op1[0] - (Ulong) (unsigned) op2[0]; - r[0] = (int) l; - return (l >> 32) & 1; - } - template<> inline bool iv_usub_n<2>(const int *op1, const int *op2, int *r) { - Slong l = (Ulong) (unsigned) op1[0] - (Ulong) (unsigned) op2[0]; - r[0] = (int) l; - l >>= 32; - l += (Ulong) (unsigned) op1[1] - (Ulong) (unsigned) op2[1]; - r[1] = (int) l; - return (l >> 32) & 1; - } - - template - inline void iv_sub(const int *op1, const int *op2, int *r) { - if(Nr==1) - r[0] = op1[0] - op2[0]; - else { - const int M1 = AC_MAX(N1,N2); - const int M2 = AC_MIN(N1,N2); - const int T1 = AC_MIN(M2-1,Nr); - const int T2 = AC_MIN(M1,Nr); - bool borrow = iv_usub_n(op1, op2, r); - if(N1 > N2) - borrow = iv_sub_int_borrow(op1+T1, op2[T1], borrow, r+T1); - else - borrow = iv_sub_int_borrow(op1[T1], op2+T1, borrow, r+T1); - iv_extend(r+T2, borrow ? 
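-      // NOTE (editor's illustrative aside; not part of the original ac_int.h):
-      // addition and subtraction walk the limbs through a 64-bit accumulator,
-      // carrying the upper word into the next limb, e.g. for one limb pair:
-      //   Ulong acc = (Ulong)(unsigned) a0 + (Ulong)(unsigned) b0;
-      //   int r0 = (int) acc;           // low 32 bits become the result limb
-      //   bool carry = (acc >> 32) & 1; // high bit feeds the next limb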
~0 : 0); - } - } - template<> inline void iv_sub<1,1,1>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] - op2[0]; - } - template<> inline void iv_sub<1,1,2>(const int *op1, const int *op2, int *r) { - iv_assign_int64<2>(r, (Slong) op1[0] - (Slong) op2[0]); - } - - template - inline bool iv_all_bits_same(const int *op, bool bit) { - int t = bit ? ~0 : 0; - for(int i=0; i < N; i++) - if(op[i] != t) - return false; - return true; - } - template<> inline bool iv_all_bits_same<0>(const int * /*op*/, bool /*bit*/) { return true; } - template<> inline bool iv_all_bits_same<1>(const int *op, bool bit) { - return op[0] == (bit ? ~0 : 0); - } - - template - void iv_neg(const int *op1, int *r) { - Slong l = 0; - for(int k = 0; k < AC_MIN(N,Nr); k++) { - l -= (Ulong) (unsigned) op1[k]; - r[k] = (unsigned) l; - l >>= 32; - } - if(Nr > N) { - r[N] = (unsigned) (l - (op1[N-1] < 0 ? ~0 : 0)); - iv_extend(r+N+1, r[N] < 0 ? ~0 : 0); - } - } - - template - void iv_abs(const int *op1, int *r) { - if( S && op1[N-1] < 0) { - iv_neg(op1, r); - } else { - iv_copy(op1, r); - iv_extend(r+N, 0); - } - } - - template - void iv_udiv(const sw2 *n, const sw2 *d, sw2 *q, sw2 *r) { - const int w2_length = 2*w1_length; - int d_msi; // most significant int for d - for(d_msi = D-1; d_msi > 0 && !d[d_msi]; d_msi--) {} - uw4 d1 = 0; - if(!d_msi && !d[0]) { - d1 = n[0]/d[0]; // d is zero => divide by zero - return; - } - int n_msi; // most significant int for n - for(n_msi = N-1; n_msi > 0 && !n[n_msi]; n_msi--) {} - for(int i=0; i < Q; i++) - q[i] = 0; - for(int i=0; i < R; i++) - r[i] = n[i]; - // write most significant "words" into d1 - bool d_mss_odd = (bool) (d[d_msi] >> w1_length); - int d_mss= 2*d_msi + d_mss_odd; // index to most significant short (16-bit) - d1 = (uw4) (uw2) d[d_msi] << (w1_length << (int) !d_mss_odd); - if(d_msi) - d1 |= (uw2) d[d_msi-1] >> (d_mss_odd ? w1_length : 0); - bool n_mss_odd = (bool) (n[n_msi] >> w1_length); - int n_mss = 2*n_msi + n_mss_odd; - if(n_mss < d_mss) { - // q already initialized to 0 - if(R) { - int r_msi = AC_MIN(R-1, n_msi); - for(int j = 0; j <= r_msi; j++) - r[j] = n[j]; - for(int j = r_msi+1; j < R; j++) - r[j] = 0; - } - } else { - uw2 r1[N+1]; - r1[n_msi+1] = 0; - for(int k = n_msi; k >= 0; k--) - r1[k] = n[k]; - for(int k = n_mss; k >=d_mss; k--) { - int k_msi = k >> 1; - bool odd = k & 1; - uw2 r1m1 = k_msi > 0 ? r1[k_msi-1] : (uw2) 0; - uw4 n1 = odd ? - (uw4) ((r1[k_msi+1] << w1_length) | (r1[k_msi] >> w1_length)) << w2_length | ((r1[k_msi] << w1_length) | (r1m1 >> w1_length)) : - (uw4) r1[k_msi] << w2_length | r1m1; - uw2 q1 = n1/d1; - if(q1 >> w1_length) - q1--; - AC_ASSERT(!(q1 >> w1_length), "Problem detected in long division algorithm, Please report"); - unsigned k2 = k - d_mss; - unsigned k2_i = k2 >> 1; - bool odd_2 = k2 & 1; - uw2 q2 = q1 << (odd_2 ? w1_length : 0); - sw4 l = 0; - for(int j = 0; j <= d_msi; j++) { - l += r1[k2_i + j]; - bool l_sign = l < 0; - sw4 prod = (uw4) (uw2) d[j] * (uw4) q2; - l -= prod; - bool ov1 = (l >= 0) & ((prod < 0) | l_sign); - bool ov2 = (l < 0) & (prod < 0) & l_sign; - r1[k2_i + j] = (uw2) l; - l >>= w2_length; - if(ov1) - l |= ((uw4) -1 << w2_length); - if(ov2) - l ^= ((sw4) 1 << w2_length); - } - if(odd_2 | d_mss_odd) { - l += r1[k2_i + d_msi + 1]; - r1[k2_i + d_msi + 1] = (uw2) l; - } - if(l < 0) { - l = 0; - for(int j = 0; j <= d_msi; j++) { - l += (sw4) (uw2) d[j] << (odd_2 ? 
w1_length : 0); - l += r1[k2_i + j]; - r1[k2_i + j] = (uw2) l; - l >>= w2_length; - } - if(odd_2 | d_mss_odd) - r1[k2_i + d_msi + 1] += (uw2) l; - q1--; - } - if(Q && k2_i < Q) { - if(odd_2) - q[k2_i] = q1 << w1_length; - else - q[k2_i] |= q1; - } - } - if(R) { - int r_msi = AC_MIN(R-1, n_msi); - for(int j = 0; j <= r_msi; j++) - r[j] = r1[j]; - for(int j = r_msi+1; j < R; j++) - r[j] = 0; - } - } - } - - template - inline void iv_div(const int *op1, const int *op2, int *r) { - enum { N1_over = N1+(Den_s && (Num_s==2)) }; - if(N1_over==1 && N2==1) { - r[0] = op1[0] / op2[0]; - iv_extend(r+1, ((Num_s || Den_s) && (r[0] < 0)) ? ~0 : 0); - } - else if(N1_over==1 && N2==2) - iv_assign_int64(r, ( (Slong) op1[0]) / (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); - else if(N1_over==2 && N2==1) - if(N1 == 1) - iv_assign_int64(r, ( (Slong) op1[0]) / ( (Slong) op2[0]) ); - else - iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) / ( (Slong) op2[0]) ); - else if(N1_over==2 && N2==2) - if(N1 == 1) - iv_assign_int64(r, ( (Slong) op1[0]) / (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); - else - iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) / (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); - else if(!Num_s && !Den_s) { - iv_udiv(op1, op2, r, 0); - } - else { - enum { N1_neg = N1+(Num_s==2), N2_neg = N2+(Den_s==2)}; - int numerator[N1_neg]; - int denominator[N2_neg]; - int quotient[N1_neg]; - iv_abs(op1, numerator); - iv_abs(op2, denominator); - iv_udiv(numerator, denominator, quotient, 0); - if( (Num_s && op1[N1-1] < 0) ^ (Den_s && op2[N2-1] < 0) ) - iv_neg(quotient, r); - else { - iv_copy(quotient, r); - iv_extend(r+N1_neg, (Num_s || Den_s) && r[N1_neg-1] < 0 ? ~0 : 0); - } - } - } - - template - inline void iv_rem(const int *op1, const int *op2, int *r) { - enum { N1_over = N1+(Den_s && (Num_s==2)) }; // N1_over corresponds to the division - if(N1_over==1 && N2==1) { - r[0] = op1[0] % op2[0]; - iv_extend(r+1, Num_s && r[0] < 0 ? ~0 : 0); - } - else if(N1_over==1 && N2==2) - iv_assign_int64(r, ( (Slong) op1[0]) % (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); - else if(N1_over==2 && N2==1) - if(N1 == 1) - iv_assign_int64(r, ( (Slong) op1[0]) % ( (Slong) op2[0]) ); - else - iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) % ( (Slong) op2[0]) ); - else if(N1_over==2 && N2==2) - if(N1 == 1) - iv_assign_int64(r, ( (Slong) op1[0]) % (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); - else - iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) % (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); - else if(!Num_s && !Den_s) { - iv_udiv(op1, op2, 0, r); - } - else { - enum { N1_neg = N1+(Num_s==2), N2_neg = N2+(Den_s==2)}; - int numerator[N1_neg]; - int denominator[N2_neg]; - int remainder[N2]; - iv_abs(op1, numerator); - iv_abs(op2, denominator); - iv_udiv(numerator, denominator, 0, remainder); - if( (Num_s && op1[N1-1] < 0) ) - iv_neg(remainder, r); - else { - iv_copy(remainder, r); - iv_extend(r+N2, Num_s && r[N2-1] < 0 ? ~0 : 0); - } - } - } - - template - inline void iv_bitwise_complement_n(const int *op, int *r) { - for(int i=0; i < N; i++) - r[i] = ~op[i]; - } - template<> inline void iv_bitwise_complement_n<1>(const int *op, int *r) { - r[0] = ~op[0]; - } - template<> inline void iv_bitwise_complement_n<2>(const int *op, int *r) { - r[0] = ~op[0]; - r[1] = ~op[1]; - } - - template - inline void iv_bitwise_complement(const int *op, int *r) { - const int M = AC_MIN(N,Nr); - iv_bitwise_complement_n(op, r); - iv_extend(r+M, (r[M-1] < 0) ? 
~0 : 0); - } - - template - inline void iv_bitwise_and_n(const int *op1, const int *op2, int *r) { - for(int i=0; i < N; i++) - r[i] = op1[i] & op2[i]; - } - template<> inline void iv_bitwise_and_n<1>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] & op2[0]; - } - template<> inline void iv_bitwise_and_n<2>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] & op2[0]; - r[1] = op1[1] & op2[1]; - } - - template - inline void iv_bitwise_and(const int *op1, const int *op2, int *r) { - const int M1 = AC_MIN(AC_MAX(N1,N2), Nr); - const int M2 = AC_MIN(AC_MIN(N1,N2), Nr); - const int *OP1 = N1 > N2 ? op1 : op2; - const int *OP2 = N1 > N2 ? op2 : op1; - - iv_bitwise_and_n(op1, op2, r); - if(OP2[M2-1] < 0) - iv_copy(OP1+M2, r+M2); - else - iv_extend(r+M2, 0); - iv_extend(r+M1, (r[M1-1] < 0) ? ~0 : 0); - } - - template - inline void iv_bitwise_or_n(const int *op1, const int *op2, int *r) { - for(int i=0; i < N; i++) - r[i] = op1[i] | op2[i]; - } - template<> inline void iv_bitwise_or_n<1>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] | op2[0]; - } - template<> inline void iv_bitwise_or_n<2>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] | op2[0]; - r[1] = op1[1] | op2[1]; - } - - template - inline void iv_bitwise_or(const int *op1, const int *op2, int *r) { - const int M1 = AC_MIN(AC_MAX(N1,N2), Nr); - const int M2 = AC_MIN(AC_MIN(N1,N2), Nr); - const int *OP1 = N1 >= N2 ? op1 : op2; - const int *OP2 = N1 >= N2 ? op2 : op1; - - iv_bitwise_or_n(op1, op2, r); - if(OP2[M2-1] < 0) - iv_extend(r+M2, ~0); - else - iv_copy(OP1+M2, r+M2); - iv_extend(r+M1, (r[M1-1] < 0) ? ~0 : 0); - } - - template - inline void iv_bitwise_xor_n(const int *op1, const int *op2, int *r) { - for(int i=0; i < N; i++) - r[i] = op1[i] ^ op2[i]; - } - template<> inline void iv_bitwise_xor_n<1>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] ^ op2[0]; - } - template<> inline void iv_bitwise_xor_n<2>(const int *op1, const int *op2, int *r) { - r[0] = op1[0] ^ op2[0]; - r[1] = op1[1] ^ op2[1]; - } - - template - inline void iv_bitwise_xor(const int *op1, const int *op2, int *r) { - const int M1 = AC_MIN(AC_MAX(N1,N2), Nr); - const int M2 = AC_MIN(AC_MIN(N1,N2), Nr); - const int *OP1 = N1 >= N2 ? op1 : op2; - const int *OP2 = N1 >= N2 ? op2 : op1; - - iv_bitwise_xor_n(op1, op2, r); - if(OP2[M2-1] < 0) - iv_bitwise_complement_n(OP1+M2, r+M2); - else - iv_copy(OP1+M2, r+M2); - iv_extend(r+M1, (r[M1-1] < 0) ? ~0 : 0); - } - - template - inline void iv_shift_l(const int *op1, unsigned op2, int *r) { - AC_ASSERT(Nr <= N, "iv_shift_l, incorrect usage Nr > N"); - unsigned s31 = op2 & 31; - unsigned ishift = (op2 >> 5) > Nr ? Nr : (op2 >> 5); - if(s31 && ishift!=Nr) { - unsigned lw = 0; - for(unsigned i=0; i < Nr; i++) { - unsigned hw = (i >= ishift) ? op1[i-ishift] : 0; - r[i] = (hw << s31) | (lw >> (32-s31)); - lw = hw; - } - } else { - for(unsigned i=0; i < Nr ; i++) - r[i] = (i >= ishift) ? op1[i-ishift] : 0; - } - } - - template - inline void iv_shift_r(const int *op1, unsigned op2, int *r) { - unsigned s31 = op2 & 31; - unsigned ishift = (op2 >> 5) > N ? N : (op2 >> 5); - int ext = op1[N-1] < 0 ? ~0 : 0; - if(s31 && ishift!=N) { - unsigned lw = (ishift < N) ? op1[ishift] : ext; - for(unsigned i=0; i < Nr; i++) { - unsigned hw = (i+ishift+1 < N) ? op1[i+ishift+1] : ext; - r[i] = (lw >> s31) | (hw << (32-s31)); - lw = hw; - } - } else { - for(unsigned i=0; i < Nr ; i++) - r[i] = (i+ishift < N) ? 
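-      // NOTE (editor's illustrative aside; not part of the original ac_int.h):
-      // shifts are decomposed into a whole-limb move (op2 >> 5) plus a bit
-      // offset (op2 & 31); the *_2 variants flip direction on negative counts:
-      //   int src[2] = {0, 1}, dst[2];
-      //   iv_shift_r2<2,2,true>(src, -4, dst); // negative count shifts left by 4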
op1[i+ishift] : ext; - } - } - - template - inline void iv_shift_l2(const int *op1, signed op2, int *r) { - if(S && op2 < 0) - iv_shift_r(op1, -op2, r); - else - iv_shift_l(op1, op2, r); - } - - template<> inline void iv_shift_l2<1,1,false>(const int *op1, signed op2, int *r) { - r[0] = (op2 < 32) ? ( (unsigned) op1[0] << op2) : 0; - } - template<> inline void iv_shift_l2<1,1,true>(const int *op1, signed op2, int *r) { - r[0] = (op2 >= 0) ? - (op2 < 32) ? ( (unsigned) op1[0] << op2) : 0 : - (op2 > -32) ? (op1[0] >> -op2) : (op1[0] >> 31); - } - - template - inline void iv_shift_r2(const int *op1, signed op2, int *r) { - if(S && op2 < 0) - iv_shift_l(op1, -op2, r); - else - iv_shift_r(op1, op2, r); - } - - template<> inline void iv_shift_r2<1,1,false>(const int *op1, signed op2, int *r) { - r[0] = (op2 < 32) ? (op1[0] >> op2) : (op1[0] >> 31); - } - template<> inline void iv_shift_r2<1,1,true>(const int *op1, signed op2, int *r) { - r[0] = (op2 >= 0) ? - (op2 < 32) ? (op1[0] >> op2) : (op1[0] >> 31) : - (op2 > -32) ? ( (unsigned) op1[0] << -op2) : 0; - } - - template - inline void iv_const_shift_l(const int *op1, int *r) { - // B >= 0 - if(!B) { - const int M1 = AC_MIN(N,Nr); - iv_copy(op1, r); - iv_extend(r+M1, r[M1-1] < 0 ? -1 : 0); - } - else { - const unsigned s31 = B & 31; - const int ishift = (((B >> 5) > Nr) ? Nr : (B >> 5)); - iv_extend(r, 0); - const int M1 = AC_MIN(N+ishift,Nr); - if(s31) { - unsigned lw = 0; - for(int i=ishift; i < M1; i++) { - unsigned hw = op1[i-ishift]; - r[i] = (hw << s31) | (lw >> ((32-s31)&31)); // &31 is to quiet compilers - lw = hw; - } - if(Nr > M1) { - r[M1] = (signed) lw >> ((32-s31)&31); // &31 is to quiet compilers - iv_extend(r+M1+1, r[M1] < 0 ? ~0 : 0); - } - } else { - for(int i=ishift; i < M1 ; i++) - r[i] = op1[i-ishift]; - iv_extend(r+M1, r[M1-1] < 0 ? -1 : 0); - } - } - } - template<> inline void iv_const_shift_l<1,1,0>(const int *op1, int *r) { - r[0] = op1[0]; - } - template<> inline void iv_const_shift_l<2,1,0>(const int *op1, int *r) { - r[0] = op1[0]; - } - - template - inline void iv_const_shift_r(const int *op1, int *r) { - if(!B) { - const int M1 = AC_MIN(N,Nr); - iv_copy(op1, r); - iv_extend(r+M1, r[M1-1] < 0 ? ~0 : 0); - } - else { - const unsigned s31 = B & 31; - const int ishift = (((B >> 5) > N) ? N : (B >> 5)); - int ext = op1[N-1] < 0 ? ~0 : 0; - if(s31 && ishift!=N) { - unsigned lw = (ishift < N) ? op1[ishift] : ext; - for(int i=0; i < Nr; i++) { - unsigned hw = (i+ishift+1 < N) ? op1[i+ishift+1] : ext; - r[i] = (lw >> s31) | (hw << ((32-s31)&31)); // &31 is to quiet compilers - lw = hw; - } - } else { - for(int i=0; i < Nr ; i++) - r[i] = (i+ishift < N) ? op1[i+ishift] : ext; - } - } - } - template<> inline void iv_const_shift_r<1,1,0>(const int *op1, int *r) { - r[0] = op1[0]; - } - template<> inline void iv_const_shift_r<2,1,0>(const int *op1, int *r) { - r[0] = op1[0]; - } - - template - inline void iv_conv_from_fraction(double d, int *r, bool *qb, bool *rbits, bool *o) { - bool b = d < 0; - double d2 = b ? -d : d; - double dfloor = mgc_floor(d2); - *o = dfloor != 0.0; - d2 = d2 - dfloor; - for(int i=N-1; i >=0; i--) { - d2 *= (Ulong) 1 << 32; - unsigned k = (unsigned int) d2; - r[i] = b ? 
~k : k; - d2 -= k; - } - d2 *= 2; - bool k = ((int) d2) != 0; // is 0 or 1 - d2 -= k; - *rbits = d2 != 0.0; - *qb = (b && *rbits) ^ k; - if(b && !*rbits && !*qb) - iv_uadd_carry(r, true, r); - *o |= b ^ (r[N-1] < 0); - } - - template - inline int to_str(int *v, int w, bool left_just, char *r) { - const char digits[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; - const unsigned char B = b==AC_BIN ? 1 : (b==AC_OCT ? 3 : (b==AC_HEX ? 4 : 0)); - int k = (w+B-1)/B; - int n = (w+31) >> 5; - int bits = 0; - if(b != AC_BIN && left_just) { - if( (bits = -(w % B)) ) - r[--k] = 0; - } - for(int i = 0; i < n; i++) { - if (b != AC_BIN && bits < 0) - r[k] += (unsigned char) (( (unsigned) v[i] << (B+bits)) & (b-1)); - unsigned int m = (unsigned) v[i] >> -bits; - for(bits += 32; bits > 0 && k; bits -= B) { - r[--k] = (char) (m & (b-1)); - m >>= B; - } - } - for(int i=0; i < (w+B-1)/B; i++) - r[i] = digits[(int)r[i]]; - return (w+B-1)/B; - } - template<> inline int to_str(int *v, int w, bool left_just, char *r) { - int k = 0; - int msw = (w-1) >> 5; - if(left_just) { - unsigned bits_msw = w & 31; - if(bits_msw) { - unsigned left_shift = 32 - bits_msw; - for(int i=msw; i > 0; i--) - v[i] = (unsigned) v[i] << left_shift | (unsigned) v[i-1] >> bits_msw; - v[0] = (unsigned) v[0] << left_shift; - } - int lsw = 0; - while(lsw < msw || v[msw] ) { - Ulong l = 0; - for(int i=lsw; i <= msw; i++) { - l += (Ulong) (unsigned) v[i] * 10; - v[i] = l; - l >>= 32; - if(i==lsw && !v[i]) - lsw++; - } - r[k++] = (char) ('0' + (int) l); - } - } else { - const unsigned d = 1000000000; // 10E9 - for(; msw > 0 && !v[msw]; msw--) {} - while(msw >= 0) { - Ulong nl = 0; - for(int i = msw; i >= 0; i--) { - nl <<= 32; - nl |= (unsigned) v[i]; - unsigned q = nl/d; - nl -= (Ulong) q * d; - v[i] = q; - } - if(!v[msw]) - msw--; - bool last = msw == -1; - unsigned rem = (unsigned) nl; - for(int i=0; (i < 9 && !last) || rem; i++) { - r[k++] = (char) ('0' + (int) (rem % 10)); - rem /= 10; - } - } - for(int i=0; i < k/2; i++) { - char c = r[i]; - r[i] = r[k-1-i]; - r[k-1-i] = c; - } - } - r[k] = 0; - return k; - } - - inline int to_string(int *v, int w, bool sign_mag, ac_base_mode base, bool left_just, char *r) { - int n = (w+31) >> 5; - bool neg = !sign_mag && v[n-1] < 0; - if(!left_just) { - while(n-- && v[n] == (neg ? ~0 : 0)) {} - int w2 = 32*(n+1); - if(w2) { - int m = v[n]; - for(int i = 16; i > 0; i >>= 1) { - if((m >> i) == (neg ? ~0 : 0)) - w2 -= i; - else - m >>= i; - } - } - if(w2 < w) - w = w2; - w += !sign_mag; - } - if(base == AC_DEC) - return to_str(v, w, left_just, r); - else if (base == AC_HEX) - return to_str(v, w, left_just, r); - else if (base == AC_OCT) - return to_str(v, w, left_just, r); - else if (base == AC_BIN) - return to_str(v, w, left_just, r); - return 0; - } - - template - inline unsigned iv_leading_bits(const int *op, bool bit); - - template<> inline unsigned iv_leading_bits<1>(const int *op, bool bit) { - const unsigned char tab[] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}; - unsigned t = bit ? ~*op : *op; - unsigned cnt = 0; - if(t >> 16) - t >>= 16; - else - cnt += 16; - if(t >> 8) - t >>= 8; - else - cnt += 8; - if(t >> 4) - t >>= 4; - else - cnt += 4; - cnt += tab[t]; - return cnt; - } - - template - inline unsigned iv_leading_bits(const int *op, bool bit) { - int ext_sign = bit ? -1 : 0; - int k; - for(k = N-1; k >= 0 && op[k] == ext_sign; k--) {} - return 32*(N-1-k) + (k < 0 ? 
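-    // NOTE (editor's illustrative aside; not part of the original ac_int.h):
-    // iv_leading_bits<1> counts leading 0s (or 1s when bit is true) with a
-    // halving search plus a 16-entry table; e.g. op = 0x0000ffff, bit = false
-    // keeps halving into the low half and yields cnt == 16.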
0 : iv_leading_bits<1>(op+k, bit)); - } - - ////////////////////////////////////////////////////////////////////////////// - // Integer Vector class: iv - ////////////////////////////////////////////////////////////////////////////// - template - class iv { - protected: - int v[N]; - public: - template friend class iv; - iv() {} - template - iv ( const iv &b ) { - const int M = AC_MIN(N,N2); - iv_copy(b.v, v); - iv_extend(v+M, (v[M-1] < 0) ? ~0 : 0); - } - iv ( Slong t) { - iv_assign_int64(v, t); - } - iv ( Ulong t) { - iv_assign_uint64(v, t); - } - iv ( int t) { - v[0] = t; - iv_extend(v+1, (t < 0) ? ~0 : 0); - } - iv ( unsigned int t) { - v[0] = t; - iv_extend(v+1, 0); - } - iv ( long t) { - if(long_w == 32) { - v[0] = t; - iv_extend(v+1, (t < 0) ? ~0 : 0); - } else - iv_assign_int64(v, t); - } - iv ( unsigned long t) { - if(long_w == 32) { - v[0] = t; - iv_extend(v+1, 0); - } else - iv_assign_uint64(v, t); - } - iv ( double d ) { - double d2 = ldexpr32<-N>(d); - bool qb, rbits, o; - iv_conv_from_fraction(d2, v, &qb, &rbits, &o); - } - - // Explicit conversion functions to C built-in types ------------- - inline Slong to_int64() const { return N==1 ? v[0] : ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; } - inline Ulong to_uint64() const { return N==1 ? (Ulong) v[0] : ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; } - inline double to_double() const { - double a = v[N-1]; - for(int i=N-2; i >= 0; i--) { - a *= (Ulong) 1 << 32; - a += (unsigned) v[i]; - } - return a; - } - inline void conv_from_fraction(double d, bool *qb, bool *rbits, bool *o) { - iv_conv_from_fraction(d, v, qb, rbits, o); - } - - template - inline void mult(const iv &op2, iv &r) const { - iv_mult(v, op2.v, r.v); - } - template - void add(const iv &op2, iv &r) const { - iv_add(v, op2.v, r.v); - } - template - void sub(const iv &op2, iv &r) const { - iv_sub(v, op2.v, r.v); - } - template - void div(const iv &op2, iv &r) const { - iv_div(v, op2.v, r.v); - } - template - void rem(const iv &op2, iv &r) const { - iv_rem(v, op2.v, r.v); - } - void increment() { - iv_uadd_carry(v, true, v); - } - void decrement() { - iv_sub_int_borrow(v, 0, true, v); - } - template - void neg(iv &r) const { - iv_neg(v, r.v); - } - template - void shift_l(unsigned op2, iv &r) const { - iv_shift_l(v, op2, r.v); - } - template - void shift_l2(signed op2, iv &r) const { - iv_shift_l2(v, op2, r.v); - } - template - void shift_r(unsigned op2, iv &r) const { - iv_shift_r(v, op2, r.v); - } - template - void shift_r2(signed op2, iv &r) const { - iv_shift_r2(v, op2, r.v); - } - template - void const_shift_l(iv &r) const { - iv_const_shift_l(v, r.v); - } - template - void const_shift_r(iv &r) const { - iv_const_shift_r(v, r.v); - } - template - void bitwise_complement(iv &r) const { - iv_bitwise_complement(v, r.v); - } - template - void bitwise_and(const iv &op2, iv &r) const { - iv_bitwise_and(v, op2.v, r.v); - } - template - void bitwise_or(const iv &op2, iv &r) const { - iv_bitwise_or(v, op2.v, r.v); - } - template - void bitwise_xor(const iv &op2, iv &r) const { - iv_bitwise_xor(v, op2.v, r.v); - } - template - bool equal(const iv &op2) const { - return iv_equal(v, op2.v); - } - template - bool greater_than(const iv &op2) const { - return iv_compare(v, op2.v); - } - template - bool less_than(const iv &op2) const { - return iv_compare(v, op2.v); - } - bool equal_zero() const { - return iv_equal_zero(v); - } - template - void set_slc(unsigned lsb, int WS, const iv &op2) { - AC_ASSERT((31+WS)/32 == N2, "Bad usage: WS greater than length of slice"); - 
unsigned msb = lsb+WS-1;
-      unsigned lsb_v = lsb >> 5;
-      unsigned lsb_b = lsb & 31;
-      unsigned msb_v = msb >> 5;
-      unsigned msb_b = msb & 31;
-      if(N2==1) {
-        if(msb_v == lsb_v)
-          v[lsb_v] ^= (v[lsb_v] ^ ((unsigned) op2.v[0] << lsb_b)) & (~(WS==32 ? 0 : all_ones<<WS) << lsb_b);
-        else {
-          v[lsb_v] ^= (v[lsb_v] ^ ((unsigned) op2.v[0] << lsb_b)) & (all_ones << lsb_b);
-          unsigned m = (((unsigned) op2.v[0] >> 1) >> (31-lsb_b));
-          v[msb_v] ^= (v[msb_v] ^ m) & ~((all_ones<<1)<<msb_b);
-        }
-      } else {
-        v[lsb_v] ^= (v[lsb_v] ^ ((unsigned) op2.v[0] << lsb_b)) & (all_ones << lsb_b);
-        for(int i = 1; i < N2-1; i++)
-          v[lsb_v+i] = ((unsigned) op2.v[i] << lsb_b) | (((unsigned) op2.v[i-1] >> 1) >> (31-lsb_b));
-        unsigned t = ((unsigned) op2.v[N2-1] << lsb_b) | (((unsigned) op2.v[N2-2] >> 1) >> (31-lsb_b));
-        unsigned m;
-        if(msb_v-lsb_v == N2) {
-          v[msb_v-1] = t;
-          m = (((unsigned) op2.v[N2-1] >> 1) >> (31-lsb_b));
-        }
-        else
-          m = t;
-        v[msb_v] ^= (v[msb_v] ^ m) & ~((all_ones<<1)<<msb_b);
-      }
-    }
-    unsigned leading_bits(bool bit) const {
-      return iv_leading_bits<N>(v, bit);
-    }
-  };
-
-  template<> inline Slong iv<1>::to_int64() const { return v[0]; }
-  template<> inline Ulong iv<1>::to_uint64() const { return v[0]; }
-
-  template<> inline Slong iv<2>::to_int64() const {
-    return ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0];
-  }
-  template<> inline Ulong iv<2>::to_uint64() const {
-    return ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0];
-  }
-
-  template<> template<> inline void iv<1>::set_slc(unsigned lsb, int WS, const iv<1> &op2) {
-    v[0] ^= (v[0] ^ ((unsigned) op2.v[0] << lsb)) & (~(WS==32 ? 0 : all_ones<<WS)<<lsb);
-  }
-  template<> template<> inline void iv<2>::set_slc(unsigned lsb, int WS, const iv<1> &op2) {
-    Ulong l = to_uint64();
-    Ulong l2 = op2.to_uint64();
-    l ^= (l ^ (l2 << lsb)) & (~((~(Ulong)0)<<WS)<<lsb);  // WS <= 32
-    *this = (Slong) l;
-  }
-  template<> template<> inline void iv<2>::set_slc(unsigned lsb, int WS, const iv<2> &op2) {
-    Ulong l = to_uint64();
-    Ulong l2 = op2.to_uint64();
-    l ^= (l ^ (l2 << lsb)) & (~(WS==64 ? (Ulong) 0 : ~(Ulong)0<<WS)<<lsb);
-    *this = (Slong) l;
-  }
-
-  template<int N, bool S, bool LTE64>
-  class iv_conv : public iv<N> {
-  protected:
-    iv_conv() {}
-    template<class T> iv_conv(const T& t) : iv<N>(t) {}
-  };
-
-  template<int N>
-  class iv_conv<N,false,true> : public iv<N> {
-  public:
-    operator Ulong () const { return iv<N>::to_uint64(); }
-  protected:
-    iv_conv() {}
-    template<class T> iv_conv(const T& t) : iv<N>(t) {}
-  };
-
-  template<int N>
-  class iv_conv<N,true,true> : public iv<N> {
-  public:
-    operator Slong () const { return iv<N>::to_int64(); }
-  protected:
-    iv_conv() {}
-    template<class T> iv_conv(const T& t) : iv<N>(t) {}
-  };
-
-  // Set default to promote to int as this is the case for almost all types
-  // create exceptions using specializations
-  template<typename T>
-  struct c_prom {
-    typedef int promoted_type;
-  };
-  template<> struct c_prom<unsigned> {
-    typedef unsigned promoted_type;
-  };
-  template<> struct c_prom<long> {
-    typedef long promoted_type;
-  };
-  template<> struct c_prom<unsigned long> {
-    typedef unsigned long promoted_type;
-  };
-  template<> struct c_prom<Slong> {
-    typedef Slong promoted_type;
-  };
-  template<> struct c_prom<Ulong> {
-    typedef Ulong promoted_type;
-  };
-  template<> struct c_prom<float> {
-    typedef float promoted_type;
-  };
-  template<> struct c_prom<double> {
-    typedef double promoted_type;
-  };
-
-  template<typename T, typename T2>
-  struct c_arith {
-    // will error out for pairs of T and T2 that are not defined through specialization
-  };
-  template<typename T> struct c_arith<T,T> {
-    typedef T arith_conv;
-  };
-
-  #define C_ARITH(C_TYPE1, C_TYPE2) \
-  template<> struct c_arith<C_TYPE1, C_TYPE2> { \
-    typedef C_TYPE1 arith_conv; \
-  }; \
-  template<> struct c_arith<C_TYPE2, C_TYPE1> { \
-    typedef C_TYPE1 arith_conv; \
-  };
-
-  C_ARITH(double, float)
-  C_ARITH(double, int)
-  C_ARITH(double, unsigned)
-  C_ARITH(double, long)
-  C_ARITH(double, unsigned long)
-  C_ARITH(double, Slong)
-  C_ARITH(double, Ulong)
-  C_ARITH(float, int)
-  C_ARITH(float, unsigned)
-  C_ARITH(float, long)
-  C_ARITH(float, unsigned long)
-  C_ARITH(float, Slong)
-  C_ARITH(float, Ulong)
-
-  C_ARITH(Slong, int)
-  C_ARITH(Slong, unsigned)
-  C_ARITH(Ulong, int)
-  C_ARITH(Ulong, unsigned)
-
-  template<typename T>
-  struct map {
-    typedef T t;
-  };
-  template<typename T>
-  struct c_type_params {
-    // will error out for T for which this template struct is not specialized
-  };
-
-  template<typename T> inline const char *c_type_name() { return "unknown"; }
-  template<> inline const char *c_type_name<bool>() { return "bool";}
-  template<> inline const char *c_type_name<char>() { return "char";}
-  template<> inline const char *c_type_name<signed char>() { return "signed char";}
-  template<> inline const char *c_type_name<unsigned char>() { return "unsigned char";}
-  template<> inline const char *c_type_name<signed short>() { return "signed short";}
-  template<> inline const char *c_type_name<unsigned short>() { return "unsigned short";}
-  template<> inline const char *c_type_name<int>() { return "int";}
-  template<> inline const char *c_type_name<unsigned>() { return "unsigned";}
-  template<> inline const char *c_type_name<signed long>() { return "signed long";}
-  template<> inline const char *c_type_name<unsigned long>() { return "unsigned long";}
-  template<> inline const char *c_type_name<signed long long>() { return "signed long long";}
-  template<> inline const char *c_type_name<unsigned long long>() { return "unsigned long long";}
-  template<> inline const char *c_type_name<float>() { return "float";}
-  template<> inline const char *c_type_name<double>() { return "double";}
-
-  template<typename T> struct c_type;
-
-  template<typename T>
-  struct rt_c_type_T {
-    template<typename T2>
-    struct op1 {
-      typedef typename T::template rt_T< c_type<T2> >::mult mult;
-      typedef typename T::template rt_T< c_type<T2> >::plus plus;
-      typedef typename T::template rt_T< c_type<T2> >::minus2 minus;
-      typedef typename T::template rt_T< c_type<T2> >::minus minus2;
-      typedef typename T::template rt_T< c_type<T2> >::logic logic;
-      typedef typename T::template rt_T< c_type<T2> >::div2 div;
-      typedef typename T::template rt_T< c_type<T2> >::div div2;
-    };
-  };
-  template<typename T>
-  struct c_type {
-    typedef typename c_prom<T>::promoted_type c_prom_T;
-    struct rt_unary {
-      typedef c_prom_T neg;
-      typedef c_prom_T mag_sqr;
-      typedef c_prom_T mag;
-      template<unsigned N>
-      struct set {
-        typedef c_prom_T sum;
-      };
-    };
-    template<typename T2>
-    struct rt_T {
-      typedef typename rt_c_type_T<T2>::template op1<T>::mult mult;
-      typedef typename rt_c_type_T<T2>::template op1<T>::plus plus;
-      typedef typename rt_c_type_T<T2>::template op1<T>::minus minus;
-      typedef typename rt_c_type_T<T2>::template op1<T>::minus2 minus2;
-      typedef typename rt_c_type_T<T2>::template op1<T>::logic logic;
-      typedef typename rt_c_type_T<T2>::template op1<T>::div div;
-      typedef typename rt_c_type_T<T2>::template op1<T>::div2 div2;
-    };
-    inline static std::string type_name() {
-      std::string r = c_type_name<T>();
-      return r;
-    }
-
-  };
-  // with T == c_type<T>
-  template<typename T>
-  struct rt_c_type_T< c_type<T> > {
-    typedef typename c_prom<T>::promoted_type c_prom_T;
-    template<typename T2>
-    struct op1 {
-      typedef typename c_prom<T2>::promoted_type c_prom_T2;
-      typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv mult;
-      typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv plus;
-      typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv minus;
-      typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv minus2;
-      typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv logic;
-      typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv div;
-      typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv div2;
-    };
-  };
-
-  #define C_TYPE_MAP(C_TYPE) \
-  template<> struct map<C_TYPE> { \
-    typedef c_type<C_TYPE> t; \
-  };
-
-  #define C_TYPE_PARAMS(C_TYPE, WI, SI) \
-  template<> struct c_type_params<C_TYPE> { \
-    enum { W = WI, I = WI, E = 0, S = SI, floating_point = 0 }; \
-  };
-
-  #define C_TYPE_MAP_INT(C_TYPE, WI, SI) \
-  C_TYPE_MAP(C_TYPE) \
-  C_TYPE_PARAMS(C_TYPE, WI, SI)
-
-  #define C_TYPE_MAP_FLOAT(C_TYPE, FP, WFP, IFP, EFP) \
-  C_TYPE_MAP(C_TYPE) \
-  template<> struct c_type_params<C_TYPE> { \
- enum { W = WFP, I = IFP, E = EFP, S = true, floating_point = FP }; \ - }; - - C_TYPE_MAP_INT(bool, 1, false) - C_TYPE_MAP_INT(char, 8, true) - C_TYPE_MAP_INT(signed char, 8, true) - C_TYPE_MAP_INT(unsigned char, 8, false) - C_TYPE_MAP_INT(signed short, 16, true) - C_TYPE_MAP_INT(unsigned short, 16, false) - C_TYPE_MAP_INT(signed int, 32, true) - C_TYPE_MAP_INT(unsigned int, 32, false) - C_TYPE_MAP_INT(signed long, ac_private::long_w, true) - C_TYPE_MAP_INT(unsigned long, ac_private::long_w, false) - C_TYPE_MAP_INT(signed long long, 64, true) - C_TYPE_MAP_INT(unsigned long long, 64, false) - C_TYPE_MAP_FLOAT(float, 1, 25, 1, 8) - C_TYPE_MAP_FLOAT(double, 2, 54, 1, 11) - - #undef C_TYPE_INT - #undef C_TYPE_PARAMS - #undef C_TYPE_FLOAT - #undef C_TYPE_MAP - - // specializations for following struct declared/defined after definition of ac_int - template - struct rt_ac_int_T { - template - struct op1 { - typedef typename T::template rt_T< ac_int >::mult mult; - typedef typename T::template rt_T< ac_int >::plus plus; - typedef typename T::template rt_T< ac_int >::minus2 minus; - typedef typename T::template rt_T< ac_int >::minus minus2; - typedef typename T::template rt_T< ac_int >::logic logic; - typedef typename T::template rt_T< ac_int >::div2 div; - typedef typename T::template rt_T< ac_int >::div div2; - }; - }; -} - -namespace ac { - // compiler time constant for log2 like functions - template - struct nbits { - enum { val = X ? ac_private::s_N<16>::s_X::nbits : 1 }; - }; - - template - struct log2_floor { - enum { val = nbits::val - 1 }; - }; - - // log2 of 0 is not defined: generate compiler error - template<> struct log2_floor<0> {}; - - template - struct log2_ceil { - enum { lf = log2_floor::val, val = (X == (1 << lf) ? lf : lf+1) }; - }; - - // log2 of 0 is not defined: generate compiler error - template<> struct log2_ceil<0> {}; - - template - struct int_range { - enum { l_s = (LowerBound < 0), u_s = (UpperBound < 0), - signedness = l_s || u_s, - l_nbits = nbits::val, - u_nbits = nbits::val, - nbits = AC_MAX(l_nbits, u_nbits + (!u_s && signedness)) - }; - typedef ac_int type; - }; - - template - class sliceref { -# if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) -# pragma builtin -# endif - int *d_iv; - template friend class sliceref; - public: - sliceref( int *iv ) : d_iv(iv) {} - - inline const sliceref operator = ( const sliceref &val ) { - return operator=(val); - } - - template - inline const sliceref operator = ( const sliceref &val ) { - const int src_lsi = P2/32; - const int src_msi = (P2+W-1)/32; - const int trg_lsi = P/32; - const int trg_msi = (P+W-1)/32; - const int trg_lsb = P&31; - const int trg_msb = (P+W-1)&31; - const int N = src_msi-src_lsi+1; - const int Nr = trg_msi-trg_lsi+1; - const int rshift = (P2&31) - (P&31); - int shifted_src[Nr]; - int *aligned_src = val.d_iv+src_lsi; - if(rshift) { - if(rshift < 0) - ac_private::iv_shift_l(aligned_src, -rshift, shifted_src); - else - ac_private::iv_shift_r(aligned_src, rshift, shifted_src); - aligned_src = shifted_src; - } - unsigned mask_lsi = ac_private::all_ones << trg_lsb; - unsigned mask_msi = ac_private::all_ones >> (31-trg_msb); - if(Nr==1) - mask_lsi &= mask_msi; - int *v = d_iv+trg_lsi; - v[0] ^= (v[0] ^ ((unsigned) aligned_src[0])) & mask_lsi; - for(int k=1; k < Nr-1; k++) - v[k] = aligned_src[k]; - if(Nr > 1) - v[Nr-1] ^= (v[Nr-1] ^ ((unsigned) aligned_src[Nr-1])) & mask_msi; - if(Is_MSB) { - const unsigned rem = 31-trg_msb; - if(rem) { - v[Nr-1] = S ? 
((signed) ((unsigned) v[Nr-1] << rem) >> rem) - : ((unsigned) v[Nr-1] << rem) >> rem; - } else if(!S) { - v[Nr] = 0; - } - } - return *this; - } - }; -} - -enum ac_q_mode { AC_TRN, AC_RND, AC_TRN_ZERO, AC_RND_ZERO, AC_RND_INF, AC_RND_MIN_INF, AC_RND_CONV, AC_RND_CONV_ODD }; -enum ac_o_mode { AC_WRAP, AC_SAT, AC_SAT_ZERO, AC_SAT_SYM }; -template class ac_fixed; - -////////////////////////////////////////////////////////////////////////////// -// Arbitrary-Length Integer: ac_int -////////////////////////////////////////////////////////////////////////////// - -template -class ac_int : public ac_private::iv_conv<(W+31+!S)/32, S, W<=64> -#ifndef __SYNTHESIS__ -__AC_INT_UTILITY_BASE -#endif -{ -#if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) -#pragma builtin -#endif - - enum {N=(W+31+!S)/32}; - typedef ac_private::iv_conv ConvBase; - typedef ac_private::iv Base; - - inline void bit_adjust() { - const unsigned rem = (32-W)&31; - Base::v[N-1] = S ? ((signed) ((unsigned) Base::v[N-1] << rem) >> rem) : (rem ? - ((unsigned) Base::v[N-1] << rem) >> rem : 0); - } - - inline bool is_neg() const { return S && Base::v[N-1] < 0; } - - // returns false if number is denormal - template - bool normalize_private(ac_int &exp, bool reserved_min_exp=false) { - int expt = exp; - int lshift = leading_sign(); - bool fully_normalized = true; - ac_int min_exp; - min_exp.template set_val(); - int max_shift = exp - min_exp - reserved_min_exp; - if(lshift > max_shift) { - lshift = ac_int(max_shift); - expt = min_exp + reserved_min_exp; - fully_normalized = false; - } else { - expt -= lshift; - } - if(Base::equal_zero()) { - expt = 0; - fully_normalized = true; - } - exp = expt; - Base r; - Base::shift_l(lshift, r); - Base::operator=(r); - bit_adjust(); - return fully_normalized; - } - -public: - static const int width = W; - static const int i_width = W; - static const bool sign = S; - static const ac_q_mode q_mode = AC_TRN; - static const ac_o_mode o_mode = AC_WRAP; - static const int e_width = 0; - - template - struct rt { - enum { - mult_w = W+W2, - mult_s = S||S2, - plus_w = AC_MAX(W+(S2&&!S),W2+(S&&!S2))+1, - plus_s = S||S2, - minus_w = AC_MAX(W+(S2&&!S),W2+(S&&!S2))+1, - minus_s = true, - div_w = W+S2, - div_s = S||S2, - mod_w = AC_MIN(W,W2+(!S2&&S)), - mod_s = S, - logic_w = AC_MAX(W+(S2&&!S),W2+(S&&!S2)), - logic_s = S||S2 - }; - typedef ac_int mult; - typedef ac_int plus; - typedef ac_int minus; - typedef ac_int logic; - typedef ac_int div; - typedef ac_int mod; - typedef ac_int arg1; - }; - - template - struct rt_T { - typedef typename ac_private::map::t map_T; - typedef typename ac_private::rt_ac_int_T::template op1::mult mult; - typedef typename ac_private::rt_ac_int_T::template op1::plus plus; - typedef typename ac_private::rt_ac_int_T::template op1::minus minus; - typedef typename ac_private::rt_ac_int_T::template op1::minus2 minus2; - typedef typename ac_private::rt_ac_int_T::template op1::logic logic; - typedef typename ac_private::rt_ac_int_T::template op1::div div; - typedef typename ac_private::rt_ac_int_T::template op1::div2 div2; - typedef ac_int arg1; - }; - - struct rt_unary { - enum { - neg_w = W+1, - neg_s = true, - mag_sqr_w = 2*W-S, - mag_sqr_s = false, - mag_w = W+S, - mag_s = false, - leading_sign_w = ac::log2_ceil::val, - leading_sign_s = false - }; - typedef ac_int neg; - typedef ac_int mag_sqr; - typedef ac_int mag; - typedef ac_int leading_sign; - template - struct set { - enum { sum_w = W + ac::log2_ceil::val, sum_s = S}; - typedef ac_int sum; - }; - }; - - template friend 
class ac_int; - template friend class ac_fixed; - ac_int() { -#if !defined(__SYNTHESIS__) && defined(AC_DEFAULT_IN_RANGE) - bit_adjust(); -#endif - } - template - inline ac_int (const ac_int &op) { - Base::operator =(op); - bit_adjust(); - } - - inline ac_int( bool b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( char b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( signed char b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( unsigned char b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( signed short b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( unsigned short b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( signed int b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( unsigned int b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( signed long b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( unsigned long b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( Slong b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( Ulong b ) : ConvBase(b) { bit_adjust(); } - inline ac_int( double d ) : ConvBase(d) { bit_adjust(); } - - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( push ) -#pragma warning( disable: 4700 ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#endif -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wuninitialized" -#endif - template - inline ac_int &set_val() { - const unsigned int all_ones = (unsigned) ~0; - if(V == AC_VAL_DC) { - ac_int r; - Base::operator =(r); - bit_adjust(); - } - else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - Base::operator =(0); - if(S && V == AC_VAL_MIN) { - const unsigned int rem = (W-1)&31; - Base::v[N-1] = (all_ones << rem); - } else if(V == AC_VAL_QUANTUM) - Base::v[0] = 1; - } - else { // AC_VAL_MAX - Base::operator =(-1); - const unsigned int rem = (32-W - !S )&31; - Base::v[N-1] = (all_ones >> 1) >> rem; - } - return *this; - } -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( pop ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - - // Explicit conversion functions to C built-in types ------------- - inline int to_int() const { return Base::v[0]; } - inline unsigned to_uint() const { return Base::v[0]; } - inline long to_long() const { - return ac_private::long_w == 32 ? (long) Base::v[0] : (long) Base::to_int64(); - } - inline unsigned long to_ulong() const { - return ac_private::long_w == 32 ? (unsigned long) Base::v[0] : (unsigned long) Base::to_uint64(); - } - inline Slong to_int64() const { return Base::to_int64(); } - inline Ulong to_uint64() const { return Base::to_uint64(); } - inline double to_double() const { return Base::to_double(); } - - inline int length() const { return W; } - - inline std::string to_string(ac_base_mode base_rep, bool sign_mag = false) const { - // base_rep == AC_DEC => sign_mag == don't care (always print decimal in sign magnitude) - char r[N*32+4] = {0}; - int i = 0; - if(sign_mag) - r[i++] = is_neg() ? '-' : '+'; - else if (base_rep == AC_DEC && is_neg()) - r[i++] = '-'; - if(base_rep != AC_DEC) { - r[i++] = '0'; - r[i++] = base_rep == AC_BIN ? 'b' : (base_rep == AC_OCT ? 
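-    // NOTE (editor's illustrative usage sketch; not part of the original ac_int.h):
-    //   ac_int<18,true> a = -12345;                     // 18-bit signed
-    //   ac_int<10,false> b = 700;                       // 10-bit unsigned
-    //   ac_int<18,true>::rt<10,false>::mult p = a * b;  // 28-bit signed product
-    //   std::string s = p.to_string(AC_DEC);            // "-8641500"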
'o' : 'x'); - } - int str_w; - if( (base_rep == AC_DEC || sign_mag) && is_neg() ) { - ac_int mag = operator -(); - str_w = ac_private::to_string(mag.v, W+1, sign_mag, base_rep, false, r+i); - } else { - ac_int tmp = *this; - str_w = ac_private::to_string(tmp.v, W+!S, sign_mag, base_rep, false, r+i); - } - if(!str_w) { - r[i] = '0'; - r[i+1] = 0; - } - return std::string(r); - } - inline static std::string type_name() { - const char *tf[] = {",false>", ",true>"}; - std::string r = "ac_int<"; - r += ac_int<32,true>(W).to_string(AC_DEC); - r += tf[S]; - return r; - } - - // Arithmetic : Binary ---------------------------------------------------- - template - typename rt::mult operator *( const ac_int &op2) const { - typename rt::mult r; - Base::mult(op2, r); - return r; - } - template - typename rt::plus operator +( const ac_int &op2) const { - typename rt::plus r; - Base::add(op2, r); - return r; - } - template - typename rt::minus operator -( const ac_int &op2) const { - typename rt::minus r; - Base::sub(op2, r); - return r; - } -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wenum-compare" -#endif - template - typename rt::div operator /( const ac_int &op2) const { - typename rt::div r; - enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, - num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = rt::div::N }; - Base::template div(op2, r); - return r; - } - template - typename rt::mod operator %( const ac_int &op2) const { - typename rt::mod r; - enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, - num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = rt::mod::N }; - Base::template rem(op2, r); - return r; - } -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif - // Arithmetic assign ------------------------------------------------------ - template - ac_int &operator *=( const ac_int &op2) { - Base r; - Base::mult(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator +=( const ac_int &op2) { - Base r; - Base::add(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator -=( const ac_int &op2) { - Base r; - Base::sub(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wenum-compare" -#endif - template - ac_int &operator /=( const ac_int &op2) { - enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, - num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = N }; - Base r; - Base::template div(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator %=( const ac_int &op2) { - enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, - num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = N }; - Base r; - Base::template rem(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif - // Arithmetic prefix increment, decrement ---------------------------------- - ac_int &operator ++() { - Base::increment(); - bit_adjust(); - return *this; - } - ac_int &operator --() { - 
Base::decrement(); - bit_adjust(); - return *this; - } - // Arithmetic postfix increment, decrement --------------------------------- - const ac_int operator ++(int) { - ac_int t = *this; - Base::increment(); - bit_adjust(); - return t; - } - const ac_int operator --(int) { - ac_int t = *this; - Base::decrement(); - bit_adjust(); - return t; - } - // Arithmetic Unary -------------------------------------------------------- - ac_int operator +() { - return *this; - } - typename rt_unary::neg operator -() const { - typename rt_unary::neg r; - Base::neg(r); - r.bit_adjust(); - return r; - } - // ! ------------------------------------------------------------------------ - bool operator ! () const { - return Base::equal_zero(); - } - - // Bitwise (arithmetic) unary: complement ----------------------------- - ac_int operator ~() const { - ac_int r; - Base::bitwise_complement(r); - return r; - } - // Bitwise (non-arithmetic) bit_complement ----------------------------- - ac_int bit_complement() const { - ac_int r; - Base::bitwise_complement(r); - r.bit_adjust(); - return r; - } - // Bitwise (arithmetic): and, or, xor ---------------------------------- - template - typename rt::logic operator & ( const ac_int &op2) const { - typename rt::logic r; - Base::bitwise_and(op2, r); - return r; - } - template - typename rt::logic operator | ( const ac_int &op2) const { - typename rt::logic r; - Base::bitwise_or(op2, r); - return r; - } - template - typename rt::logic operator ^ ( const ac_int &op2) const { - typename rt::logic r; - Base::bitwise_xor(op2, r); - return r; - } - // Bitwise assign (not arithmetic): and, or, xor ---------------------------- - template - ac_int &operator &= ( const ac_int &op2 ) { - Base r; - Base::bitwise_and(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator |= ( const ac_int &op2 ) { - Base r; - Base::bitwise_or(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator ^= ( const ac_int &op2 ) { - Base r; - Base::bitwise_xor(op2, r); - Base::operator=(r); - bit_adjust(); - return *this; - } - // Shift (result constrained by left operand) ------------------------------- - template - ac_int operator << ( const ac_int &op2 ) const { - ac_int r; - Base::shift_l2(op2.to_int(), r); - r.bit_adjust(); - return r; - } - template - ac_int operator << ( const ac_int &op2 ) const { - ac_int r; - Base::shift_l(op2.to_uint(), r); - r.bit_adjust(); - return r; - } - template - ac_int operator >> ( const ac_int &op2 ) const { - ac_int r; - Base::shift_r2(op2.to_int(), r); - r.bit_adjust(); - return r; - } - template - ac_int operator >> ( const ac_int &op2 ) const { - ac_int r; - Base::shift_r(op2.to_uint(), r); - r.bit_adjust(); - return r; - } - // Shift assign ------------------------------------------------------------ - template - ac_int &operator <<= ( const ac_int &op2 ) { - Base r; - Base::shift_l2(op2.to_int(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator <<= ( const ac_int &op2 ) { - Base r; - Base::shift_l(op2.to_uint(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator >>= ( const ac_int &op2 ) { - Base r; - Base::shift_r2(op2.to_int(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - template - ac_int &operator >>= ( const ac_int &op2 ) { - Base r; - Base::shift_r(op2.to_uint(), r); - Base::operator=(r); - bit_adjust(); - return *this; - } - // Relational 
--------------------------------------------------------------- - template - bool operator == ( const ac_int &op2) const { - return Base::equal(op2); - } - template - bool operator != ( const ac_int &op2) const { - return !Base::equal(op2); - } - template - bool operator < ( const ac_int &op2) const { - return Base::less_than(op2); - } - template - bool operator >= ( const ac_int &op2) const { - return !Base::less_than(op2); - } - template - bool operator > ( const ac_int &op2) const { - return Base::greater_than(op2); - } - template - bool operator <= ( const ac_int &op2) const { - return !Base::greater_than(op2); - } - - // Bit and Slice Select ----------------------------------------------------- - template - inline const ac_int slc(const ac_int &index) const { - ac_int r; - AC_ASSERT(index.to_int() >= 0, "Attempting to read slc with negative indeces"); - unsigned uindex = ac_int(index).to_uint(); - Base::shift_r(uindex, r); - r.bit_adjust(); - return r; - } - - template - inline const ac_int slc(signed index) const { - ac_int r; - AC_ASSERT(index >= 0, "Attempting to read slc with negative indeces"); - unsigned uindex = index & ((unsigned)~0 >> 1); - Base::shift_r(uindex, r); - r.bit_adjust(); - return r; - } - template - inline const ac_int slc(unsigned uindex) const { - ac_int r; - Base::shift_r(uindex, r); - r.bit_adjust(); - return r; - } - - template - inline ac_int &set_slc(const ac_int lsb, const ac_int &slc) { - AC_ASSERT(lsb.to_int() + W2 <= W && lsb.to_int() >= 0, "Out of bounds set_slc"); - if(W == W2) - Base::operator =(slc); - else { - unsigned ulsb = ac_int(lsb).to_uint(); - Base::set_slc(ulsb, W2, (ac_int) slc); - } - bit_adjust(); // in case sign bit was assigned - return *this; - } - template - inline ac_int &set_slc(signed lsb, const ac_int &slc) { - AC_ASSERT(lsb + W2 <= W && lsb >= 0, "Out of bounds set_slc"); - if(W == W2) - Base::operator =(slc); - else { - unsigned ulsb = lsb & ((unsigned)~0 >> 1); - Base::set_slc(ulsb, W2, (ac_int) slc); - } - bit_adjust(); // in case sign bit was assigned - return *this; - } - template - inline ac_int &set_slc(unsigned ulsb, const ac_int &slc) { - AC_ASSERT(ulsb + W2 <= W, "Out of bounds set_slc"); - if(W == W2) - Base::operator =(slc); - else - Base::set_slc(ulsb, W2, (ac_int) slc); - bit_adjust(); // in case sign bit was assigned - return *this; - } - - template - inline ac::sliceref range() { - #if __cplusplus > 199711L - static_assert(Msb-Lsb+1 > 0, "Range length not positive: MSB < LSB"); - static_assert(Lsb >= 0, "LSB is negative"); - static_assert(Msb < W, "MSB >= W"); - #endif - return ac::sliceref(Base::v); - } - - class ac_bitref { -# if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) -# pragma builtin -# endif - ac_int &d_bv; - unsigned d_index; - public: - ac_bitref( ac_int *bv, unsigned index=0 ) : d_bv(*bv), d_index(index) {} - operator bool () const { return (d_index < W) ? 
(d_bv.v[d_index>>5]>>(d_index&31) & 1) : 0; } - - template - operator ac_int () const { return operator bool (); } - - inline ac_bitref operator = ( int val ) { - // lsb of int (val&1) is written to bit - if(d_index < W) { - int *pval = &d_bv.v[d_index>>5]; - *pval ^= (*pval ^ ( (unsigned) val << (d_index&31) )) & 1 << (d_index&31); - d_bv.bit_adjust(); // in case sign bit was assigned - } - return *this; - } - template - inline ac_bitref operator = ( const ac_int &val ) { - return operator =(val.to_int()); - } - inline ac_bitref operator = ( const ac_bitref &val ) { - return operator =((int) (bool) val); - } - }; - - ac_bitref operator [] ( unsigned int uindex) { - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - ac_bitref bvh( this, uindex ); - return bvh; - } - ac_bitref operator [] ( int index) { - AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); - unsigned uindex = index & ((unsigned)~0 >> 1); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - ac_bitref bvh( this, uindex ); - return bvh; - } - template - ac_bitref operator [] ( const ac_int &index) { - AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); - unsigned uindex = ac_int(index).to_uint(); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - ac_bitref bvh( this, uindex ); - return bvh; - } - bool operator [] ( unsigned int uindex) const { - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; - } - bool operator [] ( int index) const { - AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); - unsigned uindex = index & ((unsigned)~0 >> 1); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; - } - template - bool operator [] ( const ac_int &index) const { - AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); - unsigned uindex = ac_int(index).to_uint(); - AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); - return (uindex < W) ? 
(Base::v[uindex>>5]>>(uindex&31) & 1) : 0; - } - - typename rt_unary::leading_sign leading_sign() const { - unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; - return ls; - } - typename rt_unary::leading_sign leading_sign(bool &all_sign) const { - unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; - all_sign = (ls == W-S); - return ls; - } - // returns false if number is denormal - template - bool normalize(ac_int &exp) { - return normalize_private(exp); - } - // returns false if number is denormal, minimum exponent is reserved (usually for encoding special values/errors) - template - bool normalize_RME(ac_int &exp) { - return normalize_private(exp, true); - } - bool and_reduce() const { - return ac_private::iv_equal_ones_to(Base::v); - } - bool or_reduce() const { - return !Base::equal_zero(); - } - bool xor_reduce() const { - unsigned r = Base::v[N-1]; - if(S) { - const unsigned rem = (32-W)&31; - r = (r << rem) >> rem; - } - if(N > 1) - r ^= Base::v[N-2]; - if(N > 2) { - for(int i=0; i 16) - r ^= r >> 16; - if(W > 8) - r ^= r >> 8; - if(W > 4) - r ^= r >> 4; - if(W > 2) - r ^= r >> 2; - if(W > 1) - r ^= r >> 1; - return r&1; - } - - inline void bit_fill_hex(const char *str) { - // Zero Pads if str is too short, throws ms bits away if str is too long - // Asserts if anything other than 0-9a-fA-F is encountered - ac_int res = 0; - while(*str) { - char c = *str; - int h = 0; - if(c >= '0' && c <= '9') - h = c - '0'; - else if(c >= 'A' && c <= 'F') - h = c - 'A' + 10; - else if(c >= 'a' && c <= 'f') - h = c - 'a' + 10; - else { - AC_ASSERT(!c, "Invalid hex digit"); - break; - } - res <<= ac_int<3,false>(4); - res |= ac_int<4,false>(h); - str++; - } - *this = res; - } - - template - inline void bit_fill(const int (&ivec)[Na], bool bigendian=true) { - // bit_fill from integer vector - // if W > N*32, missing most significant bits are zeroed - // if W < N*32, additional bits in ivec are ignored (no overflow checking) - // Example: - // ac_int<80,false> x; int vec[] = { 0xffffa987, 0x6543210f, 0xedcba987 }; - // x.bit_fill(vec); // vec[0] fill bits 79-64 - enum { N0 = (W+31)/32, M = AC_MIN(N0,Na) }; - ac_int res = 0; - for(int i=0; i < M; i++) - res.set_slc(i*32, ac_int<32>(ivec[bigendian ? 
M-1-i : i]));
-    *this = res;
-  }
-};
-
-namespace ac {
-  template<typename T, typename T2>
-  struct rt_2T {
-    typedef typename ac_private::map<T>::t map_T;
-    typedef typename ac_private::map<T2>::t map_T2;
-    typedef typename map_T::template rt_T< map_T2 >::mult mult;
-    typedef typename map_T::template rt_T< map_T2 >::plus plus;
-    typedef typename map_T::template rt_T< map_T2 >::minus minus;
-    typedef typename map_T::template rt_T< map_T2 >::minus2 minus2;
-    typedef typename map_T::template rt_T< map_T2 >::logic logic;
-    typedef typename map_T::template rt_T< map_T2 >::div div;
-    typedef typename map_T::template rt_T< map_T2 >::div2 div2;
-  };
-}
-
-namespace ac {
-  template<typename T>
-  struct ac_int_represent {
-    enum { t_w = ac_private::c_type_params<T>::W, t_s = ac_private::c_type_params<T>::S };
-    typedef ac_int<t_w,t_s> type;
-  };
-  template<> struct ac_int_represent<float> {};
-  template<> struct ac_int_represent<double> {};
-  template<int W, bool S>
-  struct ac_int_represent< ac_int<W,S> > {
-    typedef ac_int<W,S> type;
-  };
-}
-
-namespace ac_private {
-  template<int W2, bool S2>
-  struct rt_ac_int_T< ac_int<W2,S2> > {
-    typedef ac_int<W2,S2> i2_t;
-    template<int W, bool S>
-    struct op1 {
-      typedef ac_int<W,S> i_t;
-      typedef typename i_t::template rt<W2,S2>::mult mult;
-      typedef typename i_t::template rt<W2,S2>::plus plus;
-      typedef typename i_t::template rt<W2,S2>::minus minus;
-      typedef typename i2_t::template rt<W,S>::minus minus2;
-      typedef typename i_t::template rt<W2,S2>::logic logic;
-      typedef typename i_t::template rt<W2,S2>::div div;
-      typedef typename i2_t::template rt<W,S>::div div2;
-      typedef typename i_t::template rt<W2,S2>::mod mod;
-      typedef typename i2_t::template rt<W,S>::mod mod2;
-    };
-  };
-
-  template<typename T>
-  struct rt_ac_int_T< c_type<T> > {
-    typedef typename ac::ac_int_represent<T>::type i2_t;
-    enum { W2 = i2_t::width, S2 = i2_t::sign };
-    template<int W, bool S>
-    struct op1 {
-      typedef ac_int<W,S> i_t;
-      typedef typename i_t::template rt<W2,S2>::mult mult;
-      typedef typename i_t::template rt<W2,S2>::plus plus;
-      typedef typename i_t::template rt<W2,S2>::minus minus;
-      typedef typename i2_t::template rt<W,S>::minus minus2;
-      typedef typename i_t::template rt<W2,S2>::logic logic;
-      typedef typename i_t::template rt<W2,S2>::div div;
-      typedef typename i2_t::template rt<W,S>::div div2;
-      typedef typename i_t::template rt<W2,S2>::mod mod;
-      typedef typename i2_t::template rt<W,S>::mod mod2;
-    };
-  };
-}
-
-
-// Specializations for constructors on integers that bypass bit adjusting
-// and are therefore more efficient
-template<> inline ac_int<1,true>::ac_int( bool b ) { v[0] = b ?
-1 : 0; } - -template<> inline ac_int<1,false>::ac_int( bool b ) { v[0] = b; } -template<> inline ac_int<1,false>::ac_int( signed char b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( unsigned char b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( signed short b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( unsigned short b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( signed int b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( unsigned int b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( signed long b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( unsigned long b ) { v[0] = b&1; } -template<> inline ac_int<1,false>::ac_int( Ulong b ) { v[0] = (int) b&1; } -template<> inline ac_int<1,false>::ac_int( Slong b ) { v[0] = (int) b&1; } - -template<> inline ac_int<8,true>::ac_int( bool b ) { v[0] = b; } -template<> inline ac_int<8,false>::ac_int( bool b ) { v[0] = b; } -template<> inline ac_int<8,true>::ac_int( signed char b ) { v[0] = b; } -template<> inline ac_int<8,false>::ac_int( unsigned char b ) { v[0] = b; } -template<> inline ac_int<8,true>::ac_int( unsigned char b ) { v[0] = (signed char) b; } -template<> inline ac_int<8,false>::ac_int( signed char b ) { v[0] = (unsigned char) b; } - -template<> inline ac_int<16,true>::ac_int( bool b ) { v[0] = b; } -template<> inline ac_int<16,false>::ac_int( bool b ) { v[0] = b; } -template<> inline ac_int<16,true>::ac_int( signed char b ) { v[0] = b; } -template<> inline ac_int<16,false>::ac_int( unsigned char b ) { v[0] = b; } -template<> inline ac_int<16,true>::ac_int( unsigned char b ) { v[0] = b; } -template<> inline ac_int<16,false>::ac_int( signed char b ) { v[0] = (unsigned short) b; } -template<> inline ac_int<16,true>::ac_int( signed short b ) { v[0] = b; } -template<> inline ac_int<16,false>::ac_int( unsigned short b ) { v[0] = b; } -template<> inline ac_int<16,true>::ac_int( unsigned short b ) { v[0] = (signed short) b; } -template<> inline ac_int<16,false>::ac_int( signed short b ) { v[0] = (unsigned short) b; } - -template<> inline ac_int<32,true>::ac_int( signed int b ) { v[0] = b; } -template<> inline ac_int<32,true>::ac_int( unsigned int b ) { v[0] = b; } -template<> inline ac_int<32,false>::ac_int( signed int b ) { v[0] = b; v[1] = 0;} -template<> inline ac_int<32,false>::ac_int( unsigned int b ) { v[0] = b; v[1] = 0;} - -template<> inline ac_int<32,true>::ac_int( Slong b ) { v[0] = (int) b; } -template<> inline ac_int<32,true>::ac_int( Ulong b ) { v[0] = (int) b; } -template<> inline ac_int<32,false>::ac_int( Slong b ) { v[0] = (int) b; v[1] = 0;} -template<> inline ac_int<32,false>::ac_int( Ulong b ) { v[0] = (int) b; v[1] = 0;} - -template<> inline ac_int<64,true>::ac_int( Slong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); } -template<> inline ac_int<64,true>::ac_int( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32);} -template<> inline ac_int<64,false>::ac_int( Slong b ) { v[0] = (int) b; v[1] = (int) ((Ulong) b >> 32); v[2] = 0; } -template<> inline ac_int<64,false>::ac_int( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); v[2] = 0; } - -// Stream -------------------------------------------------------------------- - -template -inline std::ostream& operator << (std::ostream &os, const ac_int &x) { -#ifndef __SYNTHESIS__ - if ((os.flags() & std::ios::hex) != 0) { - os << x.to_string(AC_HEX); - } else if ((os.flags() & std::ios::oct) != 0) { - os << x.to_string(AC_OCT); - } else { - os << x.to_string(AC_DEC); - } -#endif 
- return os;
-}
-
-// Macros for Binary Operators with Integers --------------------------------------------
-
-#define BIN_OP_WITH_INT(BIN_OP, C_TYPE, WI, SI, RTYPE)  \
-  template<int W, bool S> \
-  inline typename ac_int<WI,SI>::template rt<W,S>::RTYPE operator BIN_OP ( C_TYPE i_op, const ac_int<W,S> &op) {  \
-    return ac_int<WI,SI>(i_op).operator BIN_OP (op);  \
-  } \
-  template<int W, bool S>   \
-  inline typename ac_int<W,S>::template rt<WI,SI>::RTYPE operator BIN_OP ( const ac_int<W,S> &op, C_TYPE i_op) {  \
-    return op.operator BIN_OP (ac_int<WI,SI>(i_op));  \
-  }
-
-#define REL_OP_WITH_INT(REL_OP, C_TYPE, W2, S2)  \
-  template<int W, bool S>   \
-  inline bool operator REL_OP ( const ac_int<W,S> &op, C_TYPE op2) {  \
-    return op.operator REL_OP (ac_int<W2,S2>(op2));  \
-  }  \
-  template<int W, bool S> \
-  inline bool operator REL_OP ( C_TYPE op2, const ac_int<W,S> &op) {  \
-    return ac_int<W2,S2>(op2).operator REL_OP (op);  \
-  }
-
-#define ASSIGN_OP_WITH_INT(ASSIGN_OP, C_TYPE, W2, S2)  \
-  template<int W, bool S> \
-  inline ac_int<W,S> &operator ASSIGN_OP ( ac_int<W,S> &op, C_TYPE op2) {  \
-    return op.operator ASSIGN_OP (ac_int<W2,S2>(op2));  \
-  }
-
-#define OPS_WITH_INT(C_TYPE, WI, SI) \
-  BIN_OP_WITH_INT(*, C_TYPE, WI, SI, mult) \
-  BIN_OP_WITH_INT(+, C_TYPE, WI, SI, plus) \
-  BIN_OP_WITH_INT(-, C_TYPE, WI, SI, minus) \
-  BIN_OP_WITH_INT(/, C_TYPE, WI, SI, div) \
-  BIN_OP_WITH_INT(%, C_TYPE, WI, SI, mod) \
-  BIN_OP_WITH_INT(>>, C_TYPE, WI, SI, arg1) \
-  BIN_OP_WITH_INT(<<, C_TYPE, WI, SI, arg1) \
-  BIN_OP_WITH_INT(&, C_TYPE, WI, SI, logic) \
-  BIN_OP_WITH_INT(|, C_TYPE, WI, SI, logic) \
-  BIN_OP_WITH_INT(^, C_TYPE, WI, SI, logic) \
-  \
-  REL_OP_WITH_INT(==, C_TYPE, WI, SI) \
-  REL_OP_WITH_INT(!=, C_TYPE, WI, SI) \
-  REL_OP_WITH_INT(>, C_TYPE, WI, SI) \
-  REL_OP_WITH_INT(>=, C_TYPE, WI, SI) \
-  REL_OP_WITH_INT(<, C_TYPE, WI, SI) \
-  REL_OP_WITH_INT(<=, C_TYPE, WI, SI) \
-  \
-  ASSIGN_OP_WITH_INT(+=, C_TYPE, WI, SI) \
-  ASSIGN_OP_WITH_INT(-=, C_TYPE, WI, SI) \
-  ASSIGN_OP_WITH_INT(*=, C_TYPE, WI, SI) \
-  ASSIGN_OP_WITH_INT(/=, C_TYPE, WI, SI) \
-  ASSIGN_OP_WITH_INT(%=, C_TYPE, WI, SI) \
-  ASSIGN_OP_WITH_INT(>>=, C_TYPE, WI, SI) \
-  ASSIGN_OP_WITH_INT(<<=, C_TYPE, WI, SI) \
-  ASSIGN_OP_WITH_INT(&=, C_TYPE, WI, SI) \
-  ASSIGN_OP_WITH_INT(|=, C_TYPE, WI, SI) \
-  ASSIGN_OP_WITH_INT(^=, C_TYPE, WI, SI)
-
-// ------------------------------------- End of Macros for Binary Operators with Integers
-
-// for backward compatibility with v3.9.0 and earlier define following macro
-#ifdef AC_INT_NS_FOR_MIXED_OPERATORS
-namespace ac {
-  namespace ops_with_other_types {
-#endif
-// Mixed Operators with Integers -----------------------------------------------
-OPS_WITH_INT(bool, 1, false)
-OPS_WITH_INT(char, 8, true)
-OPS_WITH_INT(signed char, 8, true)
-OPS_WITH_INT(unsigned char, 8, false)
-OPS_WITH_INT(short, 16, true)
-OPS_WITH_INT(unsigned short, 16, false)
-OPS_WITH_INT(int, 32, true)
-OPS_WITH_INT(unsigned int, 32, false)
-OPS_WITH_INT(long, ac_private::long_w, true)
-OPS_WITH_INT(unsigned long, ac_private::long_w, false)
-OPS_WITH_INT(Slong, 64, true)
-OPS_WITH_INT(Ulong, 64, false)
-// ----------------------------------------- End of Mixed Operators with Integers
-#ifdef AC_INT_NS_FOR_MIXED_OPERATORS
-  } // ops_with_other_types namespace
-}
-using namespace ac::ops_with_other_types;
-#endif
-
-namespace ac {
-  // Functions to fill bits
-
-  template<typename T>
-  inline T bit_fill_hex(const char *str) {
-    T res;
-    res.bit_fill_hex(str);
-    return res;
-  }
-
-  // returns bit_fill for type
-  //   example:
-  //   ac_int<80,false> x = ac::bit_fill< ac_int<80,false> > ((int [3]) {0xffffa987, 0x6543210f, 0xedcba987 });
-  template<typename T, int N>
-  inline T bit_fill(const int (&ivec)[N], bool bigendian=true)
{ - T res; - res.bit_fill(ivec, bigendian); - return res; - } - -} // ac namespace - -// Mixed Operators with Pointers ----------------------------------------------- - -// Addition of ac_int and pointer -template -T *operator +(T *ptr, const ac_int &op2) { - return ptr + op2.to_int64(); -} -template -T *operator +(const ac_int &op2, T *ptr) { - return ptr + op2.to_int64(); -} -// Subtraction of ac_int from pointer -template -T *operator -(T *ptr, const ac_int &op2) { - return ptr - op2.to_int64(); -} -// ----------------------------------------- End of Mixed Operators with Pointers - -namespace ac_intN { - /////////////////////////////////////////////////////////////////////////////// - // Predefined for ease of use - /////////////////////////////////////////////////////////////////////////////// - typedef ac_int<1, true> int1; - typedef ac_int<1, false> uint1; - typedef ac_int<2, true> int2; - typedef ac_int<2, false> uint2; - typedef ac_int<3, true> int3; - typedef ac_int<3, false> uint3; - typedef ac_int<4, true> int4; - typedef ac_int<4, false> uint4; - typedef ac_int<5, true> int5; - typedef ac_int<5, false> uint5; - typedef ac_int<6, true> int6; - typedef ac_int<6, false> uint6; - typedef ac_int<7, true> int7; - typedef ac_int<7, false> uint7; - typedef ac_int<8, true> int8; - typedef ac_int<8, false> uint8; - typedef ac_int<9, true> int9; - typedef ac_int<9, false> uint9; - typedef ac_int<10, true> int10; - typedef ac_int<10, false> uint10; - typedef ac_int<11, true> int11; - typedef ac_int<11, false> uint11; - typedef ac_int<12, true> int12; - typedef ac_int<12, false> uint12; - typedef ac_int<13, true> int13; - typedef ac_int<13, false> uint13; - typedef ac_int<14, true> int14; - typedef ac_int<14, false> uint14; - typedef ac_int<15, true> int15; - typedef ac_int<15, false> uint15; - typedef ac_int<16, true> int16; - typedef ac_int<16, false> uint16; - typedef ac_int<17, true> int17; - typedef ac_int<17, false> uint17; - typedef ac_int<18, true> int18; - typedef ac_int<18, false> uint18; - typedef ac_int<19, true> int19; - typedef ac_int<19, false> uint19; - typedef ac_int<20, true> int20; - typedef ac_int<20, false> uint20; - typedef ac_int<21, true> int21; - typedef ac_int<21, false> uint21; - typedef ac_int<22, true> int22; - typedef ac_int<22, false> uint22; - typedef ac_int<23, true> int23; - typedef ac_int<23, false> uint23; - typedef ac_int<24, true> int24; - typedef ac_int<24, false> uint24; - typedef ac_int<25, true> int25; - typedef ac_int<25, false> uint25; - typedef ac_int<26, true> int26; - typedef ac_int<26, false> uint26; - typedef ac_int<27, true> int27; - typedef ac_int<27, false> uint27; - typedef ac_int<28, true> int28; - typedef ac_int<28, false> uint28; - typedef ac_int<29, true> int29; - typedef ac_int<29, false> uint29; - typedef ac_int<30, true> int30; - typedef ac_int<30, false> uint30; - typedef ac_int<31, true> int31; - typedef ac_int<31, false> uint31; - typedef ac_int<32, true> int32; - typedef ac_int<32, false> uint32; - typedef ac_int<33, true> int33; - typedef ac_int<33, false> uint33; - typedef ac_int<34, true> int34; - typedef ac_int<34, false> uint34; - typedef ac_int<35, true> int35; - typedef ac_int<35, false> uint35; - typedef ac_int<36, true> int36; - typedef ac_int<36, false> uint36; - typedef ac_int<37, true> int37; - typedef ac_int<37, false> uint37; - typedef ac_int<38, true> int38; - typedef ac_int<38, false> uint38; - typedef ac_int<39, true> int39; - typedef ac_int<39, false> uint39; - typedef ac_int<40, true> int40; - typedef 
ac_int<40, false> uint40; - typedef ac_int<41, true> int41; - typedef ac_int<41, false> uint41; - typedef ac_int<42, true> int42; - typedef ac_int<42, false> uint42; - typedef ac_int<43, true> int43; - typedef ac_int<43, false> uint43; - typedef ac_int<44, true> int44; - typedef ac_int<44, false> uint44; - typedef ac_int<45, true> int45; - typedef ac_int<45, false> uint45; - typedef ac_int<46, true> int46; - typedef ac_int<46, false> uint46; - typedef ac_int<47, true> int47; - typedef ac_int<47, false> uint47; - typedef ac_int<48, true> int48; - typedef ac_int<48, false> uint48; - typedef ac_int<49, true> int49; - typedef ac_int<49, false> uint49; - typedef ac_int<50, true> int50; - typedef ac_int<50, false> uint50; - typedef ac_int<51, true> int51; - typedef ac_int<51, false> uint51; - typedef ac_int<52, true> int52; - typedef ac_int<52, false> uint52; - typedef ac_int<53, true> int53; - typedef ac_int<53, false> uint53; - typedef ac_int<54, true> int54; - typedef ac_int<54, false> uint54; - typedef ac_int<55, true> int55; - typedef ac_int<55, false> uint55; - typedef ac_int<56, true> int56; - typedef ac_int<56, false> uint56; - typedef ac_int<57, true> int57; - typedef ac_int<57, false> uint57; - typedef ac_int<58, true> int58; - typedef ac_int<58, false> uint58; - typedef ac_int<59, true> int59; - typedef ac_int<59, false> uint59; - typedef ac_int<60, true> int60; - typedef ac_int<60, false> uint60; - typedef ac_int<61, true> int61; - typedef ac_int<61, false> uint61; - typedef ac_int<62, true> int62; - typedef ac_int<62, false> uint62; - typedef ac_int<63, true> int63; - typedef ac_int<63, false> uint63; -} // namespace ac_intN - -#ifndef AC_NOT_USING_INTN -using namespace ac_intN; -#endif - -/////////////////////////////////////////////////////////////////////////////// - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( disable: 4700 ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" -#endif -#if defined(__clang__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wuninitialized" -#endif - -// Global templatized functions for easy initialization to special values -template -inline ac_int value(ac_int) { - ac_int r; - return r.template set_val(); -} -// forward declaration, otherwise GCC errors when calling init_array -template -inline ac_fixed value(ac_fixed); - -#define SPECIAL_VAL_FOR_INTS_DC(C_TYPE, WI, SI) \ -template<> inline C_TYPE value(C_TYPE) { C_TYPE x; return x; } - -// -- C int types ----------------------------------------------------------------- -#define SPECIAL_VAL_FOR_INTS(C_TYPE, WI, SI) \ -template inline C_TYPE value(C_TYPE); \ -template<> inline C_TYPE value(C_TYPE) { return (C_TYPE)0; } \ -SPECIAL_VAL_FOR_INTS_DC(C_TYPE, WI, SI) \ -template<> inline C_TYPE value(C_TYPE) { return (C_TYPE)1; } \ -template<> inline C_TYPE value(C_TYPE) { return (C_TYPE)(SI ? ~(((C_TYPE) 1) << (WI-1)) : (C_TYPE) -1); } \ -template<> inline C_TYPE value(C_TYPE) { return (C_TYPE)(SI ? 
((C_TYPE) 1) << (WI-1) : (C_TYPE) 0); } - -SPECIAL_VAL_FOR_INTS(bool, 1, false) -SPECIAL_VAL_FOR_INTS(char, 8, true) -SPECIAL_VAL_FOR_INTS(signed char, 8, true) -SPECIAL_VAL_FOR_INTS(unsigned char, 8, false) -SPECIAL_VAL_FOR_INTS(short, 16, true) -SPECIAL_VAL_FOR_INTS(unsigned short, 16, false) -SPECIAL_VAL_FOR_INTS(int, 32, true) -SPECIAL_VAL_FOR_INTS(unsigned int, 32, false) -SPECIAL_VAL_FOR_INTS(long, ac_private::long_w, true) -SPECIAL_VAL_FOR_INTS(unsigned long, ac_private::long_w, false) -SPECIAL_VAL_FOR_INTS(Slong, 64, true) -SPECIAL_VAL_FOR_INTS(Ulong, 64, false) - -#define INIT_ARRAY_SPECIAL_VAL_FOR_INTS(C_TYPE) \ - template \ - inline bool init_array(C_TYPE *a, int n) { \ - C_TYPE t = value((C_TYPE) 0); \ - for(int i=0; i < n; i++) \ - a[i] = t; \ - return true; \ - } - -namespace ac { -// PUBLIC FUNCTIONS -// function to initialize (or uninitialize) arrays - template - inline bool init_array(ac_int *a, int n) { - ac_int t; - t.template set_val(); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } - - INIT_ARRAY_SPECIAL_VAL_FOR_INTS(bool) - INIT_ARRAY_SPECIAL_VAL_FOR_INTS(char) - INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed char) - INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned char) - INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed short) - INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned short) - INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed int) - INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned int) - INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed long) - INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned long) - INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed long long) - INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned long long) -} - -#if (defined(_MSC_VER) && !defined(__EDG__)) -#pragma warning( pop ) -#endif -#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) -#pragma GCC diagnostic pop -#endif -#if defined(__clang__) -#pragma clang diagnostic pop -#endif - -#ifdef __AC_NAMESPACE -} -#endif - -#endif // __AC_INT_H +/************************************************************************** + * * + * Algorithmic C (tm) Datatypes * + * * + * Software Version: 4.0 * + * * + * Release Date : Sat Jun 13 12:35:18 PDT 2020 * + * Release Type : Production Release * + * Release Build : 4.0.0 * + * * + * Copyright 2004-2020, Mentor Graphics Corporation, * + * * + * All Rights Reserved. * + * * + ************************************************************************** + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * + * implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + ************************************************************************** + * * + * The most recent version of this package is available at github. * + * * + *************************************************************************/ + +/* +// Source: ac_int.h +// Description: fast arbitrary-length bit-accurate integer types: +// - unsigned integer of length W: ac_int +// - signed integer of length W: ac_int +// Author: Andres Takach, Ph.D. 
+// Notes:
+//  - C++ Runtime: important to use optimization flag (for example -O3)
+//
+//  - Compiler support: recent GNU compilers are required for correct
+//    template compilation
+//
+//  - Most frequent migration issues:
+//     - need to cast to common type when using question mark operator:
+//         (a < 0) ? -a : a;  // a is ac_int<W,true>
+//       change to:
+//         (a < 0) ? -a : (ac_int<W,true>) a;
+//       or
+//         (a < 0) ? (ac_int<W+1,true>) -a : (ac_int<W+1,true>) a;
+//
+//     - left shift is not arithmetic ("a<<n" has same bitwidth as "a"):
+//         ac_int<W+1,false> b = a << 1;  // a is ac_int<W,false>
+//       is not equivalent to b=2*a. In order to get 2*a behavior change to:
+//         ac_int<W+1,false> b = (ac_int<W+1,false>)a << 1;
+//
+//     - only static length read/write slices are supported:
+//        - read:  x.slc<4>(k) => returns ac_int for 4-bit slice x(4+k-1 DOWNTO k)
+//        - write: x.set_slc(k,y) = writes bits of y to x starting at index k
+*/
+
+#ifndef __AC_INT_H
+#define __AC_INT_H
+
+#define AC_VERSION 3
+#define AC_VERSION_MINOR 9
+
+#ifndef __cplusplus
+#error C++ is required to include this header file
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ < 3 && !defined(__EDG__))
+#error GCC version 3 or greater is required to include this header file
+#endif
+
+#if (defined(_MSC_VER) && _MSC_VER < 1400 && !defined(__EDG__))
+#error Microsoft Visual Studio 8 or newer is required to include this header file
+#endif
+
+#if (defined(_MSC_VER) && !defined(__EDG__))
+#pragma warning( push )
+#pragma warning( disable: 4127 4100 4244 4307 4310 4365 4514 4554 4706 4800 )
+#endif
+
+// for safety
+#if (defined(N) || defined(N2))
+#error One or more of the following is defined: N, N2. Definition conflicts with their usage as template parameters.
+#error DO NOT use defines before including third party header files.
+#endif
+
+// for safety
+#if (defined(W) || defined(I) || defined(S) || defined(W2) || defined(I2) || defined(S2))
+#error One or more of the following is defined: W, I, S, W2, I2, S2. Definition conflicts with their usage as template parameters.
+#error DO NOT use defines before including third party header files.
+#endif
+
+#if defined(true)
+#warning The C++ keyword true is defined which may result in subtle compilation problems. Undefining it.
+#undef true
+#endif
+#if defined(false)
+#warning The C++ keyword false is defined which may result in subtle compilation problems. Undefining it.
+#undef false
+#endif
+
+#ifndef __ASSERT_H__
+#define __ASSERT_H__
+#include <assert.h>
+#endif
+#include <limits>
+#ifndef AC_USER_DEFINED_ASSERT
+#include <iostream>
+#else
+#include <ostream>
+#endif
+#include <math.h>
+#include <string>
+
+#ifndef __SYNTHESIS__
+#ifndef __AC_INT_UTILITY_BASE
+#define __AC_INT_UTILITY_BASE
+#endif
+
+#endif
+
+#ifdef __AC_NAMESPACE
+namespace __AC_NAMESPACE {
+#endif
+
+#define AC_MAX(a,b) ((a) > (b) ? (a) : (b))
+#define AC_MIN(a,b) ((a) < (b) ? (a) : (b))
+#define AC_ABS(a) ((a) < 0 ?
-(a) : (a)) + +#if defined(_MSC_VER) +typedef unsigned __int64 Ulong; +typedef signed __int64 Slong; +#else +typedef unsigned long long Ulong; +typedef signed long long Slong; +#endif + +enum ac_base_mode { AC_BIN=2, AC_OCT=8, AC_DEC=10, AC_HEX=16 }; +enum ac_special_val {AC_VAL_DC, AC_VAL_0, AC_VAL_MIN, AC_VAL_MAX, AC_VAL_QUANTUM}; + +template class ac_int; + +namespace ac_private { +#if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) +#pragma builtin +#endif + + enum {long_w = std::numeric_limits::digits}; + const unsigned int all_ones = (unsigned) ~0; + + // PRIVATE FUNCTIONS in namespace: for implementing ac_int/ac_fixed + +#ifndef __SYNTHESIS__ + inline double mgc_floor(double d) { return floor(d); } +#else + inline double mgc_floor(double d) { return 0.0; } +#endif + + #define AC_ASSERT(cond, msg) ac_private::ac_assert(cond, __FILE__, __LINE__, msg) + inline void ac_assert(bool condition, const char *file=0, int line=0, const char *msg=0) { + #ifndef __SYNTHESIS__ + #ifndef AC_USER_DEFINED_ASSERT + if(!condition) { + std::cerr << "Assert"; + if(file) + std::cerr << " in file " << file << ":" << line; + if(msg) + std::cerr << " " << msg; + std::cerr << std::endl; + assert(0); + } + #else + AC_USER_DEFINED_ASSERT(condition, file, line, msg); + #endif + #endif + } + + // helper structs for statically computing log2 like functions (nbits, log2_floor, log2_ceil) + // using recursive templates + template + struct s_N { + template + struct s_X { + enum { + X2 = X >> N, + N_div_2 = N >> 1, + nbits = X ? (X2 ? N + (int) s_N::template s_X::nbits : (int) s_N::template s_X::nbits) : 0 + }; + }; + }; + template<> struct s_N<0> { + template + struct s_X { + enum {nbits = !!X }; + }; + }; + + template + inline double ldexpr32(double d) { + double d2 = d; + if(N < 0) + for(int i=0; i < -N; i++) + d2 /= (Ulong) 1 << 32; + else + for(int i=0; i < N; i++) + d2 *= (Ulong) 1 << 32; + return d2; + } + template<> inline double ldexpr32<0>(double d) { return d; } + template<> inline double ldexpr32<1>(double d) { return d * ((Ulong) 1 << 32); } + template<> inline double ldexpr32<-1>(double d) { return d / ((Ulong) 1 << 32); } + template<> inline double ldexpr32<2>(double d) { return (d * ((Ulong) 1 << 32)) * ((Ulong) 1 << 32); } + template<> inline double ldexpr32<-2>(double d) { return (d / ((Ulong) 1 << 32)) / ((Ulong) 1 << 32); } + + template + inline double ldexpr(double d) { + return ldexpr32( N < 0 ? 
d/( (unsigned) 1 << (-N & 31)) : d * ( (unsigned) 1 << (N & 31))); + } + + template + inline void iv_copy(const int *op, int *r) { + for(int i=0; i < N; i++) + r[i] = op[i]; + } + template<> inline void iv_copy<1>(const int *op, int *r) { + r[0] = op[0]; + } + template<> inline void iv_copy<2>(const int *op, int *r) { + r[0] = op[0]; + r[1] = op[1]; + } + + template + inline bool iv_equal_zero(const int *op){ + for(int i=0; i < N; i++) + if(op[i]) + return false; + return true; + } + template<> inline bool iv_equal_zero<0>(const int * /*op*/) { return true; } + template<> inline bool iv_equal_zero<1>(const int *op) { + return !op[0]; + } + template<> inline bool iv_equal_zero<2>(const int *op) { + return !(op[0] || op[1]); + } + + template + inline bool iv_equal_ones(const int *op){ + for(int i=0; i < N; i++) + if(~op[i]) + return false; + return true; + } + template<> inline bool iv_equal_ones<0>(const int * /*op*/) { return true; } + template<> inline bool iv_equal_ones<1>(const int *op) { + return !~op[0]; + } + template<> inline bool iv_equal_ones<2>(const int *op) { + return !(~op[0] || ~op[1]); + } + + template + inline bool iv_equal(const int *op1, const int *op2){ + const int M1 = AC_MAX(N1,N2); + const int M2 = AC_MIN(N1,N2); + const int *OP1 = N1 >= N2 ? op1 : op2; + const int *OP2 = N1 >= N2 ? op2 : op1; + for(int i=0; i < M2; i++) + if(OP1[i] != OP2[i]) + return false; + int ext = OP2[M2-1] < 0 ? ~0 : 0; + for(int i=M2; i < M1; i++) + if(OP1[i] != ext) + return false; + return true; + } + template<> inline bool iv_equal<1,1>(const int *op1, const int *op2) { + return op1[0] == op2[0]; + } + + template + inline bool iv_equal_ones_from(const int *op){ + if((B >= 32*N && op[N-1] >= 0) || (B&31 && ~(op[B/32] >> (B&31)))) + return false; + return iv_equal_ones(&op[(B+31)/32]); + } + template<> inline bool iv_equal_ones_from<0,1>(const int *op){ + return iv_equal_ones<1>(op); + } + template<> inline bool iv_equal_ones_from<0,2>(const int *op){ + return iv_equal_ones<2>(op); + } + + template + inline bool iv_equal_zeros_from(const int *op){ + if((B >= 32*N && op[N-1] < 0) || (B&31 && (op[B/32] >> (B&31)))) + return false; + return iv_equal_zero(&op[(B+31)/32]); + } + template<> inline bool iv_equal_zeros_from<0,1>(const int *op){ + return iv_equal_zero<1>(op); + } + template<> inline bool iv_equal_zeros_from<0,2>(const int *op){ + return iv_equal_zero<2>(op); + } + + template + inline bool iv_equal_ones_to(const int *op){ + if((B >= 32*N && op[N-1] >= 0) || (B&31 && ~(op[B/32] | (all_ones << (B&31))))) + return false; + return iv_equal_ones(op); + } + template<> inline bool iv_equal_ones_to<0,1>(const int *op){ + return iv_equal_ones<1>(op); + } + template<> inline bool iv_equal_ones_to<0,2>(const int *op){ + return iv_equal_ones<2>(op); + } + + template + inline bool iv_equal_zeros_to(const int *op){ + if((B >= 32*N && op[N-1] < 0) || (B&31 && (op[B/32] & ~(all_ones << (B&31))))) + return false; + return iv_equal_zero(op); + } + template<> inline bool iv_equal_zeros_to<0,1>(const int *op){ + return iv_equal_zero<1>(op); + } + template<> inline bool iv_equal_zeros_to<0,2>(const int *op){ + return iv_equal_zero<2>(op); + } + + template + inline bool iv_compare(const int *op1, const int *op2){ + const int M1 = AC_MAX(N1,N2); + const int M2 = AC_MIN(N1,N2); + const int *OP1 = N1 >= N2 ? op1 : op2; + const int *OP2 = N1 >= N2 ? op2 : op1; + const bool b = (N1 >= N2) == greater; + int ext = OP2[M2-1] < 0 ? ~0 : 0; + int i2 = M1 > M2 ? 
ext : OP2[M1-1]; + if(OP1[M1-1] != i2) + return b ^ (OP1[M1-1] < i2); + for(int i=M1-2; i >= M2; i--) { + if((unsigned) OP1[i] != (unsigned) ext) + return b ^ ((unsigned) OP1[i] < (unsigned) ext); + } + for(int i=M2-1; i >= 0; i--) { + if((unsigned) OP1[i] != (unsigned) OP2[i]) + return b ^ ((unsigned) OP1[i] < (unsigned) OP2[i]); + } + return false; + } + template<> inline bool iv_compare<1,1,true>(const int *op1, const int *op2) { + return op1[0] > op2[0]; + } + template<> inline bool iv_compare<1,1,false>(const int *op1, const int *op2) { + return op1[0] < op2[0]; + } + + template + inline void iv_extend(int *r, int ext) { + for(int i=0; i < N; i++) + r[i] = ext; + } + template<> inline void iv_extend<-2>(int * /*r*/, int /*ext*/) { } + template<> inline void iv_extend<-1>(int * /*r*/, int /*ext*/) { } + template<> inline void iv_extend<0>(int * /*r*/, int /*ext*/) { } + template<> inline void iv_extend<1>(int *r, int ext) { + r[0] = ext; + } + template<> inline void iv_extend<2>(int *r, int ext) { + r[0] = ext; + r[1] = ext; + } + + template + inline void iv_assign_int64(int *r, Slong l) { + r[0] = (int) l; + if(Nr > 1) { + r[1] = (int) (l >> 32); + iv_extend(r+2, (r[1] < 0) ? ~0 : 0); + } + } + template<> inline void iv_assign_int64<1>(int *r, Slong l) { + r[0] = (int) l; + } + template<> inline void iv_assign_int64<2>(int *r, Slong l) { + r[0] = (int) l; + r[1] = (int) (l >> 32); + } + + template + inline void iv_assign_uint64(int *r, Ulong l) { + r[0] = (int) l; + if(Nr > 1) { + r[1] = (int) (l >> 32); + iv_extend(r+2, 0); + } + } + template<> inline void iv_assign_uint64<1>(int *r, Ulong l) { + r[0] = (int) l; + } + template<> inline void iv_assign_uint64<2>(int *r, Ulong l) { + r[0] = (int) l; + r[1] = (int) (l >> 32); + } + + inline Ulong mult_u_u(int a, int b) { + return (Ulong) (unsigned) a * (Ulong) (unsigned) b; + } + inline Slong mult_u_s(int a, int b) { + return (Ulong) (unsigned) a * (Slong) (signed) b; + } + inline Slong mult_s_u(int a, int b) { + return (Slong) (signed) a * (Ulong) (unsigned) b; + } + inline Slong mult_s_s(int a, int b) { + return (Slong) (signed) a * (Slong) (signed) b; + } + inline void accumulate(Ulong a, Ulong &l1, Slong &l2) { + l1 += (Ulong) (unsigned) a; + l2 += a >> 32; + } + inline void accumulate(Slong a, Ulong &l1, Slong &l2) { + l1 += (Ulong) (unsigned) a; + l2 += a >> 32; + } + + template + inline void iv_mult(const int *op1, const int *op2, int *r) { + if(Nr==1) + r[0] = op1[0] * op2[0]; + else if(N1==1 && N2==1) + iv_assign_int64(r, ((Slong) op1[0]) * ((Slong) op2[0])); + else { + const int M1 = AC_MAX(N1,N2); + const int M2 = AC_MIN(N1,N2); + const int *OP1 = N1 >= N2 ? op1 : op2; + const int *OP2 = N1 >= N2 ? 
op2 : op1; + const int T1 = AC_MIN(M2-1,Nr); + const int T2 = AC_MIN(M1-1,Nr); + const int T3 = AC_MIN(M1+M2-2,Nr); + + Ulong l1 = 0; + Slong l2 = 0; + for(int k=0; k < T1; k++) { + for(int i=0; i < k+1; i++) + accumulate(mult_u_u(OP1[k-i], OP2[i]), l1, l2); + l2 += (Ulong) (unsigned) (l1 >> 32); + r[k] = (int) l1; + l1 = (unsigned) l2; + l2 >>= 32; + } + for(int k=T1; k < T2; k++) { + accumulate(mult_u_s(OP1[k-M2+1], OP2[M2-1]), l1, l2); + for(int i=0; i < M2-1; i++) + accumulate(mult_u_u(OP1[k-i], OP2[i]), l1, l2); + l2 += (Ulong) (unsigned) (l1 >> 32); + r[k] = (int) l1; + l1 = (unsigned) l2; + l2 >>= 32; + } + for(int k=T2; k < T3; k++) { + accumulate(mult_u_s(OP1[k-M2+1], OP2[M2-1]), l1, l2); + for(int i=k-T2+1; i < M2-1; i++) + accumulate(mult_u_u(OP1[k-i], OP2[i]), l1, l2); + accumulate(mult_s_u(OP1[M1-1], OP2[k-M1+1]), l1, l2); + l2 += (Ulong) (unsigned) (l1 >> 32); + r[k] = (int) l1; + l1 = (unsigned) l2; + l2 >>= 32; + } + if(Nr >= M1+M2-1) { + accumulate(mult_s_s(OP1[M1-1], OP2[M2-1]), l1, l2); + r[M1+M2-2] = (int) l1; + if(Nr >= M1+M2) { + l2 += (Ulong) (unsigned) (l1 >> 32); + r[M1+M2-1] = (int) l2; + iv_extend(r+M1+M2, (r[M1+M2-1] < 0) ? ~0 : 0); + } + } + } + } + template<> inline void iv_mult<1,1,1>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] * op2[0]; + } + template<> inline void iv_mult<1,1,2>(const int *op1, const int *op2, int *r) { + iv_assign_int64<2>(r, ((Slong) op1[0]) * ((Slong) op2[0])); + } + + template + inline bool iv_uadd_carry(const int *op1, bool carry, int *r) { + Slong l = carry; + for(int i=0; i < N; i++) { + l += (Ulong) (unsigned) op1[i]; + r[i] = (int) l; + l >>= 32; + } + return l != 0; + } + template<> inline bool iv_uadd_carry<0>(const int * /*op1*/, bool carry, int * /*r*/) { return carry; } + template<> inline bool iv_uadd_carry<1>(const int *op1, bool carry, int *r) { + Ulong l = carry + (Ulong) (unsigned) op1[0]; + r[0] = (int) l; + return (l >> 32) & 1; + } + + template + inline bool iv_add_int_carry(const int *op1, int op2, bool carry, int *r) { + if(N==0) + return carry; + if(N==1) { + Ulong l = carry + (Slong) op1[0] + (Slong) op2; + r[0] = (int) l; + return (l >> 32) & 1; + } + Slong l = carry + (Ulong) (unsigned) op1[0] + (Slong) op2; + r[0] = (int) l; + l >>= 32; + for(int i=1; i < N-1; i++) { + l += (Ulong) (unsigned) op1[i]; + r[i] = (int) l; + l >>= 32; + } + l += (Slong) op1[N-1]; + r[N-1] = (int) l; + return (l >> 32) & 1; + } + template<> inline bool iv_add_int_carry<0>(const int * /*op1*/, int /*op2*/, bool carry, int * /*r*/) { return carry; } + template<> inline bool iv_add_int_carry<1>(const int *op1, int op2, bool carry, int *r) { + Ulong l = carry + (Slong) op1[0] + (Slong) op2; + r[0] = (int) l; + return (l >> 32) & 1; + } + + template + inline bool iv_uadd_n(const int *op1, const int *op2, int *r) { + Ulong l = 0; + for(int i=0; i < N; i++) { + l += (Ulong)(unsigned) op1[i] + (Ulong)(unsigned) op2[i]; + r[i] = (int) l; + l >>= 32; + } + return l & 1; + } + template<> inline bool iv_uadd_n<0>(const int * /*op1*/, const int * /*op2*/, int * /*r*/) { return false; } + template<> inline bool iv_uadd_n<1>(const int *op1, const int *op2, int *r) { + Ulong l = (Ulong) (unsigned) op1[0] + (Ulong) (unsigned) op2[0]; + r[0] = (int) l; + return (l >> 32) & 1; + } + template<> inline bool iv_uadd_n<2>(const int *op1, const int *op2, int *r) { + Ulong l = (Ulong) (unsigned) op1[0] + (Ulong) (unsigned) op2[0]; + r[0] = (int) l; + l >>= 32; + l += (Ulong) (unsigned) op1[1] + (Ulong) (unsigned) op2[1]; + r[1] = (int) l; + 
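+    // Note: bit 32 of the running sum l is the carry out of this two-word
+    // unsigned add; callers such as iv_add use it to extend the result.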
return (l >> 32) & 1; + } + + template + inline void iv_add(const int *op1, const int *op2, int *r) { + if(Nr==1) + r[0] = op1[0] + op2[0]; + else { + const int M1 = AC_MAX(N1,N2); + const int M2 = AC_MIN(N1,N2); + const int *OP1 = N1 >= N2 ? op1 : op2; + const int *OP2 = N1 >= N2 ? op2 : op1; + const int T1 = AC_MIN(M2-1,Nr); + const int T2 = AC_MIN(M1,Nr); + + bool carry = iv_uadd_n(OP1, OP2, r); + carry = iv_add_int_carry(OP1+T1, OP2[T1], carry, r+T1); + iv_extend(r+T2, carry ? ~0 : 0); + } + } + template<> inline void iv_add<1,1,1>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] + op2[0]; + } + template<> inline void iv_add<1,1,2>(const int *op1, const int *op2, int *r) { + iv_assign_int64<2>(r, (Slong) op1[0] + (Slong) op2[0]); + } + + template + inline bool iv_sub_int_borrow(const int *op1, int op2, bool borrow, int *r) { + if(N==1) { + Ulong l = (Slong) op1[0] - (Slong) op2 - borrow; + r[0] = (int) l; + return (l >> 32) & 1; + } + Slong l = (Ulong) (unsigned) op1[0] - (Slong) op2 - borrow; + r[0] = (int) l; + l >>= 32; + for(int i=1; i < N-1; i++) { + l += (Ulong) (unsigned) op1[i]; + r[i] = (int) l; + l >>= 32; + } + l += (Slong) op1[N-1]; + r[N-1] = (int) l; + return (l >> 32) & 1; + } + template<> inline bool iv_sub_int_borrow<0>(const int * /*op1*/, int /*op2*/, bool borrow, int * /*r*/) { return borrow; } + template<> inline bool iv_sub_int_borrow<1>(const int *op1, int op2, bool borrow, int *r) { + Ulong l = (Slong) op1[0] - (Slong) op2 - borrow; + r[0] = (int) l; + return (l >> 32) & 1; + } + + template + inline bool iv_sub_int_borrow(int op1, const int *op2, bool borrow, int *r) { + if(N==1) { + Ulong l = (Slong) op1 - (Slong) op2[0] - borrow; + r[0] = (int) l; + return (l >> 32) & 1; + } + Slong l = (Slong) op1 - (Ulong) (unsigned) op2[0] - borrow; + r[0] = (int) l; + l >>= 32; + for(int i=1; i < N-1; i++) { + l -= (Ulong) (unsigned) op2[i]; + r[i] = (int) l; + l >>= 32; + } + l -= (Slong) op2[N-1]; + r[N-1] = (int) l; + return (l >> 32) & 1; + } + template<> inline bool iv_sub_int_borrow<0>(int /*op1*/, const int * /*op2*/, bool borrow, int * /*r*/) { return borrow; } + template<> inline bool iv_sub_int_borrow<1>(int op1, const int *op2, bool borrow, int *r) { + Ulong l = (Slong) op1 - (Slong) op2[0] - borrow; + r[0] = (int) l; + return (l >> 32) & 1; + } + + template + inline bool iv_usub_n(const int *op1, const int *op2, int *r) { + Slong l = 0; + for(int i=0; i < N; i++) { + l += (Ulong)(unsigned) op1[i] - (Ulong)(unsigned) op2[i]; + r[i] = (int) l; + l >>= 32; + } + return l & 1; + } + template<> inline bool iv_usub_n<1>(const int *op1, const int *op2, int *r) { + Ulong l = (Ulong) (unsigned) op1[0] - (Ulong) (unsigned) op2[0]; + r[0] = (int) l; + return (l >> 32) & 1; + } + template<> inline bool iv_usub_n<2>(const int *op1, const int *op2, int *r) { + Slong l = (Ulong) (unsigned) op1[0] - (Ulong) (unsigned) op2[0]; + r[0] = (int) l; + l >>= 32; + l += (Ulong) (unsigned) op1[1] - (Ulong) (unsigned) op2[1]; + r[1] = (int) l; + return (l >> 32) & 1; + } + + template + inline void iv_sub(const int *op1, const int *op2, int *r) { + if(Nr==1) + r[0] = op1[0] - op2[0]; + else { + const int M1 = AC_MAX(N1,N2); + const int M2 = AC_MIN(N1,N2); + const int T1 = AC_MIN(M2-1,Nr); + const int T2 = AC_MIN(M1,Nr); + bool borrow = iv_usub_n(op1, op2, r); + if(N1 > N2) + borrow = iv_sub_int_borrow(op1+T1, op2[T1], borrow, r+T1); + else + borrow = iv_sub_int_borrow(op1[T1], op2+T1, borrow, r+T1); + iv_extend(r+T2, borrow ? 
~0 : 0); + } + } + template<> inline void iv_sub<1,1,1>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] - op2[0]; + } + template<> inline void iv_sub<1,1,2>(const int *op1, const int *op2, int *r) { + iv_assign_int64<2>(r, (Slong) op1[0] - (Slong) op2[0]); + } + + template + inline bool iv_all_bits_same(const int *op, bool bit) { + int t = bit ? ~0 : 0; + for(int i=0; i < N; i++) + if(op[i] != t) + return false; + return true; + } + template<> inline bool iv_all_bits_same<0>(const int * /*op*/, bool /*bit*/) { return true; } + template<> inline bool iv_all_bits_same<1>(const int *op, bool bit) { + return op[0] == (bit ? ~0 : 0); + } + + template + void iv_neg(const int *op1, int *r) { + Slong l = 0; + for(int k = 0; k < AC_MIN(N,Nr); k++) { + l -= (Ulong) (unsigned) op1[k]; + r[k] = (unsigned) l; + l >>= 32; + } + if(Nr > N) { + r[N] = (unsigned) (l - (op1[N-1] < 0 ? ~0 : 0)); + iv_extend(r+N+1, r[N] < 0 ? ~0 : 0); + } + } + + template + void iv_abs(const int *op1, int *r) { + if( S && op1[N-1] < 0) { + iv_neg(op1, r); + } else { + iv_copy(op1, r); + iv_extend(r+N, 0); + } + } + + template + void iv_udiv(const sw2 *n, const sw2 *d, sw2 *q, sw2 *r) { + const int w2_length = 2*w1_length; + int d_msi; // most significant int for d + for(d_msi = D-1; d_msi > 0 && !d[d_msi]; d_msi--) {} + uw4 d1 = 0; + if(!d_msi && !d[0]) { + d1 = n[0]/d[0]; // d is zero => divide by zero + return; + } + int n_msi; // most significant int for n + for(n_msi = N-1; n_msi > 0 && !n[n_msi]; n_msi--) {} + for(int i=0; i < Q; i++) + q[i] = 0; + for(int i=0; i < R; i++) + r[i] = n[i]; + // write most significant "words" into d1 + bool d_mss_odd = (bool) (d[d_msi] >> w1_length); + int d_mss= 2*d_msi + d_mss_odd; // index to most significant short (16-bit) + d1 = (uw4) (uw2) d[d_msi] << (w1_length << (int) !d_mss_odd); + if(d_msi) + d1 |= (uw2) d[d_msi-1] >> (d_mss_odd ? w1_length : 0); + bool n_mss_odd = (bool) (n[n_msi] >> w1_length); + int n_mss = 2*n_msi + n_mss_odd; + if(n_mss < d_mss) { + // q already initialized to 0 + if(R) { + int r_msi = AC_MIN(R-1, n_msi); + for(int j = 0; j <= r_msi; j++) + r[j] = n[j]; + for(int j = r_msi+1; j < R; j++) + r[j] = 0; + } + } else { + uw2 r1[N+1]; + r1[n_msi+1] = 0; + for(int k = n_msi; k >= 0; k--) + r1[k] = n[k]; + for(int k = n_mss; k >=d_mss; k--) { + int k_msi = k >> 1; + bool odd = k & 1; + uw2 r1m1 = k_msi > 0 ? r1[k_msi-1] : (uw2) 0; + uw4 n1 = odd ? + (uw4) ((r1[k_msi+1] << w1_length) | (r1[k_msi] >> w1_length)) << w2_length | ((r1[k_msi] << w1_length) | (r1m1 >> w1_length)) : + (uw4) r1[k_msi] << w2_length | r1m1; + uw2 q1 = n1/d1; + if(q1 >> w1_length) + q1--; + AC_ASSERT(!(q1 >> w1_length), "Problem detected in long division algorithm, Please report"); + unsigned k2 = k - d_mss; + unsigned k2_i = k2 >> 1; + bool odd_2 = k2 & 1; + uw2 q2 = q1 << (odd_2 ? w1_length : 0); + sw4 l = 0; + for(int j = 0; j <= d_msi; j++) { + l += r1[k2_i + j]; + bool l_sign = l < 0; + sw4 prod = (uw4) (uw2) d[j] * (uw4) q2; + l -= prod; + bool ov1 = (l >= 0) & ((prod < 0) | l_sign); + bool ov2 = (l < 0) & (prod < 0) & l_sign; + r1[k2_i + j] = (uw2) l; + l >>= w2_length; + if(ov1) + l |= ((uw4) -1 << w2_length); + if(ov2) + l ^= ((sw4) 1 << w2_length); + } + if(odd_2 | d_mss_odd) { + l += r1[k2_i + d_msi + 1]; + r1[k2_i + d_msi + 1] = (uw2) l; + } + if(l < 0) { + l = 0; + for(int j = 0; j <= d_msi; j++) { + l += (sw4) (uw2) d[j] << (odd_2 ? 
w1_length : 0); + l += r1[k2_i + j]; + r1[k2_i + j] = (uw2) l; + l >>= w2_length; + } + if(odd_2 | d_mss_odd) + r1[k2_i + d_msi + 1] += (uw2) l; + q1--; + } + if(Q && k2_i < Q) { + if(odd_2) + q[k2_i] = q1 << w1_length; + else + q[k2_i] |= q1; + } + } + if(R) { + int r_msi = AC_MIN(R-1, n_msi); + for(int j = 0; j <= r_msi; j++) + r[j] = r1[j]; + for(int j = r_msi+1; j < R; j++) + r[j] = 0; + } + } + } + + template + inline void iv_div(const int *op1, const int *op2, int *r) { + enum { N1_over = N1+(Den_s && (Num_s==2)) }; + if(N1_over==1 && N2==1) { + r[0] = op1[0] / op2[0]; + iv_extend(r+1, ((Num_s || Den_s) && (r[0] < 0)) ? ~0 : 0); + } + else if(N1_over==1 && N2==2) + iv_assign_int64(r, ( (Slong) op1[0]) / (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); + else if(N1_over==2 && N2==1) + if(N1 == 1) + iv_assign_int64(r, ( (Slong) op1[0]) / ( (Slong) op2[0]) ); + else + iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) / ( (Slong) op2[0]) ); + else if(N1_over==2 && N2==2) + if(N1 == 1) + iv_assign_int64(r, ( (Slong) op1[0]) / (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); + else + iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) / (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); + else if(!Num_s && !Den_s) { + iv_udiv(op1, op2, r, 0); + } + else { + enum { N1_neg = N1+(Num_s==2), N2_neg = N2+(Den_s==2)}; + int numerator[N1_neg]; + int denominator[N2_neg]; + int quotient[N1_neg]; + iv_abs(op1, numerator); + iv_abs(op2, denominator); + iv_udiv(numerator, denominator, quotient, 0); + if( (Num_s && op1[N1-1] < 0) ^ (Den_s && op2[N2-1] < 0) ) + iv_neg(quotient, r); + else { + iv_copy(quotient, r); + iv_extend(r+N1_neg, (Num_s || Den_s) && r[N1_neg-1] < 0 ? ~0 : 0); + } + } + } + + template + inline void iv_rem(const int *op1, const int *op2, int *r) { + enum { N1_over = N1+(Den_s && (Num_s==2)) }; // N1_over corresponds to the division + if(N1_over==1 && N2==1) { + r[0] = op1[0] % op2[0]; + iv_extend(r+1, Num_s && r[0] < 0 ? ~0 : 0); + } + else if(N1_over==1 && N2==2) + iv_assign_int64(r, ( (Slong) op1[0]) % (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); + else if(N1_over==2 && N2==1) + if(N1 == 1) + iv_assign_int64(r, ( (Slong) op1[0]) % ( (Slong) op2[0]) ); + else + iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) % ( (Slong) op2[0]) ); + else if(N1_over==2 && N2==2) + if(N1 == 1) + iv_assign_int64(r, ( (Slong) op1[0]) % (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); + else + iv_assign_int64(r, (((Slong) op1[1] << 32) | (unsigned) op1[0]) % (((Slong) op2[1] << 32) | (unsigned) op2[0]) ); + else if(!Num_s && !Den_s) { + iv_udiv(op1, op2, 0, r); + } + else { + enum { N1_neg = N1+(Num_s==2), N2_neg = N2+(Den_s==2)}; + int numerator[N1_neg]; + int denominator[N2_neg]; + int remainder[N2]; + iv_abs(op1, numerator); + iv_abs(op2, denominator); + iv_udiv(numerator, denominator, 0, remainder); + if( (Num_s && op1[N1-1] < 0) ) + iv_neg(remainder, r); + else { + iv_copy(remainder, r); + iv_extend(r+N2, Num_s && r[N2-1] < 0 ? ~0 : 0); + } + } + } + + template + inline void iv_bitwise_complement_n(const int *op, int *r) { + for(int i=0; i < N; i++) + r[i] = ~op[i]; + } + template<> inline void iv_bitwise_complement_n<1>(const int *op, int *r) { + r[0] = ~op[0]; + } + template<> inline void iv_bitwise_complement_n<2>(const int *op, int *r) { + r[0] = ~op[0]; + r[1] = ~op[1]; + } + + template + inline void iv_bitwise_complement(const int *op, int *r) { + const int M = AC_MIN(N,Nr); + iv_bitwise_complement_n(op, r); + iv_extend(r+M, (r[M-1] < 0) ? 
~0 : 0); + } + + template + inline void iv_bitwise_and_n(const int *op1, const int *op2, int *r) { + for(int i=0; i < N; i++) + r[i] = op1[i] & op2[i]; + } + template<> inline void iv_bitwise_and_n<1>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] & op2[0]; + } + template<> inline void iv_bitwise_and_n<2>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] & op2[0]; + r[1] = op1[1] & op2[1]; + } + + template + inline void iv_bitwise_and(const int *op1, const int *op2, int *r) { + const int M1 = AC_MIN(AC_MAX(N1,N2), Nr); + const int M2 = AC_MIN(AC_MIN(N1,N2), Nr); + const int *OP1 = N1 > N2 ? op1 : op2; + const int *OP2 = N1 > N2 ? op2 : op1; + + iv_bitwise_and_n(op1, op2, r); + if(OP2[M2-1] < 0) + iv_copy(OP1+M2, r+M2); + else + iv_extend(r+M2, 0); + iv_extend(r+M1, (r[M1-1] < 0) ? ~0 : 0); + } + + template + inline void iv_bitwise_or_n(const int *op1, const int *op2, int *r) { + for(int i=0; i < N; i++) + r[i] = op1[i] | op2[i]; + } + template<> inline void iv_bitwise_or_n<1>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] | op2[0]; + } + template<> inline void iv_bitwise_or_n<2>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] | op2[0]; + r[1] = op1[1] | op2[1]; + } + + template + inline void iv_bitwise_or(const int *op1, const int *op2, int *r) { + const int M1 = AC_MIN(AC_MAX(N1,N2), Nr); + const int M2 = AC_MIN(AC_MIN(N1,N2), Nr); + const int *OP1 = N1 >= N2 ? op1 : op2; + const int *OP2 = N1 >= N2 ? op2 : op1; + + iv_bitwise_or_n(op1, op2, r); + if(OP2[M2-1] < 0) + iv_extend(r+M2, ~0); + else + iv_copy(OP1+M2, r+M2); + iv_extend(r+M1, (r[M1-1] < 0) ? ~0 : 0); + } + + template + inline void iv_bitwise_xor_n(const int *op1, const int *op2, int *r) { + for(int i=0; i < N; i++) + r[i] = op1[i] ^ op2[i]; + } + template<> inline void iv_bitwise_xor_n<1>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] ^ op2[0]; + } + template<> inline void iv_bitwise_xor_n<2>(const int *op1, const int *op2, int *r) { + r[0] = op1[0] ^ op2[0]; + r[1] = op1[1] ^ op2[1]; + } + + template + inline void iv_bitwise_xor(const int *op1, const int *op2, int *r) { + const int M1 = AC_MIN(AC_MAX(N1,N2), Nr); + const int M2 = AC_MIN(AC_MIN(N1,N2), Nr); + const int *OP1 = N1 >= N2 ? op1 : op2; + const int *OP2 = N1 >= N2 ? op2 : op1; + + iv_bitwise_xor_n(op1, op2, r); + if(OP2[M2-1] < 0) + iv_bitwise_complement_n(OP1+M2, r+M2); + else + iv_copy(OP1+M2, r+M2); + iv_extend(r+M1, (r[M1-1] < 0) ? ~0 : 0); + } + + template + inline void iv_shift_l(const int *op1, unsigned op2, int *r) { + AC_ASSERT(Nr <= N, "iv_shift_l, incorrect usage Nr > N"); + unsigned s31 = op2 & 31; + unsigned ishift = (op2 >> 5) > Nr ? Nr : (op2 >> 5); + if(s31 && ishift!=Nr) { + unsigned lw = 0; + for(unsigned i=0; i < Nr; i++) { + unsigned hw = (i >= ishift) ? op1[i-ishift] : 0; + r[i] = (hw << s31) | (lw >> (32-s31)); + lw = hw; + } + } else { + for(unsigned i=0; i < Nr ; i++) + r[i] = (i >= ishift) ? op1[i-ishift] : 0; + } + } + + template + inline void iv_shift_r(const int *op1, unsigned op2, int *r) { + unsigned s31 = op2 & 31; + unsigned ishift = (op2 >> 5) > N ? N : (op2 >> 5); + int ext = op1[N-1] < 0 ? ~0 : 0; + if(s31 && ishift!=N) { + unsigned lw = (ishift < N) ? op1[ishift] : ext; + for(unsigned i=0; i < Nr; i++) { + unsigned hw = (i+ishift+1 < N) ? op1[i+ishift+1] : ext; + r[i] = (lw >> s31) | (hw << (32-s31)); + lw = hw; + } + } else { + for(unsigned i=0; i < Nr ; i++) + r[i] = (i+ishift < N) ? 
op1[i+ishift] : ext; + } + } + + template + inline void iv_shift_l2(const int *op1, signed op2, int *r) { + if(S && op2 < 0) + iv_shift_r(op1, -op2, r); + else + iv_shift_l(op1, op2, r); + } + + template<> inline void iv_shift_l2<1,1,false>(const int *op1, signed op2, int *r) { + r[0] = (op2 < 32) ? ( (unsigned) op1[0] << op2) : 0; + } + template<> inline void iv_shift_l2<1,1,true>(const int *op1, signed op2, int *r) { + r[0] = (op2 >= 0) ? + (op2 < 32) ? ( (unsigned) op1[0] << op2) : 0 : + (op2 > -32) ? (op1[0] >> -op2) : (op1[0] >> 31); + } + + template + inline void iv_shift_r2(const int *op1, signed op2, int *r) { + if(S && op2 < 0) + iv_shift_l(op1, -op2, r); + else + iv_shift_r(op1, op2, r); + } + + template<> inline void iv_shift_r2<1,1,false>(const int *op1, signed op2, int *r) { + r[0] = (op2 < 32) ? (op1[0] >> op2) : (op1[0] >> 31); + } + template<> inline void iv_shift_r2<1,1,true>(const int *op1, signed op2, int *r) { + r[0] = (op2 >= 0) ? + (op2 < 32) ? (op1[0] >> op2) : (op1[0] >> 31) : + (op2 > -32) ? ( (unsigned) op1[0] << -op2) : 0; + } + + template + inline void iv_const_shift_l(const int *op1, int *r) { + // B >= 0 + if(!B) { + const int M1 = AC_MIN(N,Nr); + iv_copy(op1, r); + iv_extend(r+M1, r[M1-1] < 0 ? -1 : 0); + } + else { + const unsigned s31 = B & 31; + const int ishift = (((B >> 5) > Nr) ? Nr : (B >> 5)); + iv_extend(r, 0); + const int M1 = AC_MIN(N+ishift,Nr); + if(s31) { + unsigned lw = 0; + for(int i=ishift; i < M1; i++) { + unsigned hw = op1[i-ishift]; + r[i] = (hw << s31) | (lw >> ((32-s31)&31)); // &31 is to quiet compilers + lw = hw; + } + if(Nr > M1) { + r[M1] = (signed) lw >> ((32-s31)&31); // &31 is to quiet compilers + iv_extend(r+M1+1, r[M1] < 0 ? ~0 : 0); + } + } else { + for(int i=ishift; i < M1 ; i++) + r[i] = op1[i-ishift]; + iv_extend(r+M1, r[M1-1] < 0 ? -1 : 0); + } + } + } + template<> inline void iv_const_shift_l<1,1,0>(const int *op1, int *r) { + r[0] = op1[0]; + } + template<> inline void iv_const_shift_l<2,1,0>(const int *op1, int *r) { + r[0] = op1[0]; + } + + template + inline void iv_const_shift_r(const int *op1, int *r) { + if(!B) { + const int M1 = AC_MIN(N,Nr); + iv_copy(op1, r); + iv_extend(r+M1, r[M1-1] < 0 ? ~0 : 0); + } + else { + const unsigned s31 = B & 31; + const int ishift = (((B >> 5) > N) ? N : (B >> 5)); + int ext = op1[N-1] < 0 ? ~0 : 0; + if(s31 && ishift!=N) { + unsigned lw = (ishift < N) ? op1[ishift] : ext; + for(int i=0; i < Nr; i++) { + unsigned hw = (i+ishift+1 < N) ? op1[i+ishift+1] : ext; + r[i] = (lw >> s31) | (hw << ((32-s31)&31)); // &31 is to quiet compilers + lw = hw; + } + } else { + for(int i=0; i < Nr ; i++) + r[i] = (i+ishift < N) ? op1[i+ishift] : ext; + } + } + } + template<> inline void iv_const_shift_r<1,1,0>(const int *op1, int *r) { + r[0] = op1[0]; + } + template<> inline void iv_const_shift_r<2,1,0>(const int *op1, int *r) { + r[0] = op1[0]; + } + + template + inline void iv_conv_from_fraction(double d, int *r, bool *qb, bool *rbits, bool *o) { + bool b = d < 0; + double d2 = b ? -d : d; + double dfloor = mgc_floor(d2); + *o = dfloor != 0.0; + d2 = d2 - dfloor; + for(int i=N-1; i >=0; i--) { + d2 *= (Ulong) 1 << 32; + unsigned k = (unsigned int) d2; + r[i] = b ? 
~k : k; + d2 -= k; + } + d2 *= 2; + bool k = ((int) d2) != 0; // is 0 or 1 + d2 -= k; + *rbits = d2 != 0.0; + *qb = (b && *rbits) ^ k; + if(b && !*rbits && !*qb) + iv_uadd_carry(r, true, r); + *o |= b ^ (r[N-1] < 0); + } + + template + inline int to_str(int *v, int w, bool left_just, char *r) { + const char digits[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'}; + const unsigned char B = b==AC_BIN ? 1 : (b==AC_OCT ? 3 : (b==AC_HEX ? 4 : 0)); + int k = (w+B-1)/B; + int n = (w+31) >> 5; + int bits = 0; + if(b != AC_BIN && left_just) { + if( (bits = -(w % B)) ) + r[--k] = 0; + } + for(int i = 0; i < n; i++) { + if (b != AC_BIN && bits < 0) + r[k] += (unsigned char) (( (unsigned) v[i] << (B+bits)) & (b-1)); + unsigned int m = (unsigned) v[i] >> -bits; + for(bits += 32; bits > 0 && k; bits -= B) { + r[--k] = (char) (m & (b-1)); + m >>= B; + } + } + for(int i=0; i < (w+B-1)/B; i++) + r[i] = digits[(int)r[i]]; + return (w+B-1)/B; + } + template<> inline int to_str(int *v, int w, bool left_just, char *r) { + int k = 0; + int msw = (w-1) >> 5; + if(left_just) { + unsigned bits_msw = w & 31; + if(bits_msw) { + unsigned left_shift = 32 - bits_msw; + for(int i=msw; i > 0; i--) + v[i] = (unsigned) v[i] << left_shift | (unsigned) v[i-1] >> bits_msw; + v[0] = (unsigned) v[0] << left_shift; + } + int lsw = 0; + while(lsw < msw || v[msw] ) { + Ulong l = 0; + for(int i=lsw; i <= msw; i++) { + l += (Ulong) (unsigned) v[i] * 10; + v[i] = l; + l >>= 32; + if(i==lsw && !v[i]) + lsw++; + } + r[k++] = (char) ('0' + (int) l); + } + } else { + const unsigned d = 1000000000; // 10E9 + for(; msw > 0 && !v[msw]; msw--) {} + while(msw >= 0) { + Ulong nl = 0; + for(int i = msw; i >= 0; i--) { + nl <<= 32; + nl |= (unsigned) v[i]; + unsigned q = nl/d; + nl -= (Ulong) q * d; + v[i] = q; + } + if(!v[msw]) + msw--; + bool last = msw == -1; + unsigned rem = (unsigned) nl; + for(int i=0; (i < 9 && !last) || rem; i++) { + r[k++] = (char) ('0' + (int) (rem % 10)); + rem /= 10; + } + } + for(int i=0; i < k/2; i++) { + char c = r[i]; + r[i] = r[k-1-i]; + r[k-1-i] = c; + } + } + r[k] = 0; + return k; + } + + inline int to_string(int *v, int w, bool sign_mag, ac_base_mode base, bool left_just, char *r) { + int n = (w+31) >> 5; + bool neg = !sign_mag && v[n-1] < 0; + if(!left_just) { + while(n-- && v[n] == (neg ? ~0 : 0)) {} + int w2 = 32*(n+1); + if(w2) { + int m = v[n]; + for(int i = 16; i > 0; i >>= 1) { + if((m >> i) == (neg ? ~0 : 0)) + w2 -= i; + else + m >>= i; + } + } + if(w2 < w) + w = w2; + w += !sign_mag; + } + if(base == AC_DEC) + return to_str(v, w, left_just, r); + else if (base == AC_HEX) + return to_str(v, w, left_just, r); + else if (base == AC_OCT) + return to_str(v, w, left_just, r); + else if (base == AC_BIN) + return to_str(v, w, left_just, r); + return 0; + } + + template + inline unsigned iv_leading_bits(const int *op, bool bit); + + template<> inline unsigned iv_leading_bits<1>(const int *op, bool bit) { + const unsigned char tab[] = {4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0}; + unsigned t = bit ? ~*op : *op; + unsigned cnt = 0; + if(t >> 16) + t >>= 16; + else + cnt += 16; + if(t >> 8) + t >>= 8; + else + cnt += 8; + if(t >> 4) + t >>= 4; + else + cnt += 4; + cnt += tab[t]; + return cnt; + } + + template + inline unsigned iv_leading_bits(const int *op, bool bit) { + int ext_sign = bit ? -1 : 0; + int k; + for(k = N-1; k >= 0 && op[k] == ext_sign; k--) {} + return 32*(N-1-k) + (k < 0 ? 
0 : iv_leading_bits<1>(op+k, bit)); + } + + ////////////////////////////////////////////////////////////////////////////// + // Integer Vector class: iv + ////////////////////////////////////////////////////////////////////////////// + template + class iv { + protected: + int v[N]; + public: + template friend class iv; + iv() {} + template + iv ( const iv &b ) { + const int M = AC_MIN(N,N2); + iv_copy(b.v, v); + iv_extend(v+M, (v[M-1] < 0) ? ~0 : 0); + } + iv ( Slong t) { + iv_assign_int64(v, t); + } + iv ( Ulong t) { + iv_assign_uint64(v, t); + } + iv ( int t) { + v[0] = t; + iv_extend(v+1, (t < 0) ? ~0 : 0); + } + iv ( unsigned int t) { + v[0] = t; + iv_extend(v+1, 0); + } + iv ( long t) { + if(long_w == 32) { + v[0] = t; + iv_extend(v+1, (t < 0) ? ~0 : 0); + } else + iv_assign_int64(v, t); + } + iv ( unsigned long t) { + if(long_w == 32) { + v[0] = t; + iv_extend(v+1, 0); + } else + iv_assign_uint64(v, t); + } + iv ( double d ) { + double d2 = ldexpr32<-N>(d); + bool qb, rbits, o; + iv_conv_from_fraction(d2, v, &qb, &rbits, &o); + } + + // Explicit conversion functions to C built-in types ------------- + inline Slong to_int64() const { return N==1 ? v[0] : ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; } + inline Ulong to_uint64() const { return N==1 ? (Ulong) v[0] : ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; } + inline double to_double() const { + double a = v[N-1]; + for(int i=N-2; i >= 0; i--) { + a *= (Ulong) 1 << 32; + a += (unsigned) v[i]; + } + return a; + } + inline void conv_from_fraction(double d, bool *qb, bool *rbits, bool *o) { + iv_conv_from_fraction(d, v, qb, rbits, o); + } + + template + inline void mult(const iv &op2, iv &r) const { + iv_mult(v, op2.v, r.v); + } + template + void add(const iv &op2, iv &r) const { + iv_add(v, op2.v, r.v); + } + template + void sub(const iv &op2, iv &r) const { + iv_sub(v, op2.v, r.v); + } + template + void div(const iv &op2, iv &r) const { + iv_div(v, op2.v, r.v); + } + template + void rem(const iv &op2, iv &r) const { + iv_rem(v, op2.v, r.v); + } + void increment() { + iv_uadd_carry(v, true, v); + } + void decrement() { + iv_sub_int_borrow(v, 0, true, v); + } + template + void neg(iv &r) const { + iv_neg(v, r.v); + } + template + void shift_l(unsigned op2, iv &r) const { + iv_shift_l(v, op2, r.v); + } + template + void shift_l2(signed op2, iv &r) const { + iv_shift_l2(v, op2, r.v); + } + template + void shift_r(unsigned op2, iv &r) const { + iv_shift_r(v, op2, r.v); + } + template + void shift_r2(signed op2, iv &r) const { + iv_shift_r2(v, op2, r.v); + } + template + void const_shift_l(iv &r) const { + iv_const_shift_l(v, r.v); + } + template + void const_shift_r(iv &r) const { + iv_const_shift_r(v, r.v); + } + template + void bitwise_complement(iv &r) const { + iv_bitwise_complement(v, r.v); + } + template + void bitwise_and(const iv &op2, iv &r) const { + iv_bitwise_and(v, op2.v, r.v); + } + template + void bitwise_or(const iv &op2, iv &r) const { + iv_bitwise_or(v, op2.v, r.v); + } + template + void bitwise_xor(const iv &op2, iv &r) const { + iv_bitwise_xor(v, op2.v, r.v); + } + template + bool equal(const iv &op2) const { + return iv_equal(v, op2.v); + } + template + bool greater_than(const iv &op2) const { + return iv_compare(v, op2.v); + } + template + bool less_than(const iv &op2) const { + return iv_compare(v, op2.v); + } + bool equal_zero() const { + return iv_equal_zero(v); + } + template + void set_slc(unsigned lsb, int WS, const iv &op2) { + AC_ASSERT((31+WS)/32 == N2, "Bad usage: WS greater than length of slice"); + 
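    // Annotation (added, not part of the vendored source): the slice write
    // below is a masked read-modify-write. It first locates the 32-bit words
    // spanned by the WS-bit slice starting at lsb (lsb_v..msb_v, with bit
    // offsets lsb_b/msb_b), then updates each word with the idiom
    //   v ^= (v ^ src) & mask
    // so that only bits inside the slice change; the partial words at either
    // end of the slice get end-specific masks.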
unsigned msb = lsb+WS-1; + unsigned lsb_v = lsb >> 5; + unsigned lsb_b = lsb & 31; + unsigned msb_v = msb >> 5; + unsigned msb_b = msb & 31; + if(N2==1) { + if(msb_v == lsb_v) + v[lsb_v] ^= (v[lsb_v] ^ ((unsigned) op2.v[0] << lsb_b)) & (~(WS==32 ? 0 : all_ones<> 1) >> (31-lsb_b)); + v[msb_v] ^= (v[msb_v] ^ m) & ~((all_ones<<1)<> 1) >> (31-lsb_b)); + unsigned t = ((unsigned) op2.v[N2-1] << lsb_b) | (((unsigned) op2.v[N2-2] >> 1) >> (31-lsb_b)); + unsigned m; + if(msb_v-lsb_v == N2) { + v[msb_v-1] = t; + m = (((unsigned) op2.v[N2-1] >> 1) >> (31-lsb_b)); + } + else + m = t; + v[msb_v] ^= (v[msb_v] ^ m) & ~((all_ones<<1)<(v, bit); + } + }; + + template<> inline Slong iv<1>::to_int64() const { return v[0]; } + template<> inline Ulong iv<1>::to_uint64() const { return v[0]; } + + template<> inline Slong iv<2>::to_int64() const { + return ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; + } + template<> inline Ulong iv<2>::to_uint64() const { + return ((Ulong)v[1] << 32) | (Ulong) (unsigned) v[0]; + } + + template<> template<> inline void iv<1>::set_slc(unsigned lsb, int WS, const iv<1> &op2) { + v[0] ^= (v[0] ^ ((unsigned) op2.v[0] << lsb)) & (~(WS==32 ? 0 : all_ones< template<> inline void iv<2>::set_slc(unsigned lsb, int WS, const iv<1> &op2) { + Ulong l = to_uint64(); + Ulong l2 = op2.to_uint64(); + l ^= (l ^ (l2 << lsb)) & (~((~(Ulong)0)< template<> inline void iv<2>::set_slc(unsigned lsb, int WS, const iv<2> &op2) { + Ulong l = to_uint64(); + Ulong l2 = op2.to_uint64(); + l ^= (l ^ (l2 << lsb)) & (~(WS==64 ? (Ulong) 0 : ~(Ulong)0< + class iv_conv : public iv { + protected: + iv_conv() {} + template iv_conv(const T& t) : iv(t) {} + }; + + template + class iv_conv : public iv { + public: + operator Ulong () const { return iv::to_uint64(); } + protected: + iv_conv() {} + template iv_conv(const T& t) : iv(t) {} + }; + + template + class iv_conv : public iv { + public: + operator Slong () const { return iv::to_int64(); } + protected: + iv_conv() {} + template iv_conv(const T& t) : iv(t) {} + }; + + // Set default to promote to int as this is the case for almost all types + // create exceptions using specializations + template + struct c_prom { + typedef int promoted_type; + }; + template<> struct c_prom { + typedef unsigned promoted_type; + }; + template<> struct c_prom { + typedef long promoted_type; + }; + template<> struct c_prom { + typedef unsigned long promoted_type; + }; + template<> struct c_prom { + typedef Slong promoted_type; + }; + template<> struct c_prom { + typedef Ulong promoted_type; + }; + template<> struct c_prom { + typedef float promoted_type; + }; + template<> struct c_prom { + typedef double promoted_type; + }; + + template + struct c_arith { + // will error out for pairs of T and T2 that are not defined through specialization + }; + template struct c_arith { + typedef T arith_conv; + }; + + #define C_ARITH(C_TYPE1, C_TYPE2) \ + template<> struct c_arith { \ + typedef C_TYPE1 arith_conv; \ + }; \ + template<> struct c_arith { \ + typedef C_TYPE1 arith_conv; \ + }; + + C_ARITH(double, float) + C_ARITH(double, int) + C_ARITH(double, unsigned) + C_ARITH(double, long) + C_ARITH(double, unsigned long) + C_ARITH(double, Slong) + C_ARITH(double, Ulong) + C_ARITH(float, int) + C_ARITH(float, unsigned) + C_ARITH(float, long) + C_ARITH(float, unsigned long) + C_ARITH(float, Slong) + C_ARITH(float, Ulong) + + C_ARITH(Slong, int) + C_ARITH(Slong, unsigned) + C_ARITH(Ulong, int) + C_ARITH(Ulong, unsigned) + + template + struct map { + typedef T t; + }; + template + struct c_type_params 
{ + // will error out for T for which this template struct is not specialized + }; + + template inline const char *c_type_name() { return "unknown"; } + template<> inline const char *c_type_name() { return "bool";} + template<> inline const char *c_type_name() { return "char";} + template<> inline const char *c_type_name() { return "signed char";} + template<> inline const char *c_type_name() { return "unsigned char";} + template<> inline const char *c_type_name() { return "signed short";} + template<> inline const char *c_type_name() { return "unsigned short";} + template<> inline const char *c_type_name() { return "int";} + template<> inline const char *c_type_name() { return "unsigned";} + template<> inline const char *c_type_name() { return "signed long";} + template<> inline const char *c_type_name() { return "unsigned long";} + template<> inline const char *c_type_name() { return "signed long long";} + template<> inline const char *c_type_name() { return "unsigned long long";} + template<> inline const char *c_type_name() { return "float";} + template<> inline const char *c_type_name() { return "double";} + + template struct c_type; + + template + struct rt_c_type_T { + template + struct op1 { + typedef typename T::template rt_T< c_type >::mult mult; + typedef typename T::template rt_T< c_type >::plus plus; + typedef typename T::template rt_T< c_type >::minus2 minus; + typedef typename T::template rt_T< c_type >::minus minus2; + typedef typename T::template rt_T< c_type >::logic logic; + typedef typename T::template rt_T< c_type >::div2 div; + typedef typename T::template rt_T< c_type >::div div2; + }; + }; + template + struct c_type { + typedef typename c_prom::promoted_type c_prom_T; + struct rt_unary { + typedef c_prom_T neg; + typedef c_prom_T mag_sqr; + typedef c_prom_T mag; + template + struct set { + typedef c_prom_T sum; + }; + }; + template + struct rt_T { + typedef typename rt_c_type_T::template op1::mult mult; + typedef typename rt_c_type_T::template op1::plus plus; + typedef typename rt_c_type_T::template op1::minus minus; + typedef typename rt_c_type_T::template op1::minus2 minus2; + typedef typename rt_c_type_T::template op1::logic logic; + typedef typename rt_c_type_T::template op1::div div; + typedef typename rt_c_type_T::template op1::div2 div2; + }; + inline static std::string type_name() { + std::string r = c_type_name(); + return r; + } + + }; + // with T == c_type + template + struct rt_c_type_T< c_type > { + typedef typename c_prom::promoted_type c_prom_T; + template + struct op1 { + typedef typename c_prom::promoted_type c_prom_T2; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv mult; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv plus; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv minus; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv minus2; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv logic; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv div; + typedef typename c_arith< c_prom_T, c_prom_T2 >::arith_conv div2; + }; + }; + + #define C_TYPE_MAP(C_TYPE) \ + template<> struct map { \ + typedef c_type t; \ + }; + + #define C_TYPE_PARAMS(C_TYPE, WI, SI) \ + template<> struct c_type_params { \ + enum { W = WI, I = WI, E = 0, S = SI, floating_point = 0 }; \ + }; + + #define C_TYPE_MAP_INT(C_TYPE, WI, SI) \ + C_TYPE_MAP(C_TYPE) \ + C_TYPE_PARAMS(C_TYPE, WI, SI) + + #define C_TYPE_MAP_FLOAT(C_TYPE, FP, WFP, IFP, EFP) \ + C_TYPE_MAP(C_TYPE) \ + template<> struct c_type_params { \ 
+ enum { W = WFP, I = IFP, E = EFP, S = true, floating_point = FP }; \ + }; + + C_TYPE_MAP_INT(bool, 1, false) + C_TYPE_MAP_INT(char, 8, true) + C_TYPE_MAP_INT(signed char, 8, true) + C_TYPE_MAP_INT(unsigned char, 8, false) + C_TYPE_MAP_INT(signed short, 16, true) + C_TYPE_MAP_INT(unsigned short, 16, false) + C_TYPE_MAP_INT(signed int, 32, true) + C_TYPE_MAP_INT(unsigned int, 32, false) + C_TYPE_MAP_INT(signed long, ac_private::long_w, true) + C_TYPE_MAP_INT(unsigned long, ac_private::long_w, false) + C_TYPE_MAP_INT(signed long long, 64, true) + C_TYPE_MAP_INT(unsigned long long, 64, false) + C_TYPE_MAP_FLOAT(float, 1, 25, 1, 8) + C_TYPE_MAP_FLOAT(double, 2, 54, 1, 11) + + #undef C_TYPE_INT + #undef C_TYPE_PARAMS + #undef C_TYPE_FLOAT + #undef C_TYPE_MAP + + // specializations for following struct declared/defined after definition of ac_int + template + struct rt_ac_int_T { + template + struct op1 { + typedef typename T::template rt_T< ac_int >::mult mult; + typedef typename T::template rt_T< ac_int >::plus plus; + typedef typename T::template rt_T< ac_int >::minus2 minus; + typedef typename T::template rt_T< ac_int >::minus minus2; + typedef typename T::template rt_T< ac_int >::logic logic; + typedef typename T::template rt_T< ac_int >::div2 div; + typedef typename T::template rt_T< ac_int >::div div2; + }; + }; +} + +namespace ac { + // compiler time constant for log2 like functions + template + struct nbits { + enum { val = X ? ac_private::s_N<16>::s_X::nbits : 1 }; + }; + + template + struct log2_floor { + enum { val = nbits::val - 1 }; + }; + + // log2 of 0 is not defined: generate compiler error + template<> struct log2_floor<0> {}; + + template + struct log2_ceil { + enum { lf = log2_floor::val, val = (X == (1 << lf) ? lf : lf+1) }; + }; + + // log2 of 0 is not defined: generate compiler error + template<> struct log2_ceil<0> {}; + + template + struct int_range { + enum { l_s = (LowerBound < 0), u_s = (UpperBound < 0), + signedness = l_s || u_s, + l_nbits = nbits::val, + u_nbits = nbits::val, + nbits = AC_MAX(l_nbits, u_nbits + (!u_s && signedness)) + }; + typedef ac_int type; + }; + + template + class sliceref { +# if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) +# pragma builtin +# endif + int *d_iv; + template friend class sliceref; + public: + sliceref( int *iv ) : d_iv(iv) {} + + inline const sliceref operator = ( const sliceref &val ) { + return operator=(val); + } + + template + inline const sliceref operator = ( const sliceref &val ) { + const int src_lsi = P2/32; + const int src_msi = (P2+W-1)/32; + const int trg_lsi = P/32; + const int trg_msi = (P+W-1)/32; + const int trg_lsb = P&31; + const int trg_msb = (P+W-1)&31; + const int N = src_msi-src_lsi+1; + const int Nr = trg_msi-trg_lsi+1; + const int rshift = (P2&31) - (P&31); + int shifted_src[Nr]; + int *aligned_src = val.d_iv+src_lsi; + if(rshift) { + if(rshift < 0) + ac_private::iv_shift_l(aligned_src, -rshift, shifted_src); + else + ac_private::iv_shift_r(aligned_src, rshift, shifted_src); + aligned_src = shifted_src; + } + unsigned mask_lsi = ac_private::all_ones << trg_lsb; + unsigned mask_msi = ac_private::all_ones >> (31-trg_msb); + if(Nr==1) + mask_lsi &= mask_msi; + int *v = d_iv+trg_lsi; + v[0] ^= (v[0] ^ ((unsigned) aligned_src[0])) & mask_lsi; + for(int k=1; k < Nr-1; k++) + v[k] = aligned_src[k]; + if(Nr > 1) + v[Nr-1] ^= (v[Nr-1] ^ ((unsigned) aligned_src[Nr-1])) & mask_msi; + if(Is_MSB) { + const unsigned rem = 31-trg_msb; + if(rem) { + v[Nr-1] = S ? 
((signed) ((unsigned) v[Nr-1] << rem) >> rem) + : ((unsigned) v[Nr-1] << rem) >> rem; + } else if(!S) { + v[Nr] = 0; + } + } + return *this; + } + }; +} + +enum ac_q_mode { AC_TRN, AC_RND, AC_TRN_ZERO, AC_RND_ZERO, AC_RND_INF, AC_RND_MIN_INF, AC_RND_CONV, AC_RND_CONV_ODD }; +enum ac_o_mode { AC_WRAP, AC_SAT, AC_SAT_ZERO, AC_SAT_SYM }; +template class ac_fixed; + +////////////////////////////////////////////////////////////////////////////// +// Arbitrary-Length Integer: ac_int +////////////////////////////////////////////////////////////////////////////// + +template +class ac_int : public ac_private::iv_conv<(W+31+!S)/32, S, W<=64> +#ifndef __SYNTHESIS__ +__AC_INT_UTILITY_BASE +#endif +{ +#if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) +#pragma builtin +#endif + + enum {N=(W+31+!S)/32}; + typedef ac_private::iv_conv ConvBase; + typedef ac_private::iv Base; + + inline void bit_adjust() { + const unsigned rem = (32-W)&31; + Base::v[N-1] = S ? ((signed) ((unsigned) Base::v[N-1] << rem) >> rem) : (rem ? + ((unsigned) Base::v[N-1] << rem) >> rem : 0); + } + + inline bool is_neg() const { return S && Base::v[N-1] < 0; } + + // returns false if number is denormal + template + bool normalize_private(ac_int &exp, bool reserved_min_exp=false) { + int expt = exp; + int lshift = leading_sign(); + bool fully_normalized = true; + ac_int min_exp; + min_exp.template set_val(); + int max_shift = exp - min_exp - reserved_min_exp; + if(lshift > max_shift) { + lshift = ac_int(max_shift); + expt = min_exp + reserved_min_exp; + fully_normalized = false; + } else { + expt -= lshift; + } + if(Base::equal_zero()) { + expt = 0; + fully_normalized = true; + } + exp = expt; + Base r; + Base::shift_l(lshift, r); + Base::operator=(r); + bit_adjust(); + return fully_normalized; + } + +public: + static const int width = W; + static const int i_width = W; + static const bool sign = S; + static const ac_q_mode q_mode = AC_TRN; + static const ac_o_mode o_mode = AC_WRAP; + static const int e_width = 0; + + template + struct rt { + enum { + mult_w = W+W2, + mult_s = S||S2, + plus_w = AC_MAX(W+(S2&&!S),W2+(S&&!S2))+1, + plus_s = S||S2, + minus_w = AC_MAX(W+(S2&&!S),W2+(S&&!S2))+1, + minus_s = true, + div_w = W+S2, + div_s = S||S2, + mod_w = AC_MIN(W,W2+(!S2&&S)), + mod_s = S, + logic_w = AC_MAX(W+(S2&&!S),W2+(S&&!S2)), + logic_s = S||S2 + }; + typedef ac_int mult; + typedef ac_int plus; + typedef ac_int minus; + typedef ac_int logic; + typedef ac_int div; + typedef ac_int mod; + typedef ac_int arg1; + }; + + template + struct rt_T { + typedef typename ac_private::map::t map_T; + typedef typename ac_private::rt_ac_int_T::template op1::mult mult; + typedef typename ac_private::rt_ac_int_T::template op1::plus plus; + typedef typename ac_private::rt_ac_int_T::template op1::minus minus; + typedef typename ac_private::rt_ac_int_T::template op1::minus2 minus2; + typedef typename ac_private::rt_ac_int_T::template op1::logic logic; + typedef typename ac_private::rt_ac_int_T::template op1::div div; + typedef typename ac_private::rt_ac_int_T::template op1::div2 div2; + typedef ac_int arg1; + }; + + struct rt_unary { + enum { + neg_w = W+1, + neg_s = true, + mag_sqr_w = 2*W-S, + mag_sqr_s = false, + mag_w = W+S, + mag_s = false, + leading_sign_w = ac::log2_ceil::val, + leading_sign_s = false + }; + typedef ac_int neg; + typedef ac_int mag_sqr; + typedef ac_int mag; + typedef ac_int leading_sign; + template + struct set { + enum { sum_w = W + ac::log2_ceil::val, sum_s = S}; + typedef ac_int sum; + }; + }; + + template friend 
class ac_int; + template friend class ac_fixed; + ac_int() { +#if !defined(__SYNTHESIS__) && defined(AC_DEFAULT_IN_RANGE) + bit_adjust(); +#endif + } + template + inline ac_int (const ac_int &op) { + Base::operator =(op); + bit_adjust(); + } + + inline ac_int( bool b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( char b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( signed char b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( unsigned char b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( signed short b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( unsigned short b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( signed int b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( unsigned int b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( signed long b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( unsigned long b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( Slong b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( Ulong b ) : ConvBase(b) { bit_adjust(); } + inline ac_int( double d ) : ConvBase(d) { bit_adjust(); } + + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( push ) +#pragma warning( disable: 4700 ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wuninitialized" +#endif + template + inline ac_int &set_val() { + const unsigned int all_ones = (unsigned) ~0; + if(V == AC_VAL_DC) { + ac_int r; + Base::operator =(r); + bit_adjust(); + } + else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + Base::operator =(0); + if(S && V == AC_VAL_MIN) { + const unsigned int rem = (W-1)&31; + Base::v[N-1] = (all_ones << rem); + } else if(V == AC_VAL_QUANTUM) + Base::v[0] = 1; + } + else { // AC_VAL_MAX + Base::operator =(-1); + const unsigned int rem = (32-W - !S )&31; + Base::v[N-1] = (all_ones >> 1) >> rem; + } + return *this; + } +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( pop ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + + // Explicit conversion functions to C built-in types ------------- + inline int to_int() const { return Base::v[0]; } + inline unsigned to_uint() const { return Base::v[0]; } + inline long to_long() const { + return ac_private::long_w == 32 ? (long) Base::v[0] : (long) Base::to_int64(); + } + inline unsigned long to_ulong() const { + return ac_private::long_w == 32 ? (unsigned long) Base::v[0] : (unsigned long) Base::to_uint64(); + } + inline Slong to_int64() const { return Base::to_int64(); } + inline Ulong to_uint64() const { return Base::to_uint64(); } + inline double to_double() const { return Base::to_double(); } + + inline int length() const { return W; } + + inline std::string to_string(ac_base_mode base_rep, bool sign_mag = false) const { + // base_rep == AC_DEC => sign_mag == don't care (always print decimal in sign magnitude) + char r[N*32+4] = {0}; + int i = 0; + if(sign_mag) + r[i++] = is_neg() ? '-' : '+'; + else if (base_rep == AC_DEC && is_neg()) + r[i++] = '-'; + if(base_rep != AC_DEC) { + r[i++] = '0'; + r[i++] = base_rep == AC_BIN ? 'b' : (base_rep == AC_OCT ? 
'o' : 'x'); + } + int str_w; + if( (base_rep == AC_DEC || sign_mag) && is_neg() ) { + ac_int mag = operator -(); + str_w = ac_private::to_string(mag.v, W+1, sign_mag, base_rep, false, r+i); + } else { + ac_int tmp = *this; + str_w = ac_private::to_string(tmp.v, W+!S, sign_mag, base_rep, false, r+i); + } + if(!str_w) { + r[i] = '0'; + r[i+1] = 0; + } + return std::string(r); + } + inline static std::string type_name() { + const char *tf[] = {",false>", ",true>"}; + std::string r = "ac_int<"; + r += ac_int<32,true>(W).to_string(AC_DEC); + r += tf[S]; + return r; + } + + // Arithmetic : Binary ---------------------------------------------------- + template + typename rt::mult operator *( const ac_int &op2) const { + typename rt::mult r; + Base::mult(op2, r); + return r; + } + template + typename rt::plus operator +( const ac_int &op2) const { + typename rt::plus r; + Base::add(op2, r); + return r; + } + template + typename rt::minus operator -( const ac_int &op2) const { + typename rt::minus r; + Base::sub(op2, r); + return r; + } +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wenum-compare" +#endif + template + typename rt::div operator /( const ac_int &op2) const { + typename rt::div r; + enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, + num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = rt::div::N }; + Base::template div(op2, r); + return r; + } + template + typename rt::mod operator %( const ac_int &op2) const { + typename rt::mod r; + enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, + num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = rt::mod::N }; + Base::template rem(op2, r); + return r; + } +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif + // Arithmetic assign ------------------------------------------------------ + template + ac_int &operator *=( const ac_int &op2) { + Base r; + Base::mult(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator +=( const ac_int &op2) { + Base r; + Base::add(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator -=( const ac_int &op2) { + Base r; + Base::sub(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wenum-compare" +#endif + template + ac_int &operator /=( const ac_int &op2) { + enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, + num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = N }; + Base r; + Base::template div(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator %=( const ac_int &op2) { + enum {Nminus = ac_int::N, N2 = ac_int::N, N2minus = ac_int::N, + num_s = S + (Nminus > N), den_s = S2 + (N2minus > N2), Nr = N }; + Base r; + Base::template rem(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif + // Arithmetic prefix increment, decrement ---------------------------------- + ac_int &operator ++() { + Base::increment(); + bit_adjust(); + return *this; + } + ac_int &operator --() { + 
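    // Annotation (added, not part of the vendored source): prefix decrement.
    // Base::decrement() propagates the borrow across the 32-bit words of the
    // representation, and bit_adjust() then re-normalizes the top word so the
    // result stays within W bits (sign-extended when S is true).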
Base::decrement(); + bit_adjust(); + return *this; + } + // Arithmetic postfix increment, decrement --------------------------------- + const ac_int operator ++(int) { + ac_int t = *this; + Base::increment(); + bit_adjust(); + return t; + } + const ac_int operator --(int) { + ac_int t = *this; + Base::decrement(); + bit_adjust(); + return t; + } + // Arithmetic Unary -------------------------------------------------------- + ac_int operator +() { + return *this; + } + typename rt_unary::neg operator -() const { + typename rt_unary::neg r; + Base::neg(r); + r.bit_adjust(); + return r; + } + // ! ------------------------------------------------------------------------ + bool operator ! () const { + return Base::equal_zero(); + } + + // Bitwise (arithmetic) unary: complement ----------------------------- + ac_int operator ~() const { + ac_int r; + Base::bitwise_complement(r); + return r; + } + // Bitwise (non-arithmetic) bit_complement ----------------------------- + ac_int bit_complement() const { + ac_int r; + Base::bitwise_complement(r); + r.bit_adjust(); + return r; + } + // Bitwise (arithmetic): and, or, xor ---------------------------------- + template + typename rt::logic operator & ( const ac_int &op2) const { + typename rt::logic r; + Base::bitwise_and(op2, r); + return r; + } + template + typename rt::logic operator | ( const ac_int &op2) const { + typename rt::logic r; + Base::bitwise_or(op2, r); + return r; + } + template + typename rt::logic operator ^ ( const ac_int &op2) const { + typename rt::logic r; + Base::bitwise_xor(op2, r); + return r; + } + // Bitwise assign (not arithmetic): and, or, xor ---------------------------- + template + ac_int &operator &= ( const ac_int &op2 ) { + Base r; + Base::bitwise_and(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator |= ( const ac_int &op2 ) { + Base r; + Base::bitwise_or(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator ^= ( const ac_int &op2 ) { + Base r; + Base::bitwise_xor(op2, r); + Base::operator=(r); + bit_adjust(); + return *this; + } + // Shift (result constrained by left operand) ------------------------------- + template + ac_int operator << ( const ac_int &op2 ) const { + ac_int r; + Base::shift_l2(op2.to_int(), r); + r.bit_adjust(); + return r; + } + template + ac_int operator << ( const ac_int &op2 ) const { + ac_int r; + Base::shift_l(op2.to_uint(), r); + r.bit_adjust(); + return r; + } + template + ac_int operator >> ( const ac_int &op2 ) const { + ac_int r; + Base::shift_r2(op2.to_int(), r); + r.bit_adjust(); + return r; + } + template + ac_int operator >> ( const ac_int &op2 ) const { + ac_int r; + Base::shift_r(op2.to_uint(), r); + r.bit_adjust(); + return r; + } + // Shift assign ------------------------------------------------------------ + template + ac_int &operator <<= ( const ac_int &op2 ) { + Base r; + Base::shift_l2(op2.to_int(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator <<= ( const ac_int &op2 ) { + Base r; + Base::shift_l(op2.to_uint(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator >>= ( const ac_int &op2 ) { + Base r; + Base::shift_r2(op2.to_int(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + template + ac_int &operator >>= ( const ac_int &op2 ) { + Base r; + Base::shift_r(op2.to_uint(), r); + Base::operator=(r); + bit_adjust(); + return *this; + } + // Relational 
--------------------------------------------------------------- + template + bool operator == ( const ac_int &op2) const { + return Base::equal(op2); + } + template + bool operator != ( const ac_int &op2) const { + return !Base::equal(op2); + } + template + bool operator < ( const ac_int &op2) const { + return Base::less_than(op2); + } + template + bool operator >= ( const ac_int &op2) const { + return !Base::less_than(op2); + } + template + bool operator > ( const ac_int &op2) const { + return Base::greater_than(op2); + } + template + bool operator <= ( const ac_int &op2) const { + return !Base::greater_than(op2); + } + + // Bit and Slice Select ----------------------------------------------------- + template + inline const ac_int slc(const ac_int &index) const { + ac_int r; + AC_ASSERT(index.to_int() >= 0, "Attempting to read slc with negative indeces"); + unsigned uindex = ac_int(index).to_uint(); + Base::shift_r(uindex, r); + r.bit_adjust(); + return r; + } + + template + inline const ac_int slc(signed index) const { + ac_int r; + AC_ASSERT(index >= 0, "Attempting to read slc with negative indeces"); + unsigned uindex = index & ((unsigned)~0 >> 1); + Base::shift_r(uindex, r); + r.bit_adjust(); + return r; + } + template + inline const ac_int slc(unsigned uindex) const { + ac_int r; + Base::shift_r(uindex, r); + r.bit_adjust(); + return r; + } + + template + inline ac_int &set_slc(const ac_int lsb, const ac_int &slc) { + AC_ASSERT(lsb.to_int() + W2 <= W && lsb.to_int() >= 0, "Out of bounds set_slc"); + if(W == W2) + Base::operator =(slc); + else { + unsigned ulsb = ac_int(lsb).to_uint(); + Base::set_slc(ulsb, W2, (ac_int) slc); + } + bit_adjust(); // in case sign bit was assigned + return *this; + } + template + inline ac_int &set_slc(signed lsb, const ac_int &slc) { + AC_ASSERT(lsb + W2 <= W && lsb >= 0, "Out of bounds set_slc"); + if(W == W2) + Base::operator =(slc); + else { + unsigned ulsb = lsb & ((unsigned)~0 >> 1); + Base::set_slc(ulsb, W2, (ac_int) slc); + } + bit_adjust(); // in case sign bit was assigned + return *this; + } + template + inline ac_int &set_slc(unsigned ulsb, const ac_int &slc) { + AC_ASSERT(ulsb + W2 <= W, "Out of bounds set_slc"); + if(W == W2) + Base::operator =(slc); + else + Base::set_slc(ulsb, W2, (ac_int) slc); + bit_adjust(); // in case sign bit was assigned + return *this; + } + + template + inline ac::sliceref range() { + #if __cplusplus > 199711L + static_assert(Msb-Lsb+1 > 0, "Range length not positive: MSB < LSB"); + static_assert(Lsb >= 0, "LSB is negative"); + static_assert(Msb < W, "MSB >= W"); + #endif + return ac::sliceref(Base::v); + } + + class ac_bitref { +# if defined(__SYNTHESIS__) && !defined(AC_IGNORE_BUILTINS) +# pragma builtin +# endif + ac_int &d_bv; + unsigned d_index; + public: + ac_bitref( ac_int *bv, unsigned index=0 ) : d_bv(*bv), d_index(index) {} + operator bool () const { return (d_index < W) ? 
(d_bv.v[d_index>>5]>>(d_index&31) & 1) : 0; } + + template + operator ac_int () const { return operator bool (); } + + inline ac_bitref operator = ( int val ) { + // lsb of int (val&1) is written to bit + if(d_index < W) { + int *pval = &d_bv.v[d_index>>5]; + *pval ^= (*pval ^ ( (unsigned) val << (d_index&31) )) & 1 << (d_index&31); + d_bv.bit_adjust(); // in case sign bit was assigned + } + return *this; + } + template + inline ac_bitref operator = ( const ac_int &val ) { + return operator =(val.to_int()); + } + inline ac_bitref operator = ( const ac_bitref &val ) { + return operator =((int) (bool) val); + } + }; + + ac_bitref operator [] ( unsigned int uindex) { + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + ac_bitref bvh( this, uindex ); + return bvh; + } + ac_bitref operator [] ( int index) { + AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); + unsigned uindex = index & ((unsigned)~0 >> 1); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + ac_bitref bvh( this, uindex ); + return bvh; + } + template + ac_bitref operator [] ( const ac_int &index) { + AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); + unsigned uindex = ac_int(index).to_uint(); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + ac_bitref bvh( this, uindex ); + return bvh; + } + bool operator [] ( unsigned int uindex) const { + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; + } + bool operator [] ( int index) const { + AC_ASSERT(index >= 0, "Attempting to read bit with negative index"); + unsigned uindex = index & ((unsigned)~0 >> 1); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + return (uindex < W) ? (Base::v[uindex>>5]>>(uindex&31) & 1) : 0; + } + template + bool operator [] ( const ac_int &index) const { + AC_ASSERT(index.to_int() >= 0, "Attempting to read bit with negative index"); + unsigned uindex = ac_int(index).to_uint(); + AC_ASSERT(uindex < W, "Attempting to read bit beyond MSB"); + return (uindex < W) ? 
(Base::v[uindex>>5]>>(uindex&31) & 1) : 0; + } + + typename rt_unary::leading_sign leading_sign() const { + unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; + return ls; + } + typename rt_unary::leading_sign leading_sign(bool &all_sign) const { + unsigned ls = Base::leading_bits(S & (Base::v[N-1] < 0)) - (32*N - W)-S; + all_sign = (ls == W-S); + return ls; + } + // returns false if number is denormal + template + bool normalize(ac_int &exp) { + return normalize_private(exp); + } + // returns false if number is denormal, minimum exponent is reserved (usually for encoding special values/errors) + template + bool normalize_RME(ac_int &exp) { + return normalize_private(exp, true); + } + bool and_reduce() const { + return ac_private::iv_equal_ones_to(Base::v); + } + bool or_reduce() const { + return !Base::equal_zero(); + } + bool xor_reduce() const { + unsigned r = Base::v[N-1]; + if(S) { + const unsigned rem = (32-W)&31; + r = (r << rem) >> rem; + } + if(N > 1) + r ^= Base::v[N-2]; + if(N > 2) { + for(int i=0; i 16) + r ^= r >> 16; + if(W > 8) + r ^= r >> 8; + if(W > 4) + r ^= r >> 4; + if(W > 2) + r ^= r >> 2; + if(W > 1) + r ^= r >> 1; + return r&1; + } + + inline void bit_fill_hex(const char *str) { + // Zero Pads if str is too short, throws ms bits away if str is too long + // Asserts if anything other than 0-9a-fA-F is encountered + ac_int res = 0; + while(*str) { + char c = *str; + int h = 0; + if(c >= '0' && c <= '9') + h = c - '0'; + else if(c >= 'A' && c <= 'F') + h = c - 'A' + 10; + else if(c >= 'a' && c <= 'f') + h = c - 'a' + 10; + else { + AC_ASSERT(!c, "Invalid hex digit"); + break; + } + res <<= ac_int<3,false>(4); + res |= ac_int<4,false>(h); + str++; + } + *this = res; + } + + template + inline void bit_fill(const int (&ivec)[Na], bool bigendian=true) { + // bit_fill from integer vector + // if W > N*32, missing most significant bits are zeroed + // if W < N*32, additional bits in ivec are ignored (no overflow checking) + // Example: + // ac_int<80,false> x; int vec[] = { 0xffffa987, 0x6543210f, 0xedcba987 }; + // x.bit_fill(vec); // vec[0] fill bits 79-64 + enum { N0 = (W+31)/32, M = AC_MIN(N0,Na) }; + ac_int res = 0; + for(int i=0; i < M; i++) + res.set_slc(i*32, ac_int<32>(ivec[bigendian ? 
M-1-i : i])); + *this = res; + } +}; + +namespace ac { + template + struct rt_2T { + typedef typename ac_private::map::t map_T; + typedef typename ac_private::map::t map_T2; + typedef typename map_T::template rt_T< map_T2 >::mult mult; + typedef typename map_T::template rt_T< map_T2 >::plus plus; + typedef typename map_T::template rt_T< map_T2 >::minus minus; + typedef typename map_T::template rt_T< map_T2 >::minus2 minus2; + typedef typename map_T::template rt_T< map_T2 >::logic logic; + typedef typename map_T::template rt_T< map_T2 >::div div; + typedef typename map_T::template rt_T< map_T2 >::div2 div2; + }; +} + +namespace ac { + template + struct ac_int_represent { + enum { t_w = ac_private::c_type_params::W, t_s = ac_private::c_type_params::S }; + typedef ac_int type; + }; + template<> struct ac_int_represent {}; + template<> struct ac_int_represent {}; + template + struct ac_int_represent< ac_int > { + typedef ac_int type; + }; +} + +namespace ac_private { + template + struct rt_ac_int_T< ac_int > { + typedef ac_int i2_t; + template + struct op1 { + typedef ac_int i_t; + typedef typename i_t::template rt::mult mult; + typedef typename i_t::template rt::plus plus; + typedef typename i_t::template rt::minus minus; + typedef typename i2_t::template rt::minus minus2; + typedef typename i_t::template rt::logic logic; + typedef typename i_t::template rt::div div; + typedef typename i2_t::template rt::div div2; + typedef typename i_t::template rt::mod mod; + typedef typename i2_t::template rt::mod mod2; + }; + }; + + template + struct rt_ac_int_T< c_type > { + typedef typename ac::ac_int_represent::type i2_t; + enum { W2 = i2_t::width, S2 = i2_t::sign }; + template + struct op1 { + typedef ac_int i_t; + typedef typename i_t::template rt::mult mult; + typedef typename i_t::template rt::plus plus; + typedef typename i_t::template rt::minus minus; + typedef typename i2_t::template rt::minus minus2; + typedef typename i_t::template rt::logic logic; + typedef typename i_t::template rt::div div; + typedef typename i2_t::template rt::div div2; + typedef typename i_t::template rt::mod mod; + typedef typename i2_t::template rt::mod mod2; + }; + }; +} + + +// Specializations for constructors on integers that bypass bit adjusting +// and are therefore more efficient +template<> inline ac_int<1,true>::ac_int( bool b ) { v[0] = b ? 
-1 : 0; } + +template<> inline ac_int<1,false>::ac_int( bool b ) { v[0] = b; } +template<> inline ac_int<1,false>::ac_int( signed char b ) { v[0] = b&1; } +template<> inline ac_int<1,false>::ac_int( unsigned char b ) { v[0] = b&1; } +template<> inline ac_int<1,false>::ac_int( signed short b ) { v[0] = b&1; } +template<> inline ac_int<1,false>::ac_int( unsigned short b ) { v[0] = b&1; } +template<> inline ac_int<1,false>::ac_int( signed int b ) { v[0] = b&1; } +template<> inline ac_int<1,false>::ac_int( unsigned int b ) { v[0] = b&1; } +template<> inline ac_int<1,false>::ac_int( signed long b ) { v[0] = b&1; } +template<> inline ac_int<1,false>::ac_int( unsigned long b ) { v[0] = b&1; } +template<> inline ac_int<1,false>::ac_int( Ulong b ) { v[0] = (int) b&1; } +template<> inline ac_int<1,false>::ac_int( Slong b ) { v[0] = (int) b&1; } + +template<> inline ac_int<8,true>::ac_int( bool b ) { v[0] = b; } +template<> inline ac_int<8,false>::ac_int( bool b ) { v[0] = b; } +template<> inline ac_int<8,true>::ac_int( signed char b ) { v[0] = b; } +template<> inline ac_int<8,false>::ac_int( unsigned char b ) { v[0] = b; } +template<> inline ac_int<8,true>::ac_int( unsigned char b ) { v[0] = (signed char) b; } +template<> inline ac_int<8,false>::ac_int( signed char b ) { v[0] = (unsigned char) b; } + +template<> inline ac_int<16,true>::ac_int( bool b ) { v[0] = b; } +template<> inline ac_int<16,false>::ac_int( bool b ) { v[0] = b; } +template<> inline ac_int<16,true>::ac_int( signed char b ) { v[0] = b; } +template<> inline ac_int<16,false>::ac_int( unsigned char b ) { v[0] = b; } +template<> inline ac_int<16,true>::ac_int( unsigned char b ) { v[0] = b; } +template<> inline ac_int<16,false>::ac_int( signed char b ) { v[0] = (unsigned short) b; } +template<> inline ac_int<16,true>::ac_int( signed short b ) { v[0] = b; } +template<> inline ac_int<16,false>::ac_int( unsigned short b ) { v[0] = b; } +template<> inline ac_int<16,true>::ac_int( unsigned short b ) { v[0] = (signed short) b; } +template<> inline ac_int<16,false>::ac_int( signed short b ) { v[0] = (unsigned short) b; } + +template<> inline ac_int<32,true>::ac_int( signed int b ) { v[0] = b; } +template<> inline ac_int<32,true>::ac_int( unsigned int b ) { v[0] = b; } +template<> inline ac_int<32,false>::ac_int( signed int b ) { v[0] = b; v[1] = 0;} +template<> inline ac_int<32,false>::ac_int( unsigned int b ) { v[0] = b; v[1] = 0;} + +template<> inline ac_int<32,true>::ac_int( Slong b ) { v[0] = (int) b; } +template<> inline ac_int<32,true>::ac_int( Ulong b ) { v[0] = (int) b; } +template<> inline ac_int<32,false>::ac_int( Slong b ) { v[0] = (int) b; v[1] = 0;} +template<> inline ac_int<32,false>::ac_int( Ulong b ) { v[0] = (int) b; v[1] = 0;} + +template<> inline ac_int<64,true>::ac_int( Slong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); } +template<> inline ac_int<64,true>::ac_int( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32);} +template<> inline ac_int<64,false>::ac_int( Slong b ) { v[0] = (int) b; v[1] = (int) ((Ulong) b >> 32); v[2] = 0; } +template<> inline ac_int<64,false>::ac_int( Ulong b ) { v[0] = (int) b; v[1] = (int) (b >> 32); v[2] = 0; } + +// Stream -------------------------------------------------------------------- + +template +inline std::ostream& operator << (std::ostream &os, const ac_int &x) { +#ifndef __SYNTHESIS__ + if ((os.flags() & std::ios::hex) != 0) { + os << x.to_string(AC_HEX); + } else if ((os.flags() & std::ios::oct) != 0) { + os << x.to_string(AC_OCT); + } else { + os << x.to_string(AC_DEC); + } +#endif 
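// Annotation (added, not part of the vendored source): this operator<< honors
// the stream's basefield flags, so e.g. `std::cout << std::hex << x` prints
// via to_string(AC_HEX). Under __SYNTHESIS__ the printing body is compiled
// out entirely and the stream is returned unchanged.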
+ return os;
+}
+
+// Macros for Binary Operators with Integers --------------------------------------------
+
+#define BIN_OP_WITH_INT(BIN_OP, C_TYPE, WI, SI, RTYPE) \
+  template<int W, bool S> \
+  inline typename ac_int<WI,SI>::template rt<W,S>::RTYPE operator BIN_OP ( C_TYPE i_op, const ac_int<W,S> &op) { \
+    return ac_int<WI,SI>(i_op).operator BIN_OP (op); \
+  } \
+  template<int W, bool S> \
+  inline typename ac_int<W,S>::template rt<WI,SI>::RTYPE operator BIN_OP ( const ac_int<W,S> &op, C_TYPE i_op) { \
+    return op.operator BIN_OP (ac_int<WI,SI>(i_op)); \
+  }
+
+#define REL_OP_WITH_INT(REL_OP, C_TYPE, W2, S2) \
+  template<int W, bool S> \
+  inline bool operator REL_OP ( const ac_int<W,S> &op, C_TYPE op2) { \
+    return op.operator REL_OP (ac_int<W2,S2>(op2)); \
+  } \
+  template<int W, bool S> \
+  inline bool operator REL_OP ( C_TYPE op2, const ac_int<W,S> &op) { \
+    return ac_int<W2,S2>(op2).operator REL_OP (op); \
+  }
+
+#define ASSIGN_OP_WITH_INT(ASSIGN_OP, C_TYPE, W2, S2) \
+  template<int W, bool S> \
+  inline ac_int<W,S> &operator ASSIGN_OP ( ac_int<W,S> &op, C_TYPE op2) { \
+    return op.operator ASSIGN_OP (ac_int<W2,S2>(op2)); \
+  }
+
+#define OPS_WITH_INT(C_TYPE, WI, SI) \
+  BIN_OP_WITH_INT(*, C_TYPE, WI, SI, mult) \
+  BIN_OP_WITH_INT(+, C_TYPE, WI, SI, plus) \
+  BIN_OP_WITH_INT(-, C_TYPE, WI, SI, minus) \
+  BIN_OP_WITH_INT(/, C_TYPE, WI, SI, div) \
+  BIN_OP_WITH_INT(%, C_TYPE, WI, SI, mod) \
+  BIN_OP_WITH_INT(>>, C_TYPE, WI, SI, arg1) \
+  BIN_OP_WITH_INT(<<, C_TYPE, WI, SI, arg1) \
+  BIN_OP_WITH_INT(&, C_TYPE, WI, SI, logic) \
+  BIN_OP_WITH_INT(|, C_TYPE, WI, SI, logic) \
+  BIN_OP_WITH_INT(^, C_TYPE, WI, SI, logic) \
+  \
+  REL_OP_WITH_INT(==, C_TYPE, WI, SI) \
+  REL_OP_WITH_INT(!=, C_TYPE, WI, SI) \
+  REL_OP_WITH_INT(>, C_TYPE, WI, SI) \
+  REL_OP_WITH_INT(>=, C_TYPE, WI, SI) \
+  REL_OP_WITH_INT(<, C_TYPE, WI, SI) \
+  REL_OP_WITH_INT(<=, C_TYPE, WI, SI) \
+  \
+  ASSIGN_OP_WITH_INT(+=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(-=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(*=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(/=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(%=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(>>=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(<<=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(&=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(|=, C_TYPE, WI, SI) \
+  ASSIGN_OP_WITH_INT(^=, C_TYPE, WI, SI)
+
+// ------------------------------------- End of Macros for Binary Operators with Integers
+
+// for backward compatibility with v3.9.0 and earlier define following macro
+#ifdef AC_INT_NS_FOR_MIXED_OPERATORS
+namespace ac {
+  namespace ops_with_other_types {
+#endif
+// Mixed Operators with Integers -----------------------------------------------
+OPS_WITH_INT(bool, 1, false)
+OPS_WITH_INT(char, 8, true)
+OPS_WITH_INT(signed char, 8, true)
+OPS_WITH_INT(unsigned char, 8, false)
+OPS_WITH_INT(short, 16, true)
+OPS_WITH_INT(unsigned short, 16, false)
+OPS_WITH_INT(int, 32, true)
+OPS_WITH_INT(unsigned int, 32, false)
+OPS_WITH_INT(long, ac_private::long_w, true)
+OPS_WITH_INT(unsigned long, ac_private::long_w, false)
+OPS_WITH_INT(Slong, 64, true)
+OPS_WITH_INT(Ulong, 64, false)
+// ----------------------------------------- End of Mixed Operators with Integers
+#ifdef AC_INT_NS_FOR_MIXED_OPERATORS
+  } // ops_with_other_types namespace
+}
+using namespace ac::ops_with_other_types;
+#endif
+
+namespace ac {
+  // Functions to fill bits
+
+  template<typename T>
+  inline T bit_fill_hex(const char *str) {
+    T res;
+    res.bit_fill_hex(str);
+    return res;
+  }
+
+  // returns bit_fill for type
+  // example:
+  //   ac_int<80,false> x = ac::bit_fill< ac_int<80,false> > ((int [3]) {0xffffa987, 0x6543210f, 0xedcba987 });
+  template<typename T, int N>
+  inline T bit_fill(const int (&ivec)[N], bool bigendian=true)
{ + T res; + res.bit_fill(ivec, bigendian); + return res; + } + +} // ac namespace + +// Mixed Operators with Pointers ----------------------------------------------- + +// Addition of ac_int and pointer +template +T *operator +(T *ptr, const ac_int &op2) { + return ptr + op2.to_int64(); +} +template +T *operator +(const ac_int &op2, T *ptr) { + return ptr + op2.to_int64(); +} +// Subtraction of ac_int from pointer +template +T *operator -(T *ptr, const ac_int &op2) { + return ptr - op2.to_int64(); +} +// ----------------------------------------- End of Mixed Operators with Pointers + +namespace ac_intN { + /////////////////////////////////////////////////////////////////////////////// + // Predefined for ease of use + /////////////////////////////////////////////////////////////////////////////// + typedef ac_int<1, true> int1; + typedef ac_int<1, false> uint1; + typedef ac_int<2, true> int2; + typedef ac_int<2, false> uint2; + typedef ac_int<3, true> int3; + typedef ac_int<3, false> uint3; + typedef ac_int<4, true> int4; + typedef ac_int<4, false> uint4; + typedef ac_int<5, true> int5; + typedef ac_int<5, false> uint5; + typedef ac_int<6, true> int6; + typedef ac_int<6, false> uint6; + typedef ac_int<7, true> int7; + typedef ac_int<7, false> uint7; + typedef ac_int<8, true> int8; + typedef ac_int<8, false> uint8; + typedef ac_int<9, true> int9; + typedef ac_int<9, false> uint9; + typedef ac_int<10, true> int10; + typedef ac_int<10, false> uint10; + typedef ac_int<11, true> int11; + typedef ac_int<11, false> uint11; + typedef ac_int<12, true> int12; + typedef ac_int<12, false> uint12; + typedef ac_int<13, true> int13; + typedef ac_int<13, false> uint13; + typedef ac_int<14, true> int14; + typedef ac_int<14, false> uint14; + typedef ac_int<15, true> int15; + typedef ac_int<15, false> uint15; + typedef ac_int<16, true> int16; + typedef ac_int<16, false> uint16; + typedef ac_int<17, true> int17; + typedef ac_int<17, false> uint17; + typedef ac_int<18, true> int18; + typedef ac_int<18, false> uint18; + typedef ac_int<19, true> int19; + typedef ac_int<19, false> uint19; + typedef ac_int<20, true> int20; + typedef ac_int<20, false> uint20; + typedef ac_int<21, true> int21; + typedef ac_int<21, false> uint21; + typedef ac_int<22, true> int22; + typedef ac_int<22, false> uint22; + typedef ac_int<23, true> int23; + typedef ac_int<23, false> uint23; + typedef ac_int<24, true> int24; + typedef ac_int<24, false> uint24; + typedef ac_int<25, true> int25; + typedef ac_int<25, false> uint25; + typedef ac_int<26, true> int26; + typedef ac_int<26, false> uint26; + typedef ac_int<27, true> int27; + typedef ac_int<27, false> uint27; + typedef ac_int<28, true> int28; + typedef ac_int<28, false> uint28; + typedef ac_int<29, true> int29; + typedef ac_int<29, false> uint29; + typedef ac_int<30, true> int30; + typedef ac_int<30, false> uint30; + typedef ac_int<31, true> int31; + typedef ac_int<31, false> uint31; + typedef ac_int<32, true> int32; + typedef ac_int<32, false> uint32; + typedef ac_int<33, true> int33; + typedef ac_int<33, false> uint33; + typedef ac_int<34, true> int34; + typedef ac_int<34, false> uint34; + typedef ac_int<35, true> int35; + typedef ac_int<35, false> uint35; + typedef ac_int<36, true> int36; + typedef ac_int<36, false> uint36; + typedef ac_int<37, true> int37; + typedef ac_int<37, false> uint37; + typedef ac_int<38, true> int38; + typedef ac_int<38, false> uint38; + typedef ac_int<39, true> int39; + typedef ac_int<39, false> uint39; + typedef ac_int<40, true> int40; + typedef 
ac_int<40, false> uint40; + typedef ac_int<41, true> int41; + typedef ac_int<41, false> uint41; + typedef ac_int<42, true> int42; + typedef ac_int<42, false> uint42; + typedef ac_int<43, true> int43; + typedef ac_int<43, false> uint43; + typedef ac_int<44, true> int44; + typedef ac_int<44, false> uint44; + typedef ac_int<45, true> int45; + typedef ac_int<45, false> uint45; + typedef ac_int<46, true> int46; + typedef ac_int<46, false> uint46; + typedef ac_int<47, true> int47; + typedef ac_int<47, false> uint47; + typedef ac_int<48, true> int48; + typedef ac_int<48, false> uint48; + typedef ac_int<49, true> int49; + typedef ac_int<49, false> uint49; + typedef ac_int<50, true> int50; + typedef ac_int<50, false> uint50; + typedef ac_int<51, true> int51; + typedef ac_int<51, false> uint51; + typedef ac_int<52, true> int52; + typedef ac_int<52, false> uint52; + typedef ac_int<53, true> int53; + typedef ac_int<53, false> uint53; + typedef ac_int<54, true> int54; + typedef ac_int<54, false> uint54; + typedef ac_int<55, true> int55; + typedef ac_int<55, false> uint55; + typedef ac_int<56, true> int56; + typedef ac_int<56, false> uint56; + typedef ac_int<57, true> int57; + typedef ac_int<57, false> uint57; + typedef ac_int<58, true> int58; + typedef ac_int<58, false> uint58; + typedef ac_int<59, true> int59; + typedef ac_int<59, false> uint59; + typedef ac_int<60, true> int60; + typedef ac_int<60, false> uint60; + typedef ac_int<61, true> int61; + typedef ac_int<61, false> uint61; + typedef ac_int<62, true> int62; + typedef ac_int<62, false> uint62; + typedef ac_int<63, true> int63; + typedef ac_int<63, false> uint63; +} // namespace ac_intN + +#ifndef AC_NOT_USING_INTN +using namespace ac_intN; +#endif + +/////////////////////////////////////////////////////////////////////////////// + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( disable: 4700 ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wuninitialized" +#endif + +// Global templatized functions for easy initialization to special values +template +inline ac_int value(ac_int) { + ac_int r; + return r.template set_val(); +} +// forward declaration, otherwise GCC errors when calling init_array +template +inline ac_fixed value(ac_fixed); + +#define SPECIAL_VAL_FOR_INTS_DC(C_TYPE, WI, SI) \ +template<> inline C_TYPE value(C_TYPE) { C_TYPE x; return x; } + +// -- C int types ----------------------------------------------------------------- +#define SPECIAL_VAL_FOR_INTS(C_TYPE, WI, SI) \ +template inline C_TYPE value(C_TYPE); \ +template<> inline C_TYPE value(C_TYPE) { return (C_TYPE)0; } \ +SPECIAL_VAL_FOR_INTS_DC(C_TYPE, WI, SI) \ +template<> inline C_TYPE value(C_TYPE) { return (C_TYPE)1; } \ +template<> inline C_TYPE value(C_TYPE) { return (C_TYPE)(SI ? ~(((C_TYPE) 1) << (WI-1)) : (C_TYPE) -1); } \ +template<> inline C_TYPE value(C_TYPE) { return (C_TYPE)(SI ? 
((C_TYPE) 1) << (WI-1) : (C_TYPE) 0); } + +SPECIAL_VAL_FOR_INTS(bool, 1, false) +SPECIAL_VAL_FOR_INTS(char, 8, true) +SPECIAL_VAL_FOR_INTS(signed char, 8, true) +SPECIAL_VAL_FOR_INTS(unsigned char, 8, false) +SPECIAL_VAL_FOR_INTS(short, 16, true) +SPECIAL_VAL_FOR_INTS(unsigned short, 16, false) +SPECIAL_VAL_FOR_INTS(int, 32, true) +SPECIAL_VAL_FOR_INTS(unsigned int, 32, false) +SPECIAL_VAL_FOR_INTS(long, ac_private::long_w, true) +SPECIAL_VAL_FOR_INTS(unsigned long, ac_private::long_w, false) +SPECIAL_VAL_FOR_INTS(Slong, 64, true) +SPECIAL_VAL_FOR_INTS(Ulong, 64, false) + +#define INIT_ARRAY_SPECIAL_VAL_FOR_INTS(C_TYPE) \ + template \ + inline bool init_array(C_TYPE *a, int n) { \ + C_TYPE t = value((C_TYPE) 0); \ + for(int i=0; i < n; i++) \ + a[i] = t; \ + return true; \ + } + +namespace ac { +// PUBLIC FUNCTIONS +// function to initialize (or uninitialize) arrays + template + inline bool init_array(ac_int *a, int n) { + ac_int t; + t.template set_val(); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } + + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(bool) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(char) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed char) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned char) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed short) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned short) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed int) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned int) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed long) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned long) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(signed long long) + INIT_ARRAY_SPECIAL_VAL_FOR_INTS(unsigned long long) +} + +#if (defined(_MSC_VER) && !defined(__EDG__)) +#pragma warning( pop ) +#endif +#if (defined(__GNUC__) && ( __GNUC__ == 4 && __GNUC_MINOR__ >= 6 || __GNUC__ > 4 ) && !defined(__EDG__)) +#pragma GCC diagnostic pop +#endif +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +#ifdef __AC_NAMESPACE +} +#endif + +#endif // __AC_INT_H diff --git a/hls4ml/templates/quartus/ac_types/ac_sc.h b/hls4ml/templates/quartus/ac_types/ac_sc.h index 0921471dc9..01601a5a4a 100644 --- a/hls4ml/templates/quartus/ac_types/ac_sc.h +++ b/hls4ml/templates/quartus/ac_types/ac_sc.h @@ -1,552 +1,552 @@ -/************************************************************************** - * * - * Algorithmic C (tm) Datatypes * - * * - * Software Version: 4.0 * - * * - * Release Date : Sat Jun 13 12:35:18 PDT 2020 * - * Release Type : Production Release * - * Release Build : 4.0.0 * - * * - * Copyright 2004-2019, Mentor Graphics Corporation, * - * * - * All Rights Reserved. * - * * - ************************************************************************** - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * - * implied. * - * See the License for the specific language governing permissions and * - * limitations under the License. * - ************************************************************************** - * * - * The most recent version of this package is available at github. 
* - * * - *************************************************************************/ - -#ifndef __AC_SC_H -#define __AC_SC_H - -#ifndef __cplusplus -#error C++ is required to include this header file -#endif - -#if !defined(IEEE_1666_SYSTEMC) && !defined(SYSTEMC_VERSION) && !defined(SC_API_VERSION_STRING) -#error SystemC header file needs to be included before the ac_sc is included -#endif - -#include - -#ifdef __AC_NAMESPACE -namespace __AC_NAMESPACE { -#endif - -// Explicit conversion functions from ac to sc and viceversa -template -ac_int to_ac(const sc_dt::sc_bigint &val){ - enum {N = (W+31)/32 }; - sc_dt::sc_bigint v = val; - ac_int r = 0; -#ifdef __SYNTHESIS__ -#pragma UNROLL y -#endif - for(int i = 0; i < N; i++) { - r.set_slc(i*32, ac_int<32,true>(v.to_int())); - v >>= 32; - } - return ac_int(r); -} - -template -ac_int to_ac(const sc_dt::sc_biguint &val){ - enum {N = (W+31)/32 }; - sc_dt::sc_biguint v = val; - ac_int r = 0; -#ifdef __SYNTHESIS__ -#pragma UNROLL y -#endif - for(int i = 0; i < N; i++) { - r.set_slc(i*32, ac_int<32,true>(v.to_int())); - v >>= 32; - } - return ac_int(r); -} - -template -sc_dt::sc_bigint to_sc(const ac_int &val) { - enum {N = (W+31)/32 }; - ac_int v = val; - sc_dt::sc_bigint r; -#ifdef __SYNTHESIS__ -#pragma UNROLL y -#endif - for(int i = N-1; i >= 0; i--) { - r <<= 32; - r.range(31, 0) = (v.template slc<32>(i*32)).to_int(); - } - return sc_dt::sc_bigint(r); -} - -template -sc_dt::sc_biguint to_sc(const ac_int &val) { - enum {N = (W+31)/32 }; - ac_int v = val; - sc_dt::sc_biguint r; -#ifdef __SYNTHESIS__ -#pragma UNROLL y -#endif - for(int i = N-1; i >= 0; i--) { - r <<= 32; - r.range(31, 0) = (v.template slc<32>(i*32)).to_int(); - } - return sc_dt::sc_biguint(r); -} - -#ifdef SC_INCLUDE_FX -template -ac_fixed to_ac(const sc_dt::sc_fixed &val){ - ac_fixed r = 0; - sc_dt::sc_fixed fv; - fv.range(W-1,0) = val.range(W-1,0); - sc_dt::sc_bigint v(fv); - r.set_slc(0, to_ac(v)); - return r; -} - -template -ac_fixed to_ac(const sc_dt::sc_ufixed &val){ - ac_fixed r = 0; - sc_dt::sc_ufixed fv; - fv.range(W-1,0) = val.range(W-1,0); - sc_dt::sc_biguint v(fv); - r.set_slc(0, to_ac(v)); - return r; -} - -template -sc_dt::sc_fixed to_sc(const ac_fixed &val) { - ac_int v = val.template slc(0); - sc_dt::sc_bigint i = to_sc(v); - sc_dt::sc_fixed f(i); - sc_dt::sc_fixed r; - r.range(W-1,0) = f.range(W-1,0); - return r; -} - -template -sc_dt::sc_ufixed to_sc(const ac_fixed &val) { - ac_int v = val.template slc(0); - sc_dt::sc_biguint i = to_sc(v); - sc_dt::sc_ufixed f(i); - sc_dt::sc_ufixed r; - r.range(W-1,0) = f.range(W-1,0); - return r; -} -#endif - -// Utility global functions for initialization - -template -inline sc_dt::sc_int value(sc_dt::sc_int) { - sc_dt::sc_int r; - if(V == AC_VAL_DC) { - int t; - r = t; - } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - r = 0; - if(V == AC_VAL_MIN) - r[W-1] = 1; - else if(V == AC_VAL_QUANTUM) - r[0] = 1; - } else if(AC_VAL_MAX) { - r = -1; - r[W-1] = 0; - } - return r; -} - -template -inline sc_dt::sc_uint value(sc_dt::sc_uint) { - sc_dt::sc_uint r; - if(V == AC_VAL_DC) { - int t; - r = t; - } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - r = 0; - if(V == AC_VAL_QUANTUM) - r[0] = 1; - } else if(AC_VAL_MAX) - r = -1; - return r; -} - -template -inline sc_dt::sc_bigint value(sc_dt::sc_bigint) { - sc_dt::sc_bigint r; - if(V == AC_VAL_DC) { - int t; - r = t; - } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - r = 0; - if(V == AC_VAL_MIN) - r[W-1] = 1; - else if(V == 
AC_VAL_QUANTUM) - r[0] = 1; - } else if(AC_VAL_MAX) { - r = -1; - r[W-1] = 0; - } - return r; -} - -template -inline sc_dt::sc_biguint value(sc_dt::sc_biguint) { - sc_dt::sc_biguint r; - if(V == AC_VAL_DC) { - int t; - r = t; - } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - r = 0; - if(V == AC_VAL_QUANTUM) - r[0] = 1; - } else if(AC_VAL_MAX) - r = -1; - return r; -} - -#ifdef SC_INCLUDE_FX -template -inline sc_dt::sc_fixed value(sc_dt::sc_fixed) { - sc_dt::sc_fixed r; - if(V == AC_VAL_DC) { - int t; - r = t; - } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - r = 0; - if(V == AC_VAL_MIN) - r[W-1] = 1; - else if(V == AC_VAL_QUANTUM) - r[0] = 1; - } else if(AC_VAL_MAX) { - r = ~ (sc_dt::sc_fixed) 0; - r[W-1] = 0; - } - return r; -} - -template -inline sc_dt::sc_ufixed value(sc_dt::sc_ufixed) { - sc_dt::sc_ufixed r; - if(V == AC_VAL_DC) { - int t; - r = t; - } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { - r = 0; - if(V == AC_VAL_QUANTUM) - r[0] = 1; - } else if(AC_VAL_MAX) - r = ~ (sc_dt::sc_ufixed) 0; - return r; -} -#endif - - -namespace ac { -// PUBLIC FUNCTIONS -// function to initialize (or uninitialize) arrays - template - inline bool init_array(sc_dt::sc_int *a, int n) { - sc_dt::sc_int t = value(*a); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } - template - inline bool init_array(sc_dt::sc_uint *a, int n) { - sc_dt::sc_uint t = value(*a); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } - template - inline bool init_array(sc_dt::sc_bigint *a, int n) { - sc_dt::sc_bigint t = value(*a); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } - template - inline bool init_array(sc_dt::sc_biguint *a, int n) { - sc_dt::sc_biguint t = value(*a); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } -#ifdef SC_INCLUDE_FX - template - inline bool init_array(sc_dt::sc_fixed *a, int n) { - sc_dt::sc_fixed t = value(*a); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } - template - inline bool init_array(sc_dt::sc_ufixed *a, int n) { - sc_dt::sc_ufixed t = value(*a); - for(int i=0; i < n; i++) - a[i] = t; - return true; - } -#endif -} - -#ifdef __AC_NAMESPACE -} -#endif - - -// TRACE FUNCTIONS - -// SystemC Versions - 2.2.0 20070314 -// 2.3.0 20120701 -// 2.3.1 20140417 -// 2.3.2 20171012 - -#if !defined(NCSC) -#if (SYSTEMC_VERSION >= 20140417) && !defined(SC_TRACE_FILE_BASE_H_INCLUDED_) -namespace sc_core { -class vcd_trace; -class sc_trace_file_base - : public sc_trace_file -{ -public: - enum vcd_enum {VCD_WIRE=0, VCD_REAL, VCD_EVENT, VCD_TIME, VCD_LAST}; - virtual void do_initialize() = 0; - FILE* fp; -#if (SYSTEMC_VERSION >= 20171012) - sc_time::value_type trace_unit_fs, kernel_unit_fs; -#else - double timescale_unit; -#endif - bool timescale_set_by_user; - std::string filename_; - bool initialized_; - bool trace_delta_cycles_; - virtual ~sc_trace_file_base(); -}; -class vcd_trace_file - : public sc_trace_file_base -{ -public: - ~vcd_trace_file(); - std::string obtain_name(); - virtual void do_initialize(); - unsigned vcd_name_index; -#if (SYSTEMC_VERSION >= 20171012) - sc_time::value_type previous_time_units_low, previous_time_units_high; -#else - unsigned previous_time_units_low, previous_time_units_high; -#endif - std::vector traces; -}; -} -#endif - -namespace sc_core { -//============================================================================== -// The following block of code is copied from the file sc_vcd_trace.cpp in the -// SystemC distribution. 
This code should have been placed in the file -// sc_vcd_trace.h to allow proper C++ derivation. -class vcd_trace -{ -public: - vcd_trace(const std::string& name_, const std::string& vcd_name_); - virtual void write(FILE* f) = 0; - virtual void set_width(); - virtual bool changed() = 0; -#if (SYSTEMC_VERSION >= 20171012) - virtual void print_variable_declaration_line(FILE* f, const char* scoped_name); -#else - virtual void print_variable_declaration_line(FILE* f); -#endif - void compose_data_line(char* rawdata, char* compdata); - -#if (SYSTEMC_VERSION >= 20140417) - std::string compose_line(const std::string& data); -#else - std::string compose_line(const std::string data); -#endif - virtual ~vcd_trace(); - const std::string name; - const std::string vcd_name; -#if (SYSTEMC_VERSION >= 20171012) - vcd_trace_file::vcd_enum vcd_var_type; -#else - const char* vcd_var_typ_name; -#endif - int bit_width; -}; -} -#endif - -#ifdef __AC_NAMESPACE -namespace __AC_NAMESPACE { -#endif - -namespace ac_tracing { - -//============================================================================== -// TRACING SUPPORT FOR AC_INT -template -class vcd_ac_int_trace : public sc_core::vcd_trace -{ -public: - vcd_ac_int_trace(const ac_int &object_, const std::string& name_, const std::string& vcd_name_) : - vcd_trace(name_, vcd_name_), object(object_) - { -#if (SYSTEMC_VERSION >= 20171012) - vcd_var_type = sc_core::vcd_trace_file::VCD_WIRE; -#else - vcd_var_typ_name = "wire"; // SystemC does not expose vcd_types[] in sc_vcd_trace.h -#endif - bit_width = W; // bit_width defined in base class 'vcd_trace' - } - - virtual void write(FILE* f) { - // The function to_string(AC_BIN) returns a string with the zero-radix prefix (i.e. "0b"). - // Strip that prefix off because compose_line will add its own. - std::fprintf(f, "%s", compose_line(((ac_int)object).to_string(AC_BIN,true).substr(3)).c_str()); - old_value = object; - } - - virtual void set_width() { bit_width = W; } - - // Comparison function needs to be pure virtual too - virtual bool changed() { return !(object == old_value); } - - virtual ~vcd_ac_int_trace() {} -protected: - const ac_int &object; - ac_int old_value; -}; - -template -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_int &a, const std::string &name) -{ - using namespace sc_core; - if (tf) { - vcd_trace *t = (vcd_trace*) new vcd_ac_int_trace(a,name,((vcd_trace_file*)tf)->obtain_name()); - ((vcd_trace_file*)tf)->traces.push_back(t); - } -} -//============================================================================== - -#if !defined(__AC_FIXED_MTI_H) -// The ac_fixed.h shipped with ModelSim/QuestaSim has a stub for sc_trace() for ac_fixed so -// this code is not used. The stub should be removed in a future release of the simulator. 
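// Usage sketch for the ac_int sc_trace() overload defined above, assuming a
// conventional SystemC testbench; the trace-file and signal names are
// illustrative, and sc_create_vcd_trace_file()/sc_close_vcd_trace_file() are
// standard SystemC, not part of this header.
#include <systemc.h>
static void trace_demo(const ac_int<12,true> &sig) {
  sc_core::sc_trace_file *tf = sc_core::sc_create_vcd_trace_file("tb");
  sc_trace(tf, sig, "sig");   // registers a vcd_ac_int_trace<12,true> with tf
  // ... run the simulation; value changes of 'sig' are dumped to tb.vcd ...
  sc_core::sc_close_vcd_trace_file(tf);
}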
-#if defined(__AC_FIXED_H) && !defined(SC_TRACE_AC_FIXED) -#define SC_TRACE_AC_FIXED -//============================================================================== -// TRACING SUPPORT FOR AC_FIXED -template -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_fixed &a, const std::string &name) -{ - const int iv_N = (W+31+!S)/32; - typedef typename ac_private::template iv CommonBase_t; - sc_trace(tf, *(const ac_int*)(const CommonBase_t*) &a, name); -} -//============================================================================== -#endif -#endif - -#if defined(__AC_FLOAT_H) && !defined(SC_TRACE_AC_FLOAT) -#define SC_TRACE_AC_FLOAT -//============================================================================== -// TRACING SUPPORT FOR AC_FLOAT -template -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_float &a, const std::string &name) -{ - sc_trace(tf, a.m, name + ".m"); - sc_trace(tf, a.e, name + ".e"); -} -//============================================================================== -#endif - -#if defined(__AC_STD_FLOAT_H) && !defined(SC_TRACE_AC_STD_FLOAT) -#define SC_TRACE_AC_STD_FLOAT -//============================================================================== -// TRACING SUPPORT FOR AC_STD_FLOAT -template -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_std_float &a, const std::string &name) -{ - sc_trace(tf, a.data(), name + ".d"); -} -//============================================================================== -//============================================================================== -// TRACING SUPPORT FOR AC_IEEE_FLOAT -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) -{ - sc_trace(tf, a.data(), name + ".d"); -} -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) -{ - sc_trace(tf, *(const int*) &a.data(), name + ".d"); -} -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) -{ - sc_trace(tf, *(const long long*) &a.data(), name + ".d"); -} -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) -{ - sc_trace(tf, ((const long long*) &a.data())[0], name + ".d0"); - sc_trace(tf, ((const long long*) &a.data())[1], name + ".d1"); -} -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) -{ - sc_trace(tf, ((const long long*) &a.data())[0], name + ".d0"); - sc_trace(tf, ((const long long*) &a.data())[1], name + ".d1"); - sc_trace(tf, ((const long long*) &a.data())[2], name + ".d2"); - sc_trace(tf, ((const long long*) &a.data())[3], name + ".d3"); -} -// TRACING SUPPORT FOR AC::BFLOAT16 -inline void sc_trace(sc_core::sc_trace_file *tf, const ac::bfloat16 &a, const std::string &name) -{ - sc_trace(tf, a.data(), name + ".d"); -} -//============================================================================== -#endif - -#if defined(__AC_COMPLEX_H) && !defined(SC_TRACE_AC_COMPLEX) -#define SC_TRACE_AC_COMPLEX -//============================================================================== -// TRACING SUPPORT FOR AC_COMPLEX -template -inline void sc_trace(sc_core::sc_trace_file *tf, const ac_complex &a, const std::string &name) -{ - sc_trace(tf, a.real(), name + ".r"); - sc_trace(tf, a.imag(), name + ".i"); -} -#endif - -} // namespace ac_tracing - -#ifdef __AC_NAMESPACE -} -#endif - -namespace sc_core { -#ifdef __AC_NAMESPACE - using __AC_NAMESPACE::ac_tracing::sc_trace; -#else - using ac_tracing::sc_trace; -#endif 
-} - -#endif +/************************************************************************** + * * + * Algorithmic C (tm) Datatypes * + * * + * Software Version: 4.0 * + * * + * Release Date : Sat Jun 13 12:35:18 PDT 2020 * + * Release Type : Production Release * + * Release Build : 4.0.0 * + * * + * Copyright 2004-2019, Mentor Graphics Corporation, * + * * + * All Rights Reserved. * + * * + ************************************************************************** + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * + * implied. * + * See the License for the specific language governing permissions and * + * limitations under the License. * + ************************************************************************** + * * + * The most recent version of this package is available at github. * + * * + *************************************************************************/ + +#ifndef __AC_SC_H +#define __AC_SC_H + +#ifndef __cplusplus +#error C++ is required to include this header file +#endif + +#if !defined(IEEE_1666_SYSTEMC) && !defined(SYSTEMC_VERSION) && !defined(SC_API_VERSION_STRING) +#error SystemC header file needs to be included before the ac_sc is included +#endif + +#include + +#ifdef __AC_NAMESPACE +namespace __AC_NAMESPACE { +#endif + +// Explicit conversion functions from ac to sc and viceversa +template +ac_int to_ac(const sc_dt::sc_bigint &val){ + enum {N = (W+31)/32 }; + sc_dt::sc_bigint v = val; + ac_int r = 0; +#ifdef __SYNTHESIS__ +#pragma UNROLL y +#endif + for(int i = 0; i < N; i++) { + r.set_slc(i*32, ac_int<32,true>(v.to_int())); + v >>= 32; + } + return ac_int(r); +} + +template +ac_int to_ac(const sc_dt::sc_biguint &val){ + enum {N = (W+31)/32 }; + sc_dt::sc_biguint v = val; + ac_int r = 0; +#ifdef __SYNTHESIS__ +#pragma UNROLL y +#endif + for(int i = 0; i < N; i++) { + r.set_slc(i*32, ac_int<32,true>(v.to_int())); + v >>= 32; + } + return ac_int(r); +} + +template +sc_dt::sc_bigint to_sc(const ac_int &val) { + enum {N = (W+31)/32 }; + ac_int v = val; + sc_dt::sc_bigint r; +#ifdef __SYNTHESIS__ +#pragma UNROLL y +#endif + for(int i = N-1; i >= 0; i--) { + r <<= 32; + r.range(31, 0) = (v.template slc<32>(i*32)).to_int(); + } + return sc_dt::sc_bigint(r); +} + +template +sc_dt::sc_biguint to_sc(const ac_int &val) { + enum {N = (W+31)/32 }; + ac_int v = val; + sc_dt::sc_biguint r; +#ifdef __SYNTHESIS__ +#pragma UNROLL y +#endif + for(int i = N-1; i >= 0; i--) { + r <<= 32; + r.range(31, 0) = (v.template slc<32>(i*32)).to_int(); + } + return sc_dt::sc_biguint(r); +} + +#ifdef SC_INCLUDE_FX +template +ac_fixed to_ac(const sc_dt::sc_fixed &val){ + ac_fixed r = 0; + sc_dt::sc_fixed fv; + fv.range(W-1,0) = val.range(W-1,0); + sc_dt::sc_bigint v(fv); + r.set_slc(0, to_ac(v)); + return r; +} + +template +ac_fixed to_ac(const sc_dt::sc_ufixed &val){ + ac_fixed r = 0; + sc_dt::sc_ufixed fv; + fv.range(W-1,0) = val.range(W-1,0); + sc_dt::sc_biguint v(fv); + r.set_slc(0, to_ac(v)); + return r; +} + +template +sc_dt::sc_fixed to_sc(const ac_fixed &val) { + ac_int v = val.template slc(0); + sc_dt::sc_bigint i = to_sc(v); + sc_dt::sc_fixed f(i); + sc_dt::sc_fixed r; + 
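+    // same bits, new binary point: f holds the raw pattern as an integer-valued
+    // sc_fixed<W,W>, and the range copy re-anchors those bits under the target <W,I> format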
r.range(W-1,0) = f.range(W-1,0); + return r; +} + +template +sc_dt::sc_ufixed to_sc(const ac_fixed &val) { + ac_int v = val.template slc(0); + sc_dt::sc_biguint i = to_sc(v); + sc_dt::sc_ufixed f(i); + sc_dt::sc_ufixed r; + r.range(W-1,0) = f.range(W-1,0); + return r; +} +#endif + +// Utility global functions for initialization + +template +inline sc_dt::sc_int value(sc_dt::sc_int) { + sc_dt::sc_int r; + if(V == AC_VAL_DC) { + int t; + r = t; + } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + r = 0; + if(V == AC_VAL_MIN) + r[W-1] = 1; + else if(V == AC_VAL_QUANTUM) + r[0] = 1; + } else if(AC_VAL_MAX) { + r = -1; + r[W-1] = 0; + } + return r; +} + +template +inline sc_dt::sc_uint value(sc_dt::sc_uint) { + sc_dt::sc_uint r; + if(V == AC_VAL_DC) { + int t; + r = t; + } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + r = 0; + if(V == AC_VAL_QUANTUM) + r[0] = 1; + } else if(AC_VAL_MAX) + r = -1; + return r; +} + +template +inline sc_dt::sc_bigint value(sc_dt::sc_bigint) { + sc_dt::sc_bigint r; + if(V == AC_VAL_DC) { + int t; + r = t; + } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + r = 0; + if(V == AC_VAL_MIN) + r[W-1] = 1; + else if(V == AC_VAL_QUANTUM) + r[0] = 1; + } else if(AC_VAL_MAX) { + r = -1; + r[W-1] = 0; + } + return r; +} + +template +inline sc_dt::sc_biguint value(sc_dt::sc_biguint) { + sc_dt::sc_biguint r; + if(V == AC_VAL_DC) { + int t; + r = t; + } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + r = 0; + if(V == AC_VAL_QUANTUM) + r[0] = 1; + } else if(AC_VAL_MAX) + r = -1; + return r; +} + +#ifdef SC_INCLUDE_FX +template +inline sc_dt::sc_fixed value(sc_dt::sc_fixed) { + sc_dt::sc_fixed r; + if(V == AC_VAL_DC) { + int t; + r = t; + } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + r = 0; + if(V == AC_VAL_MIN) + r[W-1] = 1; + else if(V == AC_VAL_QUANTUM) + r[0] = 1; + } else if(AC_VAL_MAX) { + r = ~ (sc_dt::sc_fixed) 0; + r[W-1] = 0; + } + return r; +} + +template +inline sc_dt::sc_ufixed value(sc_dt::sc_ufixed) { + sc_dt::sc_ufixed r; + if(V == AC_VAL_DC) { + int t; + r = t; + } else if(V == AC_VAL_0 || V == AC_VAL_MIN || V == AC_VAL_QUANTUM) { + r = 0; + if(V == AC_VAL_QUANTUM) + r[0] = 1; + } else if(AC_VAL_MAX) + r = ~ (sc_dt::sc_ufixed) 0; + return r; +} +#endif + + +namespace ac { +// PUBLIC FUNCTIONS +// function to initialize (or uninitialize) arrays + template + inline bool init_array(sc_dt::sc_int *a, int n) { + sc_dt::sc_int t = value(*a); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } + template + inline bool init_array(sc_dt::sc_uint *a, int n) { + sc_dt::sc_uint t = value(*a); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } + template + inline bool init_array(sc_dt::sc_bigint *a, int n) { + sc_dt::sc_bigint t = value(*a); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } + template + inline bool init_array(sc_dt::sc_biguint *a, int n) { + sc_dt::sc_biguint t = value(*a); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } +#ifdef SC_INCLUDE_FX + template + inline bool init_array(sc_dt::sc_fixed *a, int n) { + sc_dt::sc_fixed t = value(*a); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } + template + inline bool init_array(sc_dt::sc_ufixed *a, int n) { + sc_dt::sc_ufixed t = value(*a); + for(int i=0; i < n; i++) + a[i] = t; + return true; + } +#endif +} + +#ifdef __AC_NAMESPACE +} +#endif + + +// TRACE FUNCTIONS + +// SystemC Versions - 2.2.0 20070314 +// 2.3.0 20120701 +// 2.3.1 20140417 +// 2.3.2 20171012 + +#if 
!defined(NCSC) +#if (SYSTEMC_VERSION >= 20140417) && !defined(SC_TRACE_FILE_BASE_H_INCLUDED_) +namespace sc_core { +class vcd_trace; +class sc_trace_file_base + : public sc_trace_file +{ +public: + enum vcd_enum {VCD_WIRE=0, VCD_REAL, VCD_EVENT, VCD_TIME, VCD_LAST}; + virtual void do_initialize() = 0; + FILE* fp; +#if (SYSTEMC_VERSION >= 20171012) + sc_time::value_type trace_unit_fs, kernel_unit_fs; +#else + double timescale_unit; +#endif + bool timescale_set_by_user; + std::string filename_; + bool initialized_; + bool trace_delta_cycles_; + virtual ~sc_trace_file_base(); +}; +class vcd_trace_file + : public sc_trace_file_base +{ +public: + ~vcd_trace_file(); + std::string obtain_name(); + virtual void do_initialize(); + unsigned vcd_name_index; +#if (SYSTEMC_VERSION >= 20171012) + sc_time::value_type previous_time_units_low, previous_time_units_high; +#else + unsigned previous_time_units_low, previous_time_units_high; +#endif + std::vector traces; +}; +} +#endif + +namespace sc_core { +//============================================================================== +// The following block of code is copied from the file sc_vcd_trace.cpp in the +// SystemC distribution. This code should have been placed in the file +// sc_vcd_trace.h to allow proper C++ derivation. +class vcd_trace +{ +public: + vcd_trace(const std::string& name_, const std::string& vcd_name_); + virtual void write(FILE* f) = 0; + virtual void set_width(); + virtual bool changed() = 0; +#if (SYSTEMC_VERSION >= 20171012) + virtual void print_variable_declaration_line(FILE* f, const char* scoped_name); +#else + virtual void print_variable_declaration_line(FILE* f); +#endif + void compose_data_line(char* rawdata, char* compdata); + +#if (SYSTEMC_VERSION >= 20140417) + std::string compose_line(const std::string& data); +#else + std::string compose_line(const std::string data); +#endif + virtual ~vcd_trace(); + const std::string name; + const std::string vcd_name; +#if (SYSTEMC_VERSION >= 20171012) + vcd_trace_file::vcd_enum vcd_var_type; +#else + const char* vcd_var_typ_name; +#endif + int bit_width; +}; +} +#endif + +#ifdef __AC_NAMESPACE +namespace __AC_NAMESPACE { +#endif + +namespace ac_tracing { + +//============================================================================== +// TRACING SUPPORT FOR AC_INT +template +class vcd_ac_int_trace : public sc_core::vcd_trace +{ +public: + vcd_ac_int_trace(const ac_int &object_, const std::string& name_, const std::string& vcd_name_) : + vcd_trace(name_, vcd_name_), object(object_) + { +#if (SYSTEMC_VERSION >= 20171012) + vcd_var_type = sc_core::vcd_trace_file::VCD_WIRE; +#else + vcd_var_typ_name = "wire"; // SystemC does not expose vcd_types[] in sc_vcd_trace.h +#endif + bit_width = W; // bit_width defined in base class 'vcd_trace' + } + + virtual void write(FILE* f) { + // The function to_string(AC_BIN) returns a string with the zero-radix prefix (i.e. "0b"). + // Strip that prefix off because compose_line will add its own. 
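+    // compose_line() wraps those bits into a VCD value-change record ("b<bits> <identifier code>")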
+ std::fprintf(f, "%s", compose_line(((ac_int)object).to_string(AC_BIN,true).substr(3)).c_str()); + old_value = object; + } + + virtual void set_width() { bit_width = W; } + + // Comparison function needs to be pure virtual too + virtual bool changed() { return !(object == old_value); } + + virtual ~vcd_ac_int_trace() {} +protected: + const ac_int &object; + ac_int old_value; +}; + +template +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_int &a, const std::string &name) +{ + using namespace sc_core; + if (tf) { + vcd_trace *t = (vcd_trace*) new vcd_ac_int_trace(a,name,((vcd_trace_file*)tf)->obtain_name()); + ((vcd_trace_file*)tf)->traces.push_back(t); + } +} +//============================================================================== + +#if !defined(__AC_FIXED_MTI_H) +// The ac_fixed.h shipped with ModelSim/QuestaSim has a stub for sc_trace() for ac_fixed so +// this code is not used. The stub should be removed in a future release of the simulator. +#if defined(__AC_FIXED_H) && !defined(SC_TRACE_AC_FIXED) +#define SC_TRACE_AC_FIXED +//============================================================================== +// TRACING SUPPORT FOR AC_FIXED +template +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_fixed &a, const std::string &name) +{ + const int iv_N = (W+31+!S)/32; + typedef typename ac_private::template iv CommonBase_t; + sc_trace(tf, *(const ac_int*)(const CommonBase_t*) &a, name); +} +//============================================================================== +#endif +#endif + +#if defined(__AC_FLOAT_H) && !defined(SC_TRACE_AC_FLOAT) +#define SC_TRACE_AC_FLOAT +//============================================================================== +// TRACING SUPPORT FOR AC_FLOAT +template +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_float &a, const std::string &name) +{ + sc_trace(tf, a.m, name + ".m"); + sc_trace(tf, a.e, name + ".e"); +} +//============================================================================== +#endif + +#if defined(__AC_STD_FLOAT_H) && !defined(SC_TRACE_AC_STD_FLOAT) +#define SC_TRACE_AC_STD_FLOAT +//============================================================================== +// TRACING SUPPORT FOR AC_STD_FLOAT +template +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_std_float &a, const std::string &name) +{ + sc_trace(tf, a.data(), name + ".d"); +} +//============================================================================== +//============================================================================== +// TRACING SUPPORT FOR AC_IEEE_FLOAT +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) +{ + sc_trace(tf, a.data(), name + ".d"); +} +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) +{ + sc_trace(tf, *(const int*) &a.data(), name + ".d"); +} +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) +{ + sc_trace(tf, *(const long long*) &a.data(), name + ".d"); +} +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) +{ + sc_trace(tf, ((const long long*) &a.data())[0], name + ".d0"); + sc_trace(tf, ((const long long*) &a.data())[1], name + ".d1"); +} +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_ieee_float &a, const std::string &name) +{ + sc_trace(tf, ((const long long*) &a.data())[0], name + ".d0"); + sc_trace(tf, ((const long long*) &a.data())[1], name + ".d1"); + sc_trace(tf, ((const long long*) 
&a.data())[2], name + ".d2"); + sc_trace(tf, ((const long long*) &a.data())[3], name + ".d3"); +} +// TRACING SUPPORT FOR AC::BFLOAT16 +inline void sc_trace(sc_core::sc_trace_file *tf, const ac::bfloat16 &a, const std::string &name) +{ + sc_trace(tf, a.data(), name + ".d"); +} +//============================================================================== +#endif + +#if defined(__AC_COMPLEX_H) && !defined(SC_TRACE_AC_COMPLEX) +#define SC_TRACE_AC_COMPLEX +//============================================================================== +// TRACING SUPPORT FOR AC_COMPLEX +template +inline void sc_trace(sc_core::sc_trace_file *tf, const ac_complex &a, const std::string &name) +{ + sc_trace(tf, a.real(), name + ".r"); + sc_trace(tf, a.imag(), name + ".i"); +} +#endif + +} // namespace ac_tracing + +#ifdef __AC_NAMESPACE +} +#endif + +namespace sc_core { +#ifdef __AC_NAMESPACE + using __AC_NAMESPACE::ac_tracing::sc_trace; +#else + using ac_tracing::sc_trace; +#endif +} + +#endif diff --git a/hls4ml/templates/quartus/ac_types/ac_std_float.h b/hls4ml/templates/quartus/ac_types/ac_std_float.h index 25ce8afc38..3b335b971b 100644 --- a/hls4ml/templates/quartus/ac_types/ac_std_float.h +++ b/hls4ml/templates/quartus/ac_types/ac_std_float.h @@ -1,2318 +1,2318 @@ -/************************************************************************** - * * - * Algorithmic C (tm) Datatypes * - * * - * Software Version: 4.0 * - * * - * Release Date : Sat Jun 13 12:35:18 PDT 2020 * - * Release Type : Production Release * - * Release Build : 4.0.0 * - * * - * Copyright 2018-2020, Mentor Graphics Corporation, * - * * - * All Rights Reserved. * - * * - ************************************************************************** - * Licensed under the Apache License, Version 2.0 (the "License"); * - * you may not use this file except in compliance with the License. * - * You may obtain a copy of the License at * - * * - * http://www.apache.org/licenses/LICENSE-2.0 * - * * - * Unless required by applicable law or agreed to in writing, software * - * distributed under the License is distributed on an "AS IS" BASIS, * - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * - * implied. * - * See the License for the specific language governing permissions and * - * limitations under the License. * - ************************************************************************** - * * - * The most recent version of this package is available at github. * - * * - *************************************************************************/ - -/* Source: ac_std_float.h - * Description: class for floating point operation handling in C++ - * Author: Andres Takach, Ph.D. 
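// A concrete instance of the sign/exponent/mantissa layout sketched in the
// Overview below, using only relations this header defines (mant_bits = W-E-1,
// exp_bias = (1<<(E-1))-1, sign stored in d[W-1]); the helper function itself
// is an illustrative sketch.
static void f32_fields(const ac_std_float<32,8> &x, bool &sign,
                       ac_int<8,false> &biased_e, ac_int<23,false> &mant) {
  sign     = x.data()[31];                  // MSB holds the sign (sign-magnitude)
  biased_e = x.data().template slc<8>(23);  // exponent field, bias 127; 0 => zero/subnormal, 255 => inf/NaN
  mant     = x.data().template slc<23>(0);  // fraction bits; leading 1 implied when biased_e != 0
}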
- -Overview: this header defines three classes - - ac_ieee_float - Meant to store floats in IEEE standard binary format - Format indicate width: - binary16: (half float) uses short - binary32: (float) uses int - binary64: (double) uses array of long long with one element - binary128: (long double in some platforms) uses array of long long with two elements - binary256: uses array of long long with four elements - - ac::bfloat16 - Implements Google's tensorflow::bfloat16 - Stores data as "short" - - ac_std_float - Superset of ac_ieee_float in that any bit width and exponent width is - allowed - This is used by ac_ieee_float and ac::bfloat16 - - Uses an ac_int that holds the bit pattern for a standard (IEEE) style binary - float: - 1) sign-magnitude representation, sign is MSB - 2) mantissa (significand) with implied bit for normal numbers - 3) E is not restricted to IEEE widths, another class ac_ieee_float does that - - Provides easy way to conver to/from the closest covering ac_float: - Constructor from ac_float - Most two negative exponents of ac_float are not representable: shift - significand futher to the right (for now no attempt to round) - Most negative mantissa of ac_float (in two's complement) when converted - to sign-magnitute requires a right shift (add +1 to exponent) - If exponent is already max, two alternatives: - - "saturate" (store most negative number) - - Store as -Inf (currently this option not available) - Exponent is offset - Mantissa implied bit is removed from normal numbers - - Explicit convertion to_ac_float - Ignores exceptions (Inf, NaN) - Does inverse as above to obtain ac_float -*/ - -#ifndef __AC_STD_FLOAT_H -#define __AC_STD_FLOAT_H -#include -#include -// Inclusion of cmath undefs all macros such as signbit etc that some parsers may define for C -#include - -#ifdef __SYNTHESIS__ -#ifdef AC_IEEE_FLOAT_USE_BUILTIN -#undef AC_IEEE_FLOAT_USE_BUILTIN -#endif -#endif - -#ifdef __AC_NAMESPACE -namespace __AC_NAMESPACE { -#endif - -// For now make data members public since SCVerify needs it -//#ifdef __AC_MAKE_PRIVATE_DATA_PUBLIC -#if 1 -#define __AC_DATA_PRIVATE public: -#else -#define __AC_DATA_PRIVATE private: -#endif - -namespace ac_private { - template - struct check_rounding { enum {Only_symmetrical_roundings_or_truncations_supported}; }; - template<> struct check_rounding {}; - - template - void check_supported() { - // only symmetrical roundings supported - const bool supported = Q==AC_RND_CONV || Q==AC_TRN_ZERO || Q==AC_RND_INF || Q == AC_RND_CONV_ODD; -#if __cplusplus > 199711L - static_assert(supported, "Only symmetrical roundings/truncations supported"); -#else - (void) check_rounding::Only_symmetrical_roundings_or_truncations_supported; -#endif - } - - template - struct check_rounding2 { enum {Only_round_to_even_supported_when_using_BUILTIN}; }; - template<> struct check_rounding2 {}; - - template - void check_supported2() { -#ifdef AC_IEEE_FLOAT_USE_BUILTIN - const bool supported = Q==AC_RND_CONV; -#if __cplusplus > 199711L - static_assert(supported, "Only round to even supported"); -#else - (void) check_rounding2::Only_round_to_even_supported_when_using_BUILTIN; -#endif -#endif - } - - template - struct rt_closed_T { - }; - template - struct rt_closed_T { - typedef T type; - }; -} - -namespace ac { - #pragma hls_design ccore - #pragma hls_ccore_type sequential - template - void fx_div(ac_int op1, ac_int op2, ac_int "ient, bool &exact) { - ac_int R = op1; - bool R_neg = false; - ac_int D = op2; - ac_int neg_D = -D; - ac_int Q = 0; - for(int i=0; i < 
W+2; i++) { - // take MSB of N, shift it in from right to R - R += ( R_neg ? (ac_int) D : neg_D ); - Q = (Q << 1) | ((R >= 0) & 1); - R_neg = R[W]; - R <<= 1; - } - quotient = Q; - exact = !R | R_neg & (R >> 1) == neg_D; - } - - template - void fx_div_sim(ac_int op1, ac_int op2, ac_int "ient, bool &exact) { - // need to compute extra rnd bit, - // +2 because we may need to shift left by 1 (mant divisor > mant dividend) - ac_int<2*W+1,false> op1_mi = op1; - op1_mi <<= W+1; - // +1 bit to compute rnd bit - quotient = (op1_mi / op2); - exact = !(op1_mi % op2); - } - - #pragma hls_design ccore - #pragma hls_ccore_type sequential - template - bool fx_sqrt( ac_int x, ac_int &sqrt) { - // x is ac_fixed, sqrt is ac_fixed - const bool W_odd = W&1; - const int ZW = W + W_odd; // make it even - ac_int z = x; - z <<= W_odd; - // masks used only to hint synthesis on precision - ac_int mask_d = 0; - ac_int d = 0; - ac_int r = 0; - unsigned int z_shift = ZW-2; - for(int i = WR-1; i >= 0; i--) { - r <<= 1; - mask_d = (mask_d << 2) | 0x3; - d = (mask_d & (d << 2)) | ((z >> z_shift) & 0x3 ); - ac_int t = d - (( ((ac_int)r) << 1) | 0x1); - if( !t[WR+1] ) { // since t is unsigned, look at MSB - r |= 0x1; - d = mask_d & t; - } - z <<= 2; - } - - bool rem = (d != 0) || ((z >> 2*W) != 0); - sqrt = r; - return rem; - } -} - -#ifndef AC_STD_FLOAT_FX_DIV_OVERRIDE -#ifdef __SYNTHESIS__ -#define AC_STD_FLOAT_FX_DIV_OVERRIDE ac::fx_div -#else -#define AC_STD_FLOAT_FX_DIV_OVERRIDE ac::fx_div_sim -#endif -#endif - -template class ac_std_float; - -#ifdef __AC_NAMESPACE -} -#endif - -#ifdef AC_STD_FLOAT_OVERRIDE_NAMESPACE -#define AC_STD_FLOAT_OVERRIDE_NS ::AC_STD_FLOAT_OVERRIDE_NAMESPACE:: -namespace AC_STD_FLOAT_OVERRIDE_NAMESPACE { -#ifdef __AC_NAMESPACE - using __AC_NAMESPACE::ac_q_mode; - using __AC_NAMESPACE::ac_std_float; -#endif -#else -#define AC_STD_FLOAT_OVERRIDE_NS -#endif - -#ifdef AC_STD_FLOAT_ADD_OVERRIDE -template -ac_std_float AC_STD_FLOAT_ADD_OVERRIDE(const ac_std_float &op, const ac_std_float &op2); -#endif - -#ifdef AC_STD_FLOAT_MULT_OVERRIDE -template -ac_std_float AC_STD_FLOAT_MULT_OVERRIDE(const ac_std_float &op, const ac_std_float &op2); -#endif - -#ifdef AC_STD_FLOAT_DIV_OVERRIDE -template -ac_std_float AC_STD_FLOAT_DIV_OVERRIDE(const ac_std_float &op, const ac_std_float &op2); -#endif - -#ifdef AC_STD_FLOAT_FMA_OVERRIDE -template -ac_std_float AC_STD_FLOAT_FMA_OVERRIDE(const ac_std_float &op, const ac_std_float &op2, const ac_std_float &op3); -#endif - -#ifdef AC_STD_FLOAT_SQRT_OVERRIDE -template -ac_std_float AC_STD_FLOAT_SQRT_OVERRIDE(const ac_std_float &op); -#endif - -#ifdef AC_STD_FLOAT_OVERRIDE_NAMESPACE -} -#endif - -#ifdef __AC_NAMESPACE -namespace __AC_NAMESPACE { -#endif - -namespace ac { - inline void copy_bits(float a, float *b) { *b = a; } - inline void copy_bits(double a, double *b) { *b = a; } - - inline void copy_bits(short a, short *b) { *b = a; } - inline void copy_bits(const ac_int<16,true> &a, short *b) { *b = (short) a.to_int(); } - inline void copy_bits(short a, ac_int<16,true> *b) { *b = ac_int<16,true>(a); } - inline void copy_bits(int a, int *b) { *b = a; } - inline void copy_bits(const ac_int<32,true> &a, int *b) { *b = a.to_int(); } - inline void copy_bits(int a, ac_int<32,true> *b) { *b = ac_int<32,true>(a); } - inline void copy_bits(long long a, long long *b) { *b = a; } - inline void copy_bits(const ac_int<64,true> &a, long long *b) { *b = a.to_int64(); } - inline void copy_bits(long long a, ac_int<64,true> *b) { *b = ac_int<64,true>(a); } - inline void 
copy_bits(const long long a[2], long long (*b)[2]) { - (*b)[0] = a[0]; - (*b)[1] = a[1]; - } - inline void copy_bits(const ac_int<128,true> &a, long long (*b)[2]) { - (*b)[0] = a.to_int64(); - (*b)[1] = a.slc<64>(64).to_int64(); - } - inline void copy_bits(const long long a[2], ac_int<128,true> *b) { - *b = 0; - b->set_slc(0,ac_int<64,true>(a[0])); - b->set_slc(64,ac_int<64,true>(a[1])); - } - inline void copy_bits(const long long a[4], long long (*b)[4]) { - (*b)[0] = a[0]; - (*b)[1] = a[1]; - (*b)[2] = a[2]; - (*b)[3] = a[3]; - } - inline void copy_bits(const ac_int<256,true> &a, long long (*b)[4]) { - (*b)[0] = a.to_int64(); - (*b)[1] = a.slc<64>(64).to_int64(); - (*b)[2] = a.slc<64>(128).to_int64(); - (*b)[3] = a.slc<64>(192).to_int64(); - } - inline void copy_bits(const long long a[4], ac_int<256,true> *b) { - *b = 0; - b->set_slc(0,ac_int<64,true>(a[0])); - b->set_slc(64,ac_int<64,true>(a[1])); - b->set_slc(128,ac_int<64,true>(a[2])); - b->set_slc(192,ac_int<64,true>(a[3])); - } - inline void copy_bits(float f, int *x); - inline void copy_bits(double f, long long *x); - inline void copy_bits(int x, float *f); - inline void copy_bits(long long x, double *f); - - inline void copy_bits(float f, ac_int<32,true> *x) { - int x_i; - copy_bits(f, &x_i); - *x = x_i; - } - inline void copy_bits(double f, ac_int<64,true> *x) { - long long x_i; - copy_bits(f, &x_i); - *x = x_i; - } - inline void copy_bits(const ac_int<32,true> &x, float *f) { copy_bits(x.to_int(), f); } - inline void copy_bits(const ac_int<64,true> &x, double *f) { copy_bits(x.to_int64(), f); } -} - -enum ac_ieee_float_format { binary16, binary32, binary64, binary128, binary256}; - -// Forward declarations for ac_ieee_float and bfloat16 -template -class ac_ieee_float; -namespace ac { - class bfloat16; -} - -template -class ac_std_float { -__AC_DATA_PRIVATE - ac_int d; -public: - static const int width = W; - static const int e_width = E; - static const int mant_bits = W - E - 1; - static const int exp_bias = (1 << (E-1)) - 1; - static const int min_exp = -exp_bias + 1; - static const int max_exp = exp_bias; - static const int mu_bits = mant_bits + 1; -private: - typedef ac_int mu_t; - typedef ac_int mu1_t; - typedef ac_int mu2_t; - typedef ac_int m_t; // mantissa in two's complement representation -public: - typedef ac_int e_t; - typedef ac_float ac_float_t; - static ac_std_float nan() { - ac_std_float r; - r.d = 0; - r.d.set_slc(mant_bits-1, ac_int(-1)); - return r; - } - static ac_std_float inf() { - ac_std_float r; - r.d = 0; - r.d.set_slc(mant_bits, ac_int(-1)); - return r; - } - static ac_std_float denorm_min() { // smallest positive non zero value (subnorm if supported) - ac_std_float r; - r.d = 1; - return r; - } - static ac_std_float min() { // smallest NORMAL positive non zero value - ac_std_float r; - r.d = 0; - r.d[width-1-e_width] = true; - return r; - } - static ac_std_float max() { // largest pos finite value - ac_std_float r; - r.d = -1; - r.d[width-1] = false; - r.d[width-1-e_width] = false; - return r; - } - static ac_std_float epsilon() { - ac_int exp = -mant_bits + exp_bias; - ac_std_float r; - r.d = 0; - r.d.set_slc(mant_bits, exp); - return r; - } - ac_std_float() {} - ac_std_float(const ac_std_float &f) : d(f.d) {} - template - ac_std_float convert() const { - ac_private::check_supported(); - ac_std_float r; - if(W <= WR) { - r.d = 0; - r.d.set_slc(WR-W, d); - } else { - typedef ac_std_float r_t; - const int r_mant_bits = r_t::mant_bits; - const int r_mu_bits = r_t::mu_bits; - e_t f_e = d.template 
slc(mant_bits); - bool f_normal = !!f_e; - mu_t mu = d; - mu[r_mant_bits] = f_normal; - ac_fixed r_rnd = mu; - bool rnd_ovf = r_rnd[r_mu_bits]; - ac_int m_r = r_rnd.template slc(0); - e_t e_r = f_e + rnd_ovf; - r.d = m_r; - r.d.set_slc(r_mant_bits, e_r); - r.d[WR-1] = d[W-1]; - } - return r; - } - - template - ac_fixed convert_to_ac_fixed(bool map_inf=false) const { - static const bool rnd = QFX!=AC_TRN && QFX!=AC_TRN_ZERO; - static const bool need_rnd_bit = QFX != AC_TRN; - static const bool need_rem_bits = need_rnd_bit && QFX != AC_RND; - static const bool need_ovf = OFX != AC_WRAP; - static const int t_width = AC_MAX(mu_bits+1, WFX+!SFX) + need_rnd_bit + need_ovf; - - bool f_sign, f_normal, f_zero, f_inf, f_nan; - mu_t f_mu; - e_t f_e; - extract(f_mu, f_e, f_sign, f_normal, f_zero, f_inf, f_nan); - if(map_inf) { - ac_fixed rv; - if(f_sign) - rv.template set_val(); - else - rv.template set_val(); - return rv; - } - AC_ASSERT(!f_inf && !f_nan, "Expects finite float (not Nan or Inf)"); - m_t f_m = f_sign ? m_t(-f_mu) : m_t(f_mu); - typedef ac_int t_t; - typedef ac_int t2_t; - t_t t = f_m; - t <<= need_rnd_bit; - static const int lsb_src = -mant_bits; - static const int lsb_trg = IFX-WFX; - int rshift = lsb_trg - lsb_src - (int)f_e; - - bool sticky_bit_rnd = false; - bool rshift_neg = rshift < 0; - if(need_rem_bits) { - t_t shifted_out_bits = t; - typedef ac_int< ac::template nbits< AC_MAX(lsb_trg - lsb_src - min_exp,1) >::val, false> shift_ut; - shifted_out_bits &= ~(t_t(0).bit_complement() << (shift_ut) rshift); - sticky_bit_rnd = !!shifted_out_bits & !rshift_neg; - } - bool ovf = false; - if(need_ovf) { - t_t shifted_out_bits = t < 0 ? t_t(~t) : t; - // shift right by -rshift + 1 - // +1 is OK since added extra MSB - typedef ac_int< ac::template nbits< AC_MAX(-(lsb_trg - lsb_src - max_exp + 1),1) >::val, false> shift_ut; - shifted_out_bits &= ~((t_t(0).bit_complement() >> 2) >> (shift_ut) ~rshift); - ovf = !!shifted_out_bits & rshift_neg; - } - - t >>= rshift; - - t[t_width-1] = t[t_width-1] ^ (ovf & (t[t_width-1] ^ f_sign)); - t[t_width-2] = t[t_width-2] ^ (ovf & (t[t_width-2] ^ !f_sign)); - t2_t t2 = t; - if(need_rem_bits) { - t2 <<= 1; - t2[0] = t2[0] | sticky_bit_rnd; - } - - ac_fixed ri = t2; - ac_fixed r = 0; - r.set_slc(0,ri.template slc(0)); - return r; - } - - template - explicit ac_std_float(const ac_std_float &f) { - *this = f.template convert(); - } - template - ac_std_float convert() const { - ac_private::check_supported(); - typedef ac_std_float r_t; - typedef typename r_t::e_t r_e_t; - int const r_mu_bits = r_t::mu_bits; - int const r_mant_bits = r_t::mant_bits; - int const r_min_exp = r_t::min_exp; - int const r_max_exp = r_t::max_exp; - int const r_exp_bias = r_t::exp_bias; - bool f_sign, f_normal, f_zero, f_inf, f_nan; - mu_t f_mu; - e_t f_e; - r_t r; - extract(f_mu, f_e, f_sign, f_normal, f_zero, f_inf, f_nan); - int exp = f_e; - ac_fixed r_rnd; - if(ER >= E) { - if(ER > E && !f_normal) { - int ls = f_mu.leading_sign(); - int max_shift_left = f_e - r_min_exp + 1; - bool shift_exponent_limited = ls >= max_shift_left; - int shift_l = shift_exponent_limited ? 
max_shift_left : ls; - f_mu <<= shift_l; - exp -= shift_l; - } - r_rnd = f_mu; - } else { - int shift_r = r_min_exp - f_e; - typedef ac_fixed t_t; - t_t r_t = f_mu; - bool sticky_bit = !!(f_mu & ~((~mu_t(0)) << mant_bits-r_mant_bits-1)); - if(shift_r > 0) { - t_t shifted_out_bits = r_t; - shifted_out_bits &= ~((~t_t(0)) << shift_r); - sticky_bit |= !!shifted_out_bits; - r_t >>= shift_r; - exp += shift_r; - } - ac_fixed r_t2 = r_t; - r_t2[0] = sticky_bit; - r_rnd = r_t2; - } - bool rnd_ovf = r_rnd[r_mu_bits]; - ac_int r_m = r_rnd.template slc(0); - bool r_normal = r_rnd[r_mant_bits] | rnd_ovf; - exp += rnd_ovf; - bool exception = f_inf | f_nan | (exp > r_max_exp); - r_e_t r_e = exception ? -1 : (f_zero | !r_normal) ? 0 : exp + r_exp_bias; - if(exception) { - r_m = 0; - r_m[r_mant_bits-1] = f_nan; - } - r.d = r_m; - r.d.set_slc(r_mant_bits, r_e); - r.d[WR-1] = d[W-1]; - return r; - } - template - explicit ac_std_float(const ac_std_float &f) { - *this = f.template convert(); - } - template - explicit ac_std_float(const ac_ieee_float &f); - - explicit ac_std_float(const ac::bfloat16 &f); - - template - explicit ac_std_float(const ac_float &f) { - bool sign = f.mantissa() < 0; - m_t m_s = f.m.template slc(0); - mu1_t m_u = sign ? (mu1_t) -m_s : (mu1_t) m_s; - bool most_neg_m = m_u[mu_bits]; - bool is_max_exp = f.exp() == (1 << (E-1)) - 1; - ac_int e = f.exp() + exp_bias + (most_neg_m & !is_max_exp); - mu_t m = m_u | ac_int<1,true>(most_neg_m & is_max_exp); - m[mant_bits] = m[mant_bits] | most_neg_m; - bool exp_dont_map = !e | e==-1; - m >>= !e; - m >>= 2*(e==-1); - // exp_dont_map guarantees subnornal => e = 0 - e &= ac_int<1,true>(!exp_dont_map & !!m); - d = m.template slc(0); - d.set_slc(mant_bits, e); - d[W-1] = sign; - } - template - void assign_from(const ac_fixed &fx) { - ac_private::check_supported(); - bool sign = fx < 0.0; - ac_fixed x = 0; - x.set_slc(0,fx.template slc(0)); - bool all_sign; - int ls = x.leading_sign(all_sign); - int max_shift_left = IFX-1 - min_exp + 1; - bool shift_exponent_limited = ls >= max_shift_left; - int shift_l = shift_exponent_limited ? max_shift_left : ls; - ac_fixed x_u = sign ? (ac_fixed) -x : (ac_fixed) x; - x_u <<= shift_l; - int exp = IFX-1; - exp -= shift_l; - ac_fixed m_rnd = x_u; - mu1_t m_u = 0; m_u.set_slc(0, m_rnd.template slc(0)); - bool shiftr1 = m_u[mu_bits]; // msb - bool r_normal = m_u[mu_bits] | m_u[mu_bits-1]; - m_u >>= shiftr1; - exp += shiftr1; - bool fx_zero = all_sign & !sign; - bool r_inf = (exp > max_exp) & !fx_zero; - if(Q==AC_TRN_ZERO) { - exp = r_inf ? max_exp + exp_bias : exp; - m_u |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf - r_inf = false; - } - e_t e = r_inf ? -1 : (!r_normal) ? 
0 : exp + exp_bias; - m_u &= ac_int<1,true>(!r_inf); - e &= ac_int<1,true>(r_normal); - d = m_u.template slc(0); - d.set_slc(mant_bits, e); - d[W-1] = sign; - } - template - void assign_from(const ac_int &x) { - this->template assign_from(ac_fixed(x)); - } - template - explicit ac_std_float(const ac_fixed &fx) { - assign_from(fx); - } - explicit ac_std_float(float f) { - const int w_bits = sizeof(f)*8; - const int m_bits = std::numeric_limits::digits; - const int e_bits = w_bits - m_bits; - ac_int t_i; - ac::copy_bits(f, &t_i); - ac_std_float t; - t.set_data(t_i); - *this = ac_std_float(t); - } - explicit ac_std_float(double f) { - const int w_bits = sizeof(f)*8; - const int m_bits = std::numeric_limits::digits; - const int e_bits = w_bits - m_bits; - ac_int t_i; - ac::copy_bits(f, &t_i); - ac_std_float t; - t.set_data(t_i); - *this = ac_std_float(t); - } - explicit ac_std_float(int x) { - *this = ac_std_float(ac_fixed<32,32,true>(x)); - } - explicit ac_std_float(long long x) { - *this = ac_std_float(ac_fixed<64,64,true>(x)); - } - const ac_int &data() const { return d; } - void set_data(const ac_int &data, bool assert_on_nan=false, bool assert_on_inf=false) { - d = data; - if(assert_on_nan) - AC_ASSERT(!isnan(), "Float is NaN"); - if(assert_on_inf) - AC_ASSERT(!isinf(), "Float is Inf"); - } - int fpclassify() const { - ac_int e = d.template slc(mant_bits); - if(e) { - if(e == -1) - return !(ac_int)d ? FP_INFINITE : FP_NAN; - else - return FP_NORMAL; - } - else - return !(ac_int)d ? FP_ZERO : FP_SUBNORMAL; - } - bool isfinite() const { - ac_int e = d.template slc(mant_bits); - return e != -1; - } - bool isnormal() const { - ac_int e = d.template slc(mant_bits); - return (e || !(ac_int)d)&& e != -1; - } - bool isnan() const { - if(isfinite()) - return false; - ac_int m = d; - return !!m; - } - bool isinf() const { - if(isfinite()) - return false; - ac_int m = d; - return !m; - } - const ac_float to_ac_float() const { - ac_int e = d.template slc(mant_bits); - bool normal = !!e; - bool sign = d[W-1]; - bool inf = e == -1; - ac_int m = d; - ac_int m1 = m; - m1[mant_bits] = normal; - ac_int m_s = sign ? -m1 : (ac_int) m1; - ac_fixed fx = 0; - fx.set_slc(0, m_s); - e -= exp_bias; - // if number is subnormal, e will be MIN_EXP + 1 (10...01), but it needs to be - // MIN_EXP + 2 (10...010) - e[0] = e[0] & normal; - e[1] = e[1] | !normal; - // normalization by at most 2 places - bool shiftl1 = !(fx[mant_bits+1] ^ fx[mant_bits]); - bool shiftl2 = shiftl1 & !(fx[mant_bits+1] ^ fx[mant_bits-1]); - fx <<= shiftl1; - fx <<= shiftl2; - e -= shiftl1 + shiftl2; - e = inf ? value(e) : e; - fx = inf ? (sign ? 
value(fx) : value(fx)) : fx; - return ac_float(fx, e, false); - } - float to_float() const { - ac_std_float<32,8> t(*this); - float f; - ac::copy_bits(t.d, &f); - return f; - } - double to_double() const { - ac_std_float<64,11> t(*this); - double f; - ac::copy_bits(t.d, &f); - return f; - } -private: - void extract(mu_t &m, e_t &e, bool &sign, bool &normal, bool &zero, bool &inf, bool &nan, bool biased_exp=false, bool no_subnormals=false) const { - e = d.template slc(mant_bits); - bool exception = e == -1; - normal = !!e | no_subnormals; - m = d; - bool m_zero = !m.template slc(0); - zero = (!e) & (no_subnormals | m_zero); - m[mant_bits] = !!e; - if(!biased_exp) { - e -= exp_bias; - e += !normal; - } - sign = d[W-1]; - inf = exception & m_zero; - nan = exception & !m_zero; - } -public: - static ac_std_float zero() { - ac_std_float r; - r.d = 0; - return r; - } - static ac_std_float one() { - ac_std_float r; - r.d = 0; - r.d.set_slc(mant_bits, ac_int(exp_bias)); - return r; - } - template - ac_std_float add_generic(const ac_std_float &op2) const { - ac_private::check_supported(); - // +1 for possible negation, +1 for bit growth due to addition - const int tr_t_iwidth = mu_bits + 1 + 1; - // extra bit for rounding, extra bit for left shift - const int tr_t_width = tr_t_iwidth + 1 + 1; - typedef ac_fixed add_t; - typedef ac_fixed r_un_t; - e_t op1_e, op2_e; - bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero; - bool op1_inf, op1_nan, op2_inf, op2_nan; - mu_t op1_mu, op2_mu; - extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); - m_t op1_m = op1_sign ? m_t(-op1_mu) : m_t(op1_mu); - op1_m &= m_t(No_SubNormals & op1_zero ? 0 : -1); - op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); - m_t op2_m = op2_sign ? m_t(-op2_mu) : m_t(op2_mu); - op2_m &= m_t(No_SubNormals & op2_zero ? 0 : -1); - - unsigned op1_e_b = ac_int(op1_e) + !op1_normal; - unsigned op2_e_b = ac_int(op2_e) + !op2_normal; - int e_dif = op1_e_b - op2_e_b; - bool e1_lt_e2 = e_dif < 0; - e_dif = (op1_zero | op2_zero) ? 0 : e1_lt_e2 ? -e_dif : e_dif; - - add_t op_lshift = e1_lt_e2 ? op1_m : op2_m; - m_t op_no_shift = e1_lt_e2 ? op2_m : op1_m; - add_t shifted_out_bits = op_lshift; - shifted_out_bits &= ~((~add_t(0)) << (unsigned) e_dif); - bool sticky_bit = !!shifted_out_bits; - - op_lshift >>= (unsigned) e_dif; - add_t add_r = op_lshift + op_no_shift; - int exp = ( (e1_lt_e2 & !op2_zero) | op1_zero ? op2_e_b : op1_e_b); - bool all_sign; - int ls = add_r.leading_sign(all_sign); - bool r_zero = !add_r[0] & all_sign; - // +1 to account for bit growth of add_r - int max_shift_left = exp + (- min_exp - exp_bias + 1); - bool shift_exponent_limited = ls >= max_shift_left; - int shift_l = shift_exponent_limited ? max_shift_left : ls; - add_r <<= shift_l; - add_r[0] = add_r[0] | sticky_bit; - ac_fixed r_rnd = add_r; - typedef ac_int t_h; - t_h t = add_r.to_ac_int(); - bool rnd_ovf = QR == AC_RND_CONV && t == t_h(-1); - bool r_sign = r_rnd[mu_bits] ^ rnd_ovf; - bool shift_r = rnd_ovf | (r_sign & !r_rnd.template slc(0)); - r_un_t r_un = r_sign ? (r_un_t) -r_rnd : (r_un_t) r_rnd; - // get rid of implied bit, assign to ac_int - bool r_normal = r_un[mant_bits] | shift_r; - r_zero |= No_SubNormals & !r_normal; - ac_int m_r = r_un.template slc(0); - exp = (shift_exponent_limited ? min_exp + exp_bias : exp - ls + 1) + shift_r; - bool r_inf = exp > max_exp + exp_bias; - if(QR==AC_TRN_ZERO) { - exp = r_inf ? 
max_exp + exp_bias : exp; - m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf - r_inf = false; - } - bool r_nan = op1_nan | op2_nan | ((op1_inf & op2_inf) & (op1_sign ^ op2_sign)); - bool exception = op1_inf | op2_inf | op1_nan | op2_nan | r_inf; - ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; - if(exception | r_zero) { - m_r = 0; - m_r[mant_bits-1] = r_nan; - } - ac_int d_r = m_r; - d_r.set_slc(mant_bits, e_r); - d_r[W-1] = r_sign; - ac_std_float r; - r.set_data(d_r); - return r; - } - template - ac_std_float add(const ac_std_float &op2) const { -#ifndef AC_STD_FLOAT_ADD_OVERRIDE - return add_generic(op2); -#else - return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_ADD_OVERRIDE(*this, op2); -#endif - } - template - ac_std_float sub(const ac_std_float &op2) const { - return add(-op2); - } - template - ac_std_float mult_generic(const ac_std_float &op2) const { - ac_private::check_supported(); - e_t op1_e, op2_e; - bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero; - bool op1_inf, op1_nan, op2_inf, op2_nan; - mu_t op1_mu, op2_mu; - extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); - op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); - bool r_sign = op1_sign ^ op2_sign; - bool r_nan = op1_nan | op2_nan | (op1_inf & op2_zero) | (op1_zero & op2_inf); - bool r_zero = op1_zero | op2_zero; // r_nan takes precedence later on - int exp = ac_int(op1_e) + ac_int(op2_e) + !op1_normal + !op2_normal - exp_bias; - ac_int<2*mu_bits,false> p = op1_mu * op2_mu; - int max_shift_left = exp + (- min_exp - exp_bias + 1); - int shift_l = 0; - bool shift_l_1 = false; - typedef ac_int t_h; - typedef ac_int t_l; - t_h p_h; - t_l p_l = p; - bool r_normal; - bool r_inf; - ac_fixed r_rnd; - ac_int m_r; - if(max_shift_left >= 0) { - r_inf = exp > max_exp + exp_bias; - bool exp_is_max = exp == max_exp + exp_bias; - bool exp_is_max_m1 = exp == max_exp + exp_bias - 1; - unsigned ls = No_SubNormals ? 0 : (unsigned) (op1_normal ? op2_mu : op1_mu).leading_sign(); - bool shift_exponent_limited = ls >= (unsigned) max_shift_left; - shift_l = shift_exponent_limited ? (unsigned) max_shift_left : ls; - p <<= (unsigned) shift_l; - exp -= shift_l; - shift_l_1 = !(shift_exponent_limited | p[2*mu_bits-1]); - p = shift_l_1 ? p << 1 : p; - exp += !shift_l_1; - p_h = p >> (mu_bits-1); - p_l &= (t_l(-1) >> shift_l) >> shift_l_1; - ac_int p_bef_rnd = p_h; - p_bef_rnd <<= 1; - p_bef_rnd[0] = !!p_l; - r_rnd = p_bef_rnd; - m_r = r_rnd.template slc(0); - bool rnd_ovf = QR == AC_RND_CONV && p_h == t_h(-1); - exp += rnd_ovf; - r_inf |= (exp_is_max & (!shift_l_1 | rnd_ovf)) | (exp_is_max_m1 & !shift_l_1 & rnd_ovf); - r_normal = r_rnd[mant_bits] | rnd_ovf; - r_zero |= !r_normal & No_SubNormals; - if(QR==AC_TRN_ZERO) { - exp = r_inf ? max_exp + exp_bias : exp; - m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf - r_inf = false; - } - } else { - shift_l = max_shift_left; - exp -= shift_l; - unsigned shift_r_m1 = ~shift_l; - p_h = p >> (mu_bits-1); - t_h shifted_out_bits = p_h; - shifted_out_bits &= ~((~t_h(1)) << shift_r_m1); - p_h >>= shift_r_m1; - p_h >>= 1; - ac_int p_bef_rnd = p_h; - p_bef_rnd <<= 1; - p_bef_rnd[0] = !!p_l | !!shifted_out_bits; - r_rnd = p_bef_rnd; - m_r = r_rnd.template slc(0); - r_normal = false; - r_inf = false; - r_zero |= No_SubNormals; - } - bool exception = op1_inf | op2_inf | op1_nan | op2_nan | r_inf; - ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 
0 : exp; - if(exception | r_zero) { - m_r = 0; - m_r[mant_bits-1] = r_nan; - } - ac_int d_r = m_r; - d_r.set_slc(mant_bits, e_r); - d_r[W-1] = r_sign; - ac_std_float r; - r.set_data(d_r); - return r; - } - template - ac_std_float mult(const ac_std_float &op2) const { -#ifndef AC_STD_FLOAT_MULT_OVERRIDE - return mult_generic(op2); -#else - return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_MULT_OVERRIDE(*this, op2); -#endif - } - template - ac_std_float div_generic(const ac_std_float &op2) const { - ac_private::check_supported(); - e_t op1_e, op2_e; - bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero; - bool op1_inf, op1_nan, op2_inf, op2_nan; - mu_t op1_mu, op2_mu; - extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); - op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); - bool r_sign = op1_sign ^ op2_sign; - int ls_op1 = No_SubNormals ? 0 : (unsigned) op1_mu.leading_sign(); - op1_mu <<= ls_op1; - int ls_op2 = No_SubNormals ? 0 : (unsigned) op2_mu.leading_sign(); - op2_mu <<= ls_op2; - int exp = ac_int(op1_e) - ac_int(op2_e) + !op1_normal - !op2_normal - ls_op1 + ls_op2 + exp_bias; - ac_int q0 = 0; - bool exact = true; - bool div_by_zero = op2_zero; -#ifdef __SYNTHESIS__ - div_by_zero = false; -#endif - if(!div_by_zero) { - AC_STD_FLOAT_FX_DIV_OVERRIDE(op1_mu, op2_mu, q0, exact); - } - ac_int q = q0; - q <<= 1; - int shift_r = min_exp + exp_bias - exp; - bool sticky_bit = !exact; - if(shift_r >= 0) { - typedef ac_int t_t; - t_t shifted_out_bits = q; - shifted_out_bits &= ~((~t_t(0)) << shift_r); - sticky_bit |= !!shifted_out_bits; - q >>= shift_r; - exp += shift_r; - } else { - bool shift_l = !q[mu_bits+2]; - q <<= shift_l; - exp -= shift_l; - } - q[0] = q[0] | sticky_bit; - ac_fixed r_rnd = q; - bool rnd_ovf = r_rnd[mu_bits]; - ac_int m_r = r_rnd.template slc(0); - bool r_normal = r_rnd[mant_bits] | rnd_ovf; - bool r_nan = op1_nan | op2_nan | (op1_zero & op2_zero) | (op1_inf & op2_inf); - bool r_zero = op1_zero | op2_inf; - r_zero |= !r_normal & No_SubNormals; - exp += rnd_ovf; - bool r_inf0 = op1_inf | op2_zero; // this is not affected by rounding - bool r_inf = (!r_zero & (exp > max_exp + exp_bias)) | r_inf0; - if(QR==AC_TRN_ZERO && !r_inf0) { - exp = r_inf ? max_exp + exp_bias : exp; - m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf - r_inf = false; - } - bool exception = r_nan | r_inf; - ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 
0 : exp; - if(exception | r_zero) { - m_r = 0; - m_r[mant_bits-1] = r_nan; - } - ac_int d_r = m_r; - d_r.set_slc(mant_bits, e_r); - d_r[W-1] = r_sign; - ac_std_float r; - r.set_data(d_r); - return r; - } - template - ac_std_float div(const ac_std_float &op2) const { -#ifndef AC_STD_FLOAT_DIV_OVERRIDE - return div_generic(op2); -#else - return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_DIV_OVERRIDE(*this, op2); -#endif - } - template - ac_std_float fma_generic(const ac_std_float &op2, const ac_std_float &op3) const { - ac_private::check_supported(); - e_t op1_e, op2_e, op3_e; - bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero, op3_normal, op3_sign, op3_zero; - bool op1_inf, op1_nan, op2_inf, op2_nan, op3_inf, op3_nan; - mu_t op1_mu, op2_mu, op3_mu; - extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); - op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); - op3.extract(op3_mu, op3_e, op3_sign, op3_normal, op3_zero, op3_inf, op3_nan, true, No_SubNormals); - if(No_SubNormals) - op3_mu &= mu_t(op3_zero ? 0 : -1); - bool mult_sign = (op1_sign ^ op2_sign) | (op1_zero & op2_inf) | (op1_inf & op1_zero); - bool mult_nan = op1_nan | op2_nan | (op1_zero & op2_inf) | (op1_inf & op2_zero); - bool mult_zero = op1_zero | op2_zero; // mult_nan has precedence later on - int mult_exp_b = ac_int(op1_e) + ac_int(op2_e) + !op1_normal + !op2_normal - exp_bias; - mult_exp_b |= ac_int( op1_inf | op2_inf ? -1 : 0 ); - ac_int<2*mu_bits,false> p = op1_mu * op2_mu; - if(No_SubNormals) - p &= ac_int<2*mu_bits,false>(mult_zero ? 0 : -1); - bool mult_inf = op1_inf | op2_inf; - - bool diff_signs = mult_sign ^ op3_sign; - bool toggle_r_sign = mult_sign; - m_t op3_m = diff_signs ? m_t(-op3_mu) : m_t(op3_mu); - unsigned op3_e_b = ac_int(op3_e) + !op3_normal; - - int e_dif = mult_exp_b - op3_e_b; - bool emult_lt_e3 = e_dif < 0; - e_dif = (mult_zero | op3_zero) ? 0 : emult_lt_e3 ? -e_dif : e_dif; - - typedef ac_int<2*mu_bits+4,true> add_t; - add_t op3_m_s = op3_m; - op3_m_s <<= mu_bits+1; // mult: ii.ffff, op3: i.ff - add_t p_s = p; - p_s <<= 2; - add_t op_lshift = emult_lt_e3 ? p_s : op3_m_s; - add_t op_no_shift = emult_lt_e3 ? op3_m_s : p_s; - - add_t shifted_out_bits = op_lshift; - shifted_out_bits &= ~((~add_t(0)) << (unsigned) e_dif); - bool sticky_bit = !!shifted_out_bits; - - op_lshift >>= (unsigned) e_dif; - add_t add_r = op_lshift + op_no_shift; - int exp = ( (emult_lt_e3 & !op3_zero) | mult_zero ? op3_e_b : mult_exp_b); - - bool all_sign; - int ls = add_r.leading_sign(all_sign); - // no bit growth of add_r - int max_shift_left = exp + (- min_exp - exp_bias + 2); - bool shift_exponent_limited = ls >= max_shift_left; - int shift_l = shift_exponent_limited ? max_shift_left : ls; - add_r <<= shift_l; - add_r[0] = add_r[0] | sticky_bit; - - ac_fixed r_rnd = add_r; - - typedef ac_int t_h; - t_h t = add_r.template slc(mu_bits+2); - bool rnd_ovf = QR == AC_RND_CONV && !add_r[2*mu_bits+3] && t == t_h(-1); - bool r_neg = r_rnd[mu_bits] ^ rnd_ovf; - bool r_sign = op3_inf ? op3_sign : mult_inf ? mult_sign : r_neg ^ toggle_r_sign; - ac_int r_rnd_i = r_rnd.template slc(0); - bool r_zero = !rnd_ovf & !r_rnd_i; - bool shift_r = rnd_ovf | (r_neg & !r_rnd_i.template slc(0)); - typedef ac_int r_un_t; - r_un_t r_un = r_neg ? 
(r_un_t) -r_rnd_i : (r_un_t) r_rnd_i; - // get rid of implied bit, assign to ac_int - bool r_normal = r_un[mant_bits] | shift_r; - r_zero |= No_SubNormals & !r_normal; - ac_int m_r = r_un.template slc(0); - exp = (shift_exponent_limited ? min_exp + exp_bias : exp - ls + 2) + shift_r; - bool r_inf = mult_inf | op3_inf | (exp > max_exp + exp_bias); - if(QR==AC_TRN_ZERO) { - exp = r_inf ? max_exp + exp_bias : exp; - m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf - r_inf = false; - } - bool r_nan = op3_nan | mult_nan | ((op3_inf & (op1_inf | op2_inf)) & (op3_sign ^ mult_sign)); - bool exception = op3_inf | mult_inf | op3_nan | mult_nan | r_inf; - ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; - if(exception | r_zero) { - m_r = 0; - m_r[mant_bits-1] = r_nan; - } - ac_int d_r = m_r; - d_r.set_slc(mant_bits, e_r); - d_r[W-1] = r_sign; - ac_std_float r; - r.set_data(d_r); - return r; - } - template - ac_std_float fma(const ac_std_float &op2, const ac_std_float &op3) const { -#ifndef AC_STD_FLOAT_FMA_OVERRIDE - return fma_generic(op2,op3); -#else - return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_FMA_OVERRIDE(*this,op2,op3); -#endif - } - template - ac_std_float sqrt_generic() const { - ac_private::check_supported(); - const bool rnd = QR != AC_TRN_ZERO; // need msb(rounded bits) - const bool rbits = QR != AC_TRN_ZERO; // need bits after msb(rounded bits) - e_t op1_e; - bool op1_normal, op1_sign, op1_zero; - bool op1_inf, op1_nan; - mu_t op1_mu; - extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); - int ls_op1 = No_SubNormals ? 0 : (unsigned) op1_mu.leading_sign(); - op1_mu <<= ls_op1; - op1_mu[mu_bits-1] = true; // Since it is normalized, zero is captured by op1_zero - - bool exp_odd = (op1_e ^ !op1_normal ^ ls_op1 ^ exp_bias) & 1; - - int exp = ac_int(op1_e) + !op1_normal - ls_op1 - exp_bias; - exp >>= 1; // divide by 2, truncate towards -inf - - ac_int op1_mi = op1_mu; - op1_mi <<= exp_odd; - ac_int sq_rt; - bool sticky_bit = ac::fx_sqrt(op1_mi, sq_rt); - bool r_normal = true; // true for most practical cases on W,E - if(mant_bits > -min_exp) { - int exp_over = min_exp - exp; - if(exp_over > 0) { - if(rbits) { - typedef ac_int t_t; - t_t shifted_out_bits = sq_rt; - shifted_out_bits &= ~((~t_t(0)) << exp_over); - sticky_bit |= !!shifted_out_bits; - } - sq_rt >>= exp_over; - exp = min_exp; - r_normal = false; - } - } - // rounding should not trigger overflow (unless truncate towards +inf which is currently not supported) - ac_fixed sq_rt_rnd = 0; - if(rbits) - sq_rt_rnd[0] = sq_rt_rnd[0] | sticky_bit; - sq_rt_rnd.set_slc(rbits, sq_rt); - ac_fixed sq_rt_fx = sq_rt_rnd; - - ac_int m_r = sq_rt_fx.template slc(0); - bool r_nan = op1_nan | (op1_sign & !op1_zero); - bool r_zero = op1_zero; - r_zero |= !r_normal & No_SubNormals; - bool r_inf = op1_inf; - bool exception = r_nan | r_inf; - exp += exp_bias; - ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 
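// (Review note: sqrt_generic maps the exceptional inputs as IEEE expects -- a negative,
// nonzero operand yields NaN because r_nan includes op1_sign & !op1_zero, sqrt(+/-0)
// stays zero via r_zero = op1_zero, and sqrt(+Inf) stays +Inf via r_inf = op1_inf.
// The packing below then encodes NaN as an all-ones exponent with the mantissa MSB
// set, exactly as in the other *_generic routines.)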
0 : exp; - if(exception | r_zero) { - m_r = 0; - m_r[mant_bits-1] = r_nan; - } - ac_int d_r = m_r; - d_r.set_slc(mant_bits, e_r); - ac_std_float r; - r.set_data(d_r); - return r; - } - template - ac_std_float sqrt() const { -#ifndef AC_STD_FLOAT_SQRT_OVERRIDE - return sqrt_generic(); -#else - return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_SQRT_OVERRIDE(*this); -#endif - } - ac_std_float operator +(const ac_std_float &op2) const { - return add(op2); - } - ac_std_float operator -(const ac_std_float &op2) const { - return sub(op2); - } - ac_std_float operator *(const ac_std_float &op2) const { - return mult(op2); - } - ac_std_float operator /(const ac_std_float &op2) const { - return div(op2); - } - ac_std_float &operator +=(const ac_std_float &op2) { - *this = operator +(op2); - return *this; - } - ac_std_float &operator -=(const ac_std_float &op2) { - *this = operator -(op2); - return *this; - } - ac_std_float &operator *=(const ac_std_float &op2) { - *this = operator *(op2); - } - ac_std_float &operator /=(const ac_std_float &op2) { - *this = operator /(op2); - return *this; - } - bool operator ==(const ac_std_float &op2) const { - return ((d == op2.d) && !isnan()) || (operator !() && op2.operator !()); - } - bool operator !=(const ac_std_float &op2) const { - return !operator ==(op2); - } - bool magnitude_lt(const ac_std_float &op2) const { - return ac_int(d) < ac_int(op2.d); - } - bool neg() const { return d[W-1]; } - bool operator <(const ac_std_float &op2) const { - return - operator !=(op2) && ( (neg() && !op2.neg()) || (!(neg() ^ op2.neg()) && neg() ^ magnitude_lt(op2)) ) - && !isnan() && !op2.isnan(); - } - bool operator >=(const ac_std_float &op2) const { - return - (operator ==(op2) || (!neg() && op2.neg()) || (!(neg() ^ op2.neg()) && !neg() ^ magnitude_lt(op2)) ) - && !isnan() && !op2.isnan(); - } - bool operator >(const ac_std_float &op2) const { - return - operator !=(op2) - && ( (!neg() && op2.neg()) || (!(neg() ^ op2.neg()) && !neg() ^ magnitude_lt(op2)) ) - && !isnan() && !op2.isnan(); - } - bool operator <=(const ac_std_float &op2) const { - return - (operator == (op2) || (neg() && !op2.neg()) || (!neg() ^ op2.neg() && neg() ^ magnitude_lt(op2)) ) - && !isnan() && !op2.isnan(); - } - bool operator !() const { return !ac_int(d); } - ac_std_float operator -() const { - ac_std_float r(*this); - r.d[W-1] = !d[W-1]; - return r; - } - ac_std_float operator +() const { - return ac_std_float(*this); - } - ac_std_float abs() const { - ac_std_float r(*this); - r.d[W-1] = false; - return r; - } - ac_std_float copysign(const ac_std_float &op2) const { - ac_std_float r(*this); - r.d[W-1] = op2.d[W-1]; - return r; - } - bool signbit() const { - return d[W-1]; - } - void set_signbit(bool s) { - d[W-1] = s; - } - ac_std_float ceil() const { - ac_int e = d.template slc(mant_bits); - bool sign = d[W-1]; - if(!d.template slc(0)) - return *this; - if(e < exp_bias) { - return sign ? 
zero() : one(); - } else { - ac_std_float r(*this); - int e_dif = mant_bits + exp_bias - e; - if((e_dif < 0) | (e == ac_int(-1))) - return r; - else { - typedef ac_int mant_t; - mant_t m = d; - mant_t mask = (~mant_t(0)) << e_dif; - bool non_zero_fractional = !!(m & ~mask); - if(!sign) { - m |= ~mask; - mu_t mu = m + mant_t(non_zero_fractional); - e += mu[mant_bits]; - r.d.set_slc(mant_bits, e); - m = mu; - } - m &= mask; // truncate fractional bits - r.d.set_slc(0, m); - return r; - } - } - } - ac_std_float floor() const { - ac_int e = d.template slc(mant_bits); - bool sign = d[W-1]; - if(!d.template slc(0)) - return *this; - if(e < exp_bias) { - return sign ? -one() : zero(); - } else { - ac_std_float r(*this); - int e_dif = mant_bits + exp_bias - e; - if((e_dif < 0) | (e == ac_int(-1))) - return r; - else { - typedef ac_int mant_t; - mant_t m = d; - mant_t mask = (~mant_t(0)) << e_dif; - bool non_zero_fractional = !!(m & ~mask); - if(sign) { - m |= ~mask; - mu_t mu = m + mant_t(non_zero_fractional); - e += mu[mant_bits]; - r.d.set_slc(mant_bits, e); - m = mu; - } - m &= mask; // truncate fractional bits - r.d.set_slc(0, m); - return r; - } - } - } - ac_std_float trunc() const { - ac_int e = d.template slc(mant_bits); - if(e < exp_bias) { - return zero(); - } else { - ac_std_float r(*this); - int e_dif = mant_bits + exp_bias - e; - if((e_dif < 0) | (e == ac_int(-1))) - return r; - else { - typedef ac_int mant_t; - mant_t m = d; - mant_t mask = (~mant_t(0)) << e_dif; - m &= mask; // truncate fractional bits - r.d.set_slc(0, m); - return r; - } - } - } - ac_std_float round() const { - ac_int e = d.template slc(mant_bits); - if(e < exp_bias-1) { - return zero(); - } else { - ac_std_float r(*this); - int e_dif = mant_bits + exp_bias -1 - e; - if((e_dif < 0) | (e == ac_int(-1))) - return r; - else { - typedef ac_int mant_t; - mant_t m = d; - mant_t mask = (~mant_t(0)) << e_dif; - m |= ~mask; - mu_t mu = m + mant_t(1); - e += mu[mant_bits]; - r.d.set_slc(mant_bits, e); - m = mu; - m &= mask << 1; // truncate fractional bits - r.d.set_slc(0, m); - return r; - } - } - } -}; - -template -inline std::ostream& operator << (std::ostream &os, const ac_std_float &x) { - // for now just print the raw ac_int for it - os << x.data().to_string(AC_HEX); - return os; -} - -namespace ac { - // Type punning: using memcpy to avoid strict aliasing - inline void copy_bits(float f, int *x) { - std::memcpy(x, &f, sizeof(int)); - } - inline void copy_bits(double f, long long *x) { - std::memcpy(x, &f, sizeof(long long)); - } - inline void copy_bits(int x, float *f) { - std::memcpy(f, &x, sizeof(float)); - } - inline void copy_bits(long long x, double *f) { - std::memcpy(f, &x, sizeof(double)); - } - - inline void copy_bits(const ac_std_float<32,8> &x, float *f) { - copy_bits(x.data().to_int(), f); - } - inline void copy_bits(const ac_std_float<64,11> &x, double *f) { - copy_bits(x.data().to_int64(), f); - } -} - -template -class ac_ieee_float_base { -public: - static const int width = 1 << ((int)Format + 4); - // exponents are {5,8,11,15,19}, but the first three are specialized elsewhere - static const int e_width = 11 + ((int)Format - binary64)*4; // 11, 15, 19 - static const int lls = width >> 6; - typedef long long (data_t)[lls]; - typedef ac_std_float ac_std_float_t; - typedef ac_std_float helper_t; - typedef ac_float ac_float_t; - data_t d; - ac_ieee_float_base() {} - ac_ieee_float_base(const ac_ieee_float_base &f) { - ac::copy_bits(f.d, &d); - } - explicit ac_ieee_float_base(const helper_t &op) { - 
ac::copy_bits(op.data(), &d); - } - explicit ac_ieee_float_base(double f); -protected: - helper_t to_helper_t() const { - ac_int dat; - ac::copy_bits(d, &dat); - helper_t x; - x.set_data(dat); - return x; - } -public: - void set_data(const data_t &op) { ac::copy_bits(op, &d); } - void set_data(const ac_int &op) { ac::copy_bits(op, &d); } - const data_t &data() const { return d; } - ac_int data_ac_int() const { - ac_int x; - ac::copy_bits(d, &x); - return x; - } - bool signbit() const { return d[lls-1] < 0; } - void set_signbit(bool s) { - ac_int<64,true> t(d[lls-1]); - t[63] = s; - d[lls-1] = t.to_int64(); - } -}; - -template -inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { - // for now print the 128 and 256 as raw ac_int - os << x.data_ac_int().to_string(AC_HEX); - return os; -} - -template<> class ac_ieee_float_base { -public: - static const int width = 16; - static const int e_width = 5; - typedef ac_std_float ac_std_float_t; - typedef short data_t; - typedef ac_std_float helper_t; - typedef ac_float ac_float_t; - data_t d; - ac_ieee_float_base() {} - ac_ieee_float_base(const ac_ieee_float_base &f) : d(f.d) {} - explicit ac_ieee_float_base(const helper_t &op) : d(op.data()) {} - explicit ac_ieee_float_base(float f) : d((short)ac_std_float(f).data().to_int()) {} -protected: - helper_t to_helper_t() const { - helper_t x; - x.set_data(d); - return x; - } -public: - float to_float() const { - ac_std_float_t t; - t.set_data(this->data_ac_int()); - return t.to_float(); - } -#if __cplusplus > 199711L - explicit operator float() const { return to_float(); } -#endif - void set_data(short op) { ac::copy_bits(op, &d); } - void set_data(const ac_int &op) { ac::copy_bits(op, &d); } - const data_t &data() const { return d; } - ac_int data_ac_int() const { - ac_int x; - ac::copy_bits(d, &x); - return x; - } - bool signbit() const { return d < 0; } - void set_signbit(bool s) { - ac_int t(d); - t[width-1] = s; - d = t; - } -}; - -inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { - os << x.to_float(); - return os; -} - -struct float_helper { - float d; - float_helper() {} - float_helper(float f) { d = f; } - float_helper(const float_helper &f) { d = f.d; } - float_helper(const float_helper &f, bool no_subnormals) { - d = no_subnormals && f.fpclassify() == FP_SUBNORMAL ? std::signbit(f.d) ? 
-0.0 : 0.0 : f.d; - } - float_helper(const ac_std_float<32,8> &f) { set_data(f.data().to_int()); } - template - float_helper(const ac_float<25,2,8,Q> &f) : d(f.to_float()) {} - const float &data() const { return d; } - void set_data(int data) { ac::copy_bits(data, &d); } - void set_data(float data) { d = data; } - operator float() const { return d; } - float to_float() const { return d; } - int fpclassify() const { return std::fpclassify(d); } - bool isfinite() const { return std::isfinite(d); } - bool isnormal() const { return std::isnormal(d); } - bool isinf() const { return std::isinf(d); } - bool isnan() const { return std::isnan(d); } - static float nan() { return ac_std_float<32,8>::nan().to_float(); } - static float inf() { return ac_std_float<32,8>::inf().to_float(); } - static float denorm_min() { return ac_std_float<32,8>::denorm_min().to_float(); } - static float min() { return ac_std_float<32,8>::min().to_float(); } - static float max() { return ac_std_float<32,8>::max().to_float(); } - static float epsilon() { return ac_std_float<32,8>::epsilon().to_float(); } - template - float_helper add(const float_helper &op2) const { - ac_private::check_supported2(); - return float_helper( float_helper(*this, No_SubNormals) + float_helper(op2, No_SubNormals), No_SubNormals); - } - template - float_helper sub(const float_helper &op2) const { - ac_private::check_supported2(); - return float_helper( float_helper(*this, No_SubNormals) - float_helper(op2, No_SubNormals), No_SubNormals); - } - template - float_helper mult(const float_helper &op2) const { - ac_private::check_supported2(); - return float_helper( float_helper(*this, No_SubNormals) * float_helper(op2, No_SubNormals), No_SubNormals); - } - template - float_helper div(const float_helper &op2) const { - ac_private::check_supported2(); - return float_helper( float_helper(*this, No_SubNormals) / float_helper(op2, No_SubNormals), No_SubNormals); - } - template - float_helper fma(const float_helper &op2, const float_helper &op3) const { - ac_private::check_supported2(); - return float_helper( ::fmaf(float_helper(*this, No_SubNormals), float_helper(op2, No_SubNormals), float_helper(op3, No_SubNormals)), No_SubNormals); - } - template - float_helper sqrt() const { - ac_private::check_supported2(); - return float_helper( ::sqrtf(float_helper(*this, No_SubNormals)), No_SubNormals); - } - float_helper ceil() const { return float_helper(std::ceil(d)); } - float_helper floor() const { return float_helper(std::floor(d)); } - float_helper trunc() const { return float_helper(::truncf(d)); } - float_helper round() const { return float_helper(::roundf(d)); } -}; - -template<> class ac_ieee_float_base { -public: - static const int width = 32; - static const int e_width = 8; - typedef ac_std_float ac_std_float_t; -#ifdef AC_IEEE_FLOAT_USE_BUILTIN - typedef float data_t; - typedef float_helper helper_t; -#else - typedef int data_t; - typedef ac_std_float helper_t; -#endif - typedef ac_float ac_float_t; - data_t d; - ac_ieee_float_base() {} - ac_ieee_float_base(const ac_ieee_float_base &f) : d(f.d) {} - explicit ac_ieee_float_base(const helper_t &op) : d(op.data()) {} - explicit ac_ieee_float_base(float f) { ac::copy_bits(f, &d); } -protected: - helper_t to_helper_t() const { - helper_t x; - x.set_data(d); - return x; - } -public: -#if __cplusplus > 199711L - explicit operator float() const { - float f; - ac::copy_bits(d, &f); - return f; - } -#endif - float to_float() const { - float f; - ac::copy_bits(d, &f); - return f; - } - void set_data(int op) { 
ac::copy_bits(op, &d); } - void set_data(float op) { ac::copy_bits(op, &d); } - void set_data(const ac_int &op) { ac::copy_bits(op, &d); } - const data_t &data() const { return d; } - ac_int data_ac_int() const { - ac_int x; - ac::copy_bits(d, &x); - return x; - } - bool signbit() const { - int x; ac::copy_bits(d, &x); - return x < 0; - } - void set_signbit(bool s) { - ac_int t; - ac::copy_bits(d, &t); - t[width-1] = s; - ac::copy_bits(t, &d); - } -}; - -inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { - os << x.to_float(); - return os; -} - -struct double_helper { - double d; - double_helper() {} - double_helper(double f) { d = f; } - double_helper(const float_helper &f) { d = f.d; } - double_helper(const double_helper &f, bool no_subnormals) { - d = no_subnormals && f.fpclassify() == FP_SUBNORMAL ? std::signbit(f.d) ? -0.0 : 0.0 : f.d; - } - double_helper(const ac_std_float<64,11> &f) { set_data(f.data().to_int64()); } - template - double_helper(const ac_float<54,2,11,Q> &f) : d(f.to_double()) {} - const double &data() const { return d; } - void set_data(long long data) { - ac::copy_bits(data, &d); - } - void set_data(double data) { d = data; } - operator double() const { return d; } - double to_double() const { return d; } - int fpclassify() const { return std::fpclassify(d); } - bool isfinite() const { return std::isfinite(d); } - bool isnormal() const { return std::isnormal(d); } - bool isinf() const { return std::isinf(d); } - bool isnan() const { return std::isnan(d); } - static double nan() { return ac_std_float<64,11>::nan().to_double(); } - static double inf() { return ac_std_float<64,11>::inf().to_double(); } - static double denorm_min() { return ac_std_float<64,11>::denorm_min().to_double(); } - static double min() { return ac_std_float<64,11>::min().to_double(); } - static double max() { return ac_std_float<64,11>::max().to_double(); } - static double epsilon() { return ac_std_float<64,11>::epsilon().to_double(); } - template - double_helper add(const double_helper &op2) const { - ac_private::check_supported2(); - return double_helper( double_helper(*this, No_SubNormals) + double_helper(op2, No_SubNormals), No_SubNormals); - } - template - double_helper sub(const double_helper &op2) const { - ac_private::check_supported2(); - return double_helper( double_helper(*this, No_SubNormals) - double_helper(op2, No_SubNormals), No_SubNormals); - } - template - double_helper mult(const double_helper &op2) const { - ac_private::check_supported2(); - return double_helper( double_helper(*this, No_SubNormals) * double_helper(op2, No_SubNormals), No_SubNormals); - } - template - double_helper div(const double_helper &op2) const { - ac_private::check_supported2(); - return double_helper( double_helper(*this, No_SubNormals) / double_helper(op2, No_SubNormals), No_SubNormals); - } - template - double_helper fma(const double_helper &op2, const double_helper &op3) const { - ac_private::check_supported2(); - return double_helper( ::fma((double) double_helper(*this, No_SubNormals), (double) double_helper(op2, No_SubNormals), (double) double_helper(op3, No_SubNormals)), No_SubNormals); - } - template - double_helper sqrt() const { - ac_private::check_supported2(); - return double_helper( ::sqrt((double) double_helper(*this, No_SubNormals)), No_SubNormals); - } - double_helper ceil() const { return double_helper(std::ceil(d)); } - double_helper floor() const { return double_helper(std::floor(d)); } - double_helper trunc() const { return double_helper(::trunc(d)); } - 
double_helper round() const { return double_helper(::round(d)); } -}; - -template<> class ac_ieee_float_base { -public: - static const int width = 64; - static const int e_width = 11; - typedef ac_std_float ac_std_float_t; -#ifdef AC_IEEE_FLOAT_USE_BUILTIN - typedef double data_t; - typedef double_helper helper_t; -#else - typedef long long data_t; - typedef ac_std_float helper_t; -#endif - typedef ac_float ac_float_t; - data_t d; - ac_ieee_float_base() {} - ac_ieee_float_base(const ac_ieee_float_base &f) : d(f.d) {} - explicit ac_ieee_float_base(const helper_t &op) : d(op.data()) {} - explicit ac_ieee_float_base(double f) { ac::copy_bits(f, &d); } -protected: - helper_t to_helper_t() const { - helper_t x; - x.set_data(d); - return x; - } -public: -#if __cplusplus > 199711L - explicit operator double() const { - double f; - ac::copy_bits(d, &f); - return f; - } -#endif - double to_double() const { - double f; - ac::copy_bits(d, &f); - return f; - } - void set_data(long long op) { ac::copy_bits(op, &d); } - void set_data(double op) { ac::copy_bits(op, &d); } - void set_data(const ac_int &op) { ac::copy_bits(op, &d); } - const data_t &data() const { return d; } - ac_int data_ac_int() const { - ac_int x; - ac::copy_bits(d, &x); - return x; - } - bool signbit() const { - long long x; ac::copy_bits(d, &x); - return x < 0; - } - void set_signbit(bool s) { - ac_int t; - ac::copy_bits(d, &t); - t[width-1] = s; - ac::copy_bits(t, &d); - } -}; - -inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { - os << x.to_double(); - return os; -} - -namespace ac_private { - template - struct ac_ieee_float_constructor {}; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; - template<> struct ac_ieee_float_constructor { - typedef int type_explicit; - }; -} - -template -class ac_ieee_float : public ac_ieee_float_base { -public: - typedef ac_ieee_float_base Base; - template - struct rt_T { - typedef typename ac_private::rt_closed_T::type mult; - typedef typename ac_private::rt_closed_T::type plus; - typedef typename ac_private::rt_closed_T::type minus; - typedef typename ac_private::rt_closed_T::type minus2; - typedef typename ac_private::rt_closed_T::type logic; - typedef typename ac_private::rt_closed_T::type div; - typedef typename ac_private::rt_closed_T::type div2; - }; - struct rt_unary { - typedef ac_ieee_float neg; - typedef ac_ieee_float mag_sqr; - typedef ac_ieee_float mag; - }; - static const int width = Base::width; - static const int e_width = Base::e_width; - static const int lls = width >> 6; - typedef typename Base::data_t data_t; - typedef typename Base::helper_t helper_t; - typedef typename Base::ac_float_t ac_float_t; - typedef ac_std_float ac_std_float_t; -public: - static ac_ieee_float nan() { return ac_ieee_float(helper_t::nan()); } - static ac_ieee_float inf() { return ac_ieee_float(helper_t::inf()); } - 
static ac_ieee_float denorm_min() { return ac_ieee_float(helper_t::denorm_min()); } - static ac_ieee_float min() { return ac_ieee_float(helper_t::min()); } - static ac_ieee_float max() { return ac_ieee_float(helper_t::max()); } - static ac_ieee_float epsilon() { return ac_ieee_float(helper_t::epsilon()); } - static ac_ieee_float zero() { return ac_ieee_float(ac_std_float_t::zero()); } - static ac_ieee_float one() { return ac_ieee_float(ac_std_float_t::one()); } - ac_ieee_float() {} -private: - ac_ieee_float(const Base &f) : Base(f) {} -public: - ac_ieee_float(const ac_std_float &f) : Base(f) {} - ac_ieee_float(const ac_ieee_float &f) : Base(f) {} - template - explicit ac_ieee_float(const ac_ieee_float &f) : Base(ac_std_float_t(f.to_ac_std_float())) {} - template - explicit ac_ieee_float(const ac_std_float &f) : Base(ac_std_float_t(f)) {} - explicit ac_ieee_float(const ac::bfloat16 &f); - explicit ac_ieee_float(const ac_float_t &f) : Base(ac_std_float_t(f)) {} - template - explicit ac_ieee_float(const ac_fixed &fx) : Base(ac_std_float_t(fx)) {} - template - explicit ac_ieee_float(const ac_float &f) : Base(ac_std_float_t(f)) {} - template - ac_ieee_float to_ac_ieee_float() const { return ac_ieee_float(*this); } - const ac_float_t to_ac_float() const { - return to_ac_std_float().to_ac_float(); - } - const ac_std_float to_ac_std_float() const { - ac_std_float r; - r.set_data(data_ac_int()); - return r; - } - template - ac_fixed convert_to_ac_fixed(bool map_inf=false) const { - return to_ac_std_float().template convert_to_ac_fixed(map_inf); - } - void set_data(const data_t &data) { - Base::set_data(data); - } - const ac_int data_ac_int() const { return Base::data_ac_int(); } - const data_t &data() const { return Base::d; } - template - ac_ieee_float(const T &f, typename ac_private::template ac_ieee_float_constructor::type d = 0) : Base(ac_std_float_t(f)) {} - template - explicit ac_ieee_float(const T &f, typename ac_private::template ac_ieee_float_constructor::type_explicit d = 0) : Base(ac_std_float_t(f)) {} - explicit ac_ieee_float(int x) { - *this = ac_ieee_float(ac_fixed<32,32,true>(x)); - } - explicit ac_ieee_float(long long x) { - *this = ac_ieee_float(ac_fixed<64,64,true>(x)); - } - int fpclassify() const { return Base::to_helper_t().fpclassify(); } - bool isfinite() const { return Base::to_helper_t().isfinite(); } - bool isnormal() const { return Base::to_helper_t().isnormal(); } - bool isinf() const { return Base::to_helper_t().isinf(); } - bool isnan() const { return Base::to_helper_t().isnan(); } - - template - ac_ieee_float add(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t().template add(op2.Base::to_helper_t()))); - } - template - ac_ieee_float sub(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t().template sub(op2.Base::to_helper_t()))); - } - template - ac_ieee_float mult(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t().template mult(op2.Base::to_helper_t()))); - } - template - ac_ieee_float div(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t().template div(op2.Base::to_helper_t()))); - } - template - ac_ieee_float fma(const ac_ieee_float &op2, const ac_ieee_float &op3) const { - return ac_ieee_float(Base(Base::to_helper_t().template fma(op2.Base::to_helper_t(), op3.Base::to_helper_t()))); - } - template - ac_ieee_float sqrt() const { - return ac_ieee_float(Base(Base::to_helper_t().template sqrt())); - } - - ac_ieee_float operator +(const ac_ieee_float 
&op2) const { - return ac_ieee_float(Base(Base::to_helper_t() + op2.Base::to_helper_t())); - } - ac_ieee_float operator -(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t() - op2.Base::to_helper_t())); - } - ac_ieee_float operator *(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t() * op2.Base::to_helper_t())); - } - ac_ieee_float operator /(const ac_ieee_float &op2) const { - return ac_ieee_float(Base(Base::to_helper_t() / op2.Base::to_helper_t())); - } - - ac_ieee_float &operator +=(const ac_ieee_float &op2) { - return *this = operator +(op2); - } - ac_ieee_float &operator -=(const ac_ieee_float &op2) { - return *this = operator -(op2); - } - ac_ieee_float &operator *=(const ac_ieee_float &op2) { - return *this = operator *(op2); - } - ac_ieee_float &operator /=(const ac_ieee_float &op2) { - return *this = operator /(op2); - } - - bool operator ==(const ac_ieee_float &op2) const { - return Base::to_helper_t() == op2.Base::to_helper_t(); - } - bool operator !=(const ac_ieee_float &op2) const { - return Base::to_helper_t() != op2.Base::to_helper_t(); - } - bool operator <(const ac_ieee_float &op2) const { - return Base::to_helper_t() < op2.Base::to_helper_t(); - } - bool operator >=(const ac_ieee_float &op2) const { - return Base::to_helper_t() >= op2.Base::to_helper_t(); - } - bool operator >(const ac_ieee_float &op2) const { - return Base::to_helper_t() > op2.Base::to_helper_t(); - } - bool operator <=(const ac_ieee_float &op2) const { - return Base::to_helper_t() <= op2.Base::to_helper_t(); - } - - ac_ieee_float operator -() const { - ac_ieee_float r(*this); - r.set_signbit(!this->signbit()); - return r; - } - ac_ieee_float operator +() const { - return ac_ieee_float(*this); - } - ac_ieee_float abs() const { - ac_ieee_float r(*this); - r.set_signbit(false); - return r; - } - ac_ieee_float copysign(const ac_ieee_float &op2) const { - ac_ieee_float r(*this); - r.set_signbit(this->signbit()); - return r; - } - bool signbit() const { return Base::signbit(); } - ac_ieee_float add(const ac_ieee_float &op1, const ac_ieee_float &op2) { - return *this = op1 + op2; - } - ac_ieee_float ceil() const { - return ac_ieee_float(Base(Base::to_helper_t().ceil())); - } - ac_ieee_float floor() const { - return ac_ieee_float(Base(Base::to_helper_t().floor())); - } - ac_ieee_float trunc() const { - return ac_ieee_float(Base(Base::to_helper_t().trunc())); - } - ac_ieee_float round() const { - return ac_ieee_float(Base(Base::to_helper_t().round())); - } - ac_ieee_float sub(const ac_ieee_float &op1, const ac_ieee_float &op2) { - return *this = op1 - op2; - } - ac_ieee_float mult(const ac_ieee_float &op1, const ac_ieee_float &op2) { - return *this = op1 * op2; - } - ac_ieee_float div(const ac_ieee_float &op1, const ac_ieee_float &op2) { - return *this = op1 / op2; - } -}; - -template -inline std::ostream& operator << (std::ostream &os, const ac_ieee_float &x) { - os << (const ac_ieee_float_base&) x; - return os; -} - -namespace ac { -class bfloat16 { -public: - template - struct rt_T { - typedef typename ac_private::rt_closed_T::type mult; - typedef typename ac_private::rt_closed_T::type plus; - typedef typename ac_private::rt_closed_T::type minus; - typedef typename ac_private::rt_closed_T::type minus2; - typedef typename ac_private::rt_closed_T::type logic; - typedef typename ac_private::rt_closed_T::type div; - typedef typename ac_private::rt_closed_T::type div2; - }; - struct rt_unary { - typedef bfloat16 neg; - typedef bfloat16 mag_sqr; - typedef 
bfloat16 mag; - }; - static const int width = 16; - static const int e_width = 8; - static bfloat16 nan() { return bfloat16(helper_t::nan()); } - static bfloat16 inf() { return bfloat16(helper_t::inf()); } - static bfloat16 denorm_min() { return bfloat16(helper_t::denorm_min()); } - static bfloat16 min() { return bfloat16(helper_t::min()); } - static bfloat16 max() { return bfloat16(helper_t::max()); } - static bfloat16 epsilon() { return bfloat16(helper_t::epsilon()); } - static bfloat16 zero() { return bfloat16(ac_std_float_t::zero()); } - static bfloat16 one() { return bfloat16(ac_std_float_t::one()); } - typedef ac_std_float helper_t; - typedef short data_t; - typedef ac_float ac_float_t; - typedef ac_std_float ac_std_float_t; - data_t d; - bfloat16() {} - bfloat16(const bfloat16 &f) : d(f.d) {} - bfloat16(const ac_std_float_t &op) : d(op.data()) {} - bfloat16(float f) { int x; ac::copy_bits(f, &x); d = (short) (x >> 16); } - template - explicit bfloat16(const ac_std_float &f) { - *this = f.template convert(); - } - template - explicit bfloat16(const ac_std_float &f) { - *this = f.template convert(); - } - template - explicit bfloat16(const ac_ieee_float &f) { - *this = f.to_ac_std_float().template convert(); - } - template - explicit bfloat16(const ac_fixed &fx) { - ac_std_float_t x; - x.assign_from(fx); - *this = x; - } -private: - const helper_t to_helper_t() const { - helper_t x; - x.set_data(d); - return x; - } -public: - const ac_std_float_t to_ac_std_float() const { - ac_std_float_t x; - x.set_data(d); - return x; - } - const ac_float_t to_ac_float() const { - return ac_std_float_t().to_ac_float(); - } - template - ac_fixed convert_to_ac_fixed(bool map_inf=false) const { - return to_ac_std_float().template convert_to_ac_fixed(map_inf); - } - float to_float() const { - return to_ac_std_float().to_float(); - } - double to_double() const { - return to_ac_std_float().to_double(); - } - // operator is efficient since E is identical and mantissa is longer -#if __cplusplus > 199711L - explicit operator float() const { return to_float(); } -#endif - int fpclassify() const { return to_helper_t().fpclassify(); } - bool isfinite() const { return to_helper_t().isfinite(); } - bool isnormal() const { return to_helper_t().isnormal(); } - bool isinf() const { return to_helper_t().isinf(); } - bool isnan() const { return to_helper_t().isnan(); } - void set_data(short op) { ac::copy_bits(op, &d); } - void set_data(const ac_int &op) { ac::copy_bits(op, &d); } - const data_t &data() const { return d; } - ac_int<16,true> data_ac_int() const { return ac_int<16,true>(d); } - - // mirroed most constructors in tensorflow implementation (except template version) - // tensorflow uses static_cast - // this implementation goes through ac_std_float so there is no dependency on rounding mode -// template -// explicit bfloat16(const T& val) { *this = bfloat16(static_cast(val)); } - explicit bfloat16(unsigned short val) { - ac_std_float_t t; - t.assign_from( ac_int<16,false>(val) ); - *this = t; - } - explicit bfloat16(int val) { - ac_std_float_t t; - t.assign_from( ac_int<32,true>(val) ); - *this = t; - } - explicit bfloat16(unsigned int val) { - ac_std_float_t t; - t.assign_from( ac_int<32,false>(val) ); - *this = t; - } - explicit bfloat16(long val) { - const int long_w = ac_private::long_w; - ac_std_float_t t; - t.assign_from( ac_int(val) ); - *this = t; - } - explicit bfloat16(long long val) { - ac_std_float_t t; - t.assign_from( ac_int<64,false>(val) ); - *this = t; - } - explicit bfloat16(double val) { 
*this = bfloat16(ac_ieee_float(val)); } - - template - bfloat16 add(const bfloat16 &op2) const { - return bfloat16(to_helper_t().add(op2.to_helper_t())); - } - template - bfloat16 sub(const bfloat16 &op2) const { - return bfloat16(to_helper_t().sub(op2.to_helper_t())); - } - template - bfloat16 mult(const bfloat16 &op2) const { - return bfloat16(to_helper_t().mult(op2.to_helper_t())); - } - template - bfloat16 div(const bfloat16 &op2) const { - return bfloat16(to_helper_t().div(op2.to_helper_t())); - } - template - bfloat16 fma(const bfloat16 &op2, const bfloat16 &op3) const { - return bfloat16(to_helper_t().fma(op2.to_helper_t(), op3.to_helper_t())); - } - template - bfloat16 sqrt() const { - return bfloat16(to_helper_t().sqrt()); - } - - bfloat16 operator +(const bfloat16 &op2) const { - return bfloat16(to_helper_t().add(op2.to_helper_t())); - } - bfloat16 operator -(const bfloat16 &op2) const { - return bfloat16(to_helper_t().sub(op2.to_helper_t())); - } - bfloat16 operator *(const bfloat16 &op2) const { - return bfloat16(to_helper_t().mult(op2.to_helper_t())); - } - bfloat16 operator /(const bfloat16 &op2) const { - return bfloat16(to_helper_t().div(op2.to_helper_t())); - } - bfloat16 &operator +=(const bfloat16 &op2) { - return *this = operator +(op2); - } - bfloat16 &operator -=(const bfloat16 &op2) { - return *this = operator -(op2); - } - bfloat16 &operator *=(const bfloat16 &op2) { - return *this = operator *(op2); - } - bfloat16 &operator /=(const bfloat16 &op2) { - return *this = operator /(op2); - } - - bool operator ==(const bfloat16 &op2) const { - return to_helper_t() == op2.to_helper_t(); - } - bool operator !=(const bfloat16 &op2) const { - return to_helper_t() != op2.to_helper_t(); - } - bool operator <(const bfloat16 &op2) const { - return to_helper_t() < op2.to_helper_t(); - } - bool operator >=(const bfloat16 &op2) const { - return to_helper_t() >= op2.to_helper_t(); - } - bool operator >(const bfloat16 &op2) const { - return to_helper_t() > op2.to_helper_t(); - } - bool operator <=(const bfloat16 &op2) const { - return to_helper_t() <= op2.to_helper_t(); - } - - bfloat16 operator -() const { - bfloat16 r(*this); - r.set_signbit(!this->signbit()); - return r; - } - bfloat16 operator +() const { - return bfloat16(*this); - } - bfloat16 abs() const { - bfloat16 r(*this); - r.set_signbit(false); - return r; - } - bfloat16 copysign(const bfloat16 &op2) const { - bfloat16 r(*this); - r.set_signbit(this->signbit()); - return r; - } - bool signbit() const { return d < 0; } - void set_signbit(bool s) { - ac_int t(d); - t[width-1] = s; - d = t; - } - bfloat16 ceil() const { return to_helper_t().ceil(); } - bfloat16 floor() const { return to_helper_t().floor(); } - bfloat16 trunc() const { return to_helper_t().trunc(); } - bfloat16 round() const { return to_helper_t().round(); } -}; - -inline std::ostream& operator << (std::ostream &os, const ac::bfloat16 &x) { - os << x.to_float(); - return os; -} - -} - -template -template -inline ac_std_float::ac_std_float(const ac_ieee_float &f) { - *this = ac_std_float(f.to_ac_std_float()); -} - -template -inline ac_std_float::ac_std_float(const ac::bfloat16 &f) { - *this = ac_std_float(f.to_ac_std_float()); -} - -template -inline ac_ieee_float::ac_ieee_float(const ac::bfloat16 &f) { - *this = ac_ieee_float(f.to_ac_std_float()); -} - -typedef ac_ieee_float ac_ieee_float16; -typedef ac_ieee_float ac_ieee_float32; -typedef ac_ieee_float ac_ieee_float64; -typedef ac_ieee_float ac_ieee_float128; -typedef ac_ieee_float ac_ieee_float256; - - 
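For reviewers comparing the two versions of this header: the public surface deleted above (ac_std_float, ac_ieee_float with its width typedefs, and ac::bfloat16) is re-added below with the same interface. A minimal usage sketch follows; it is ours rather than part of either file version, and it assumes the AC datatypes headers are on the include path:

#include <ac_std_float.h>

int main() {
  // IEEE binary32 wrapper; arithmetic goes through the add/mult members above
  ac_ieee_float32 a(1.5f), b(0.25f);
  float c = (a * b + a).to_float();   // 1.5*0.25 + 1.5 = 1.875, exact in binary32

  // width-generic class: binary32 -> binary16 via the convert() path
  ac_std_float<32,8> s(1.875f);
  ac_std_float<16,5> h(s);            // 1.875 = 1.111b * 2^0 is exact in binary16
  return (c == 1.875f && h.to_float() == 1.875f) ? 0 : 1;
}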
-#ifdef __AC_NAMESPACE -} -#endif - -// Global functions for ac_ieee_float -namespace std { -#ifdef __AC_NAMESPACE -using namespace __AC_NAMESPACE; -#endif -template -inline ac_ieee_float abs(const ac_ieee_float &x) { return x.abs(); } -template -inline ac_ieee_float fabs(const ac_ieee_float &x) { return x.abs(); } - -template -inline ac_ieee_float copysign(const ac_ieee_float &x, const ac_ieee_float &y) { return x.copysign(y); } - -template -inline int fpclassify(const ac_ieee_float &x) { return x.fpclassify(); } -template -inline bool isfinite(const ac_ieee_float &x) { return x.isfinite(); } -template -inline bool isnormal(const ac_ieee_float &x) { return x.isnormal(); } -template -inline bool isinf(const ac_ieee_float &x) { return x.isinf(); } -template -inline bool isnan(const ac_ieee_float &x) { return x.isnan(); } - -// Don't do "long double" versions since they are 80-bits, it is an extended presicion -// TODO: fmod, fmodf, fmodl -// TODO: fmod, remainder, remquo, fma, fmax, fmin, fdim -// remainder(x,y), x - n*y, where n = x/y rounded to the nearest integer (RND_CONV) -// remquo(x,y, int *quo), returns same as remainder, unclear what quo is, also Nan, inf etc -// fmax, fmin: if one number is Nan, the other is returned -// fdim(x,y) returns max(x-y,0), if x or y is NaN, a NaN is returned, if result overflows, HUGE_VAL is returned -// TODO: ceil, floor, trunc, round, lround, nearbyint, rint, lrint, llround, llrint -// if x is +0, -0, NaN or Inf, x is returned -// ceil(x), floor(x), trunc(x) -// round(x) : RND_INF -// nearbyint: depends on rounding mode -// rint, same as nearbyint, but may raise inexaxt exception (FE_INEXACT) -// TODO: frexp, ldexp, modf, nextafter, nexttoward, copysign -// modf(x, *iptr), modff break into integral (*iptr) and fractional (returned) values, -// Don't cause exception: isgreater, isgreaterequal, isless, islessequal, islessgreater, isunordered -// isunordered: x or y is NaN -template -inline bool signbit(const ac_ieee_float &x) { return x.signbit(); } - -// Global functions for bfloat16 -inline bool signbit(const ac::bfloat16 &x) { return x.signbit(); } - -inline int fpclassify(const ac::bfloat16 &x) { return x.fpclassify(); } -inline bool isfinite(const ac::bfloat16 &x) { return x.isfinite(); } -inline bool isnormal(const ac::bfloat16 &x) { return x.isnormal(); } -inline bool isinf(const ac::bfloat16 &x) { return x.isinf(); } -inline bool isnan(const ac::bfloat16 &x) { return x.isnan(); } -} - -#undef __AC_DATA_PRIVATE -#undef AC_STD_FLOAT_FX_DIV_OVERRIDE - -#endif +/************************************************************************** + * * + * Algorithmic C (tm) Datatypes * + * * + * Software Version: 4.0 * + * * + * Release Date : Sat Jun 13 12:35:18 PDT 2020 * + * Release Type : Production Release * + * Release Build : 4.0.0 * + * * + * Copyright 2018-2020, Mentor Graphics Corporation, * + * * + * All Rights Reserved. * + * * + ************************************************************************** + * Licensed under the Apache License, Version 2.0 (the "License"); * + * you may not use this file except in compliance with the License. * + * You may obtain a copy of the License at * + * * + * http://www.apache.org/licenses/LICENSE-2.0 * + * * + * Unless required by applicable law or agreed to in writing, software * + * distributed under the License is distributed on an "AS IS" BASIS, * + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * + * implied. 
* + * See the License for the specific language governing permissions and * + * limitations under the License. * + ************************************************************************** + * * + * The most recent version of this package is available at github. * + * * + *************************************************************************/ + +/* Source: ac_std_float.h + * Description: class for floating point operation handling in C++ + * Author: Andres Takach, Ph.D. + +Overview: this header defines three classes + + ac_ieee_float + Meant to store floats in IEEE standard binary format + Format indicates width: + binary16: (half float) uses short + binary32: (float) uses int + binary64: (double) uses array of long long with one element + binary128: (long double in some platforms) uses array of long long with two elements + binary256: uses array of long long with four elements + + ac::bfloat16 + Implements Google's tensorflow::bfloat16 + Stores data as "short" + + ac_std_float + Superset of ac_ieee_float in that any bit width and exponent width is + allowed + This is used by ac_ieee_float and ac::bfloat16 + + Uses an ac_int that holds the bit pattern for a standard (IEEE) style binary + float: + 1) sign-magnitude representation, sign is MSB + 2) mantissa (significand) with implied bit for normal numbers + 3) E is not restricted to IEEE widths, another class ac_ieee_float does that + + Provides an easy way to convert to/from the closest covering ac_float: + Constructor from ac_float + The two most negative exponents of ac_float are not representable: shift + significand further to the right (for now no attempt to round) + Most negative mantissa of ac_float (in two's complement) when converted + to sign-magnitude requires a right shift (add +1 to exponent) + If exponent is already max, two alternatives: + - "saturate" (store most negative number) + - Store as -Inf (currently this option not available) + Exponent is offset + Mantissa implied bit is removed from normal numbers + + Explicit conversion to_ac_float + Ignores exceptions (Inf, NaN) + Does inverse as above to obtain ac_float +*/ + +#ifndef __AC_STD_FLOAT_H +#define __AC_STD_FLOAT_H +#include <ac_int.h> +#include <ac_float.h> +// Inclusion of cmath undefs all macros such as signbit etc that some parsers may define for C +#include <cmath> + +#ifdef __SYNTHESIS__ +#ifdef AC_IEEE_FLOAT_USE_BUILTIN +#undef AC_IEEE_FLOAT_USE_BUILTIN +#endif +#endif + +#ifdef __AC_NAMESPACE +namespace __AC_NAMESPACE { +#endif + +// For now make data members public since SCVerify needs it +//#ifdef __AC_MAKE_PRIVATE_DATA_PUBLIC +#if 1 +#define __AC_DATA_PRIVATE public: +#else +#define __AC_DATA_PRIVATE private: +#endif + +namespace ac_private { + template + struct check_rounding { enum {Only_symmetrical_roundings_or_truncations_supported}; }; + template<> struct check_rounding {}; + + template + void check_supported() { + // only symmetrical roundings supported + const bool supported = Q==AC_RND_CONV || Q==AC_TRN_ZERO || Q==AC_RND_INF || Q == AC_RND_CONV_ODD; +#if __cplusplus > 199711L + static_assert(supported, "Only symmetrical roundings/truncations supported"); +#else + (void) check_rounding::Only_symmetrical_roundings_or_truncations_supported; +#endif + } + + template + struct check_rounding2 { enum {Only_round_to_even_supported_when_using_BUILTIN}; }; + template<> struct check_rounding2 {}; + + template + void check_supported2() { +#ifdef AC_IEEE_FLOAT_USE_BUILTIN + const bool supported = Q==AC_RND_CONV; +#if __cplusplus > 199711L + static_assert(supported, "Only round to even 
supported"); +#else + (void) check_rounding2::Only_round_to_even_supported_when_using_BUILTIN; +#endif +#endif + } + + template + struct rt_closed_T { + }; + template + struct rt_closed_T { + typedef T type; + }; +} + +namespace ac { + #pragma hls_design ccore + #pragma hls_ccore_type sequential + template + void fx_div(ac_int op1, ac_int op2, ac_int "ient, bool &exact) { + ac_int R = op1; + bool R_neg = false; + ac_int D = op2; + ac_int neg_D = -D; + ac_int Q = 0; + for(int i=0; i < W+2; i++) { + // take MSB of N, shift it in from right to R + R += ( R_neg ? (ac_int) D : neg_D ); + Q = (Q << 1) | ((R >= 0) & 1); + R_neg = R[W]; + R <<= 1; + } + quotient = Q; + exact = !R | R_neg & (R >> 1) == neg_D; + } + + template + void fx_div_sim(ac_int op1, ac_int op2, ac_int "ient, bool &exact) { + // need to compute extra rnd bit, + // +2 because we may need to shift left by 1 (mant divisor > mant dividend) + ac_int<2*W+1,false> op1_mi = op1; + op1_mi <<= W+1; + // +1 bit to compute rnd bit + quotient = (op1_mi / op2); + exact = !(op1_mi % op2); + } + + #pragma hls_design ccore + #pragma hls_ccore_type sequential + template + bool fx_sqrt( ac_int x, ac_int &sqrt) { + // x is ac_fixed, sqrt is ac_fixed + const bool W_odd = W&1; + const int ZW = W + W_odd; // make it even + ac_int z = x; + z <<= W_odd; + // masks used only to hint synthesis on precision + ac_int mask_d = 0; + ac_int d = 0; + ac_int r = 0; + unsigned int z_shift = ZW-2; + for(int i = WR-1; i >= 0; i--) { + r <<= 1; + mask_d = (mask_d << 2) | 0x3; + d = (mask_d & (d << 2)) | ((z >> z_shift) & 0x3 ); + ac_int t = d - (( ((ac_int)r) << 1) | 0x1); + if( !t[WR+1] ) { // since t is unsigned, look at MSB + r |= 0x1; + d = mask_d & t; + } + z <<= 2; + } + + bool rem = (d != 0) || ((z >> 2*W) != 0); + sqrt = r; + return rem; + } +} + +#ifndef AC_STD_FLOAT_FX_DIV_OVERRIDE +#ifdef __SYNTHESIS__ +#define AC_STD_FLOAT_FX_DIV_OVERRIDE ac::fx_div +#else +#define AC_STD_FLOAT_FX_DIV_OVERRIDE ac::fx_div_sim +#endif +#endif + +template class ac_std_float; + +#ifdef __AC_NAMESPACE +} +#endif + +#ifdef AC_STD_FLOAT_OVERRIDE_NAMESPACE +#define AC_STD_FLOAT_OVERRIDE_NS ::AC_STD_FLOAT_OVERRIDE_NAMESPACE:: +namespace AC_STD_FLOAT_OVERRIDE_NAMESPACE { +#ifdef __AC_NAMESPACE + using __AC_NAMESPACE::ac_q_mode; + using __AC_NAMESPACE::ac_std_float; +#endif +#else +#define AC_STD_FLOAT_OVERRIDE_NS +#endif + +#ifdef AC_STD_FLOAT_ADD_OVERRIDE +template +ac_std_float AC_STD_FLOAT_ADD_OVERRIDE(const ac_std_float &op, const ac_std_float &op2); +#endif + +#ifdef AC_STD_FLOAT_MULT_OVERRIDE +template +ac_std_float AC_STD_FLOAT_MULT_OVERRIDE(const ac_std_float &op, const ac_std_float &op2); +#endif + +#ifdef AC_STD_FLOAT_DIV_OVERRIDE +template +ac_std_float AC_STD_FLOAT_DIV_OVERRIDE(const ac_std_float &op, const ac_std_float &op2); +#endif + +#ifdef AC_STD_FLOAT_FMA_OVERRIDE +template +ac_std_float AC_STD_FLOAT_FMA_OVERRIDE(const ac_std_float &op, const ac_std_float &op2, const ac_std_float &op3); +#endif + +#ifdef AC_STD_FLOAT_SQRT_OVERRIDE +template +ac_std_float AC_STD_FLOAT_SQRT_OVERRIDE(const ac_std_float &op); +#endif + +#ifdef AC_STD_FLOAT_OVERRIDE_NAMESPACE +} +#endif + +#ifdef __AC_NAMESPACE +namespace __AC_NAMESPACE { +#endif + +namespace ac { + inline void copy_bits(float a, float *b) { *b = a; } + inline void copy_bits(double a, double *b) { *b = a; } + + inline void copy_bits(short a, short *b) { *b = a; } + inline void copy_bits(const ac_int<16,true> &a, short *b) { *b = (short) a.to_int(); } + inline void copy_bits(short a, ac_int<16,true> *b) { *b = 
ac_int<16,true>(a); } + inline void copy_bits(int a, int *b) { *b = a; } + inline void copy_bits(const ac_int<32,true> &a, int *b) { *b = a.to_int(); } + inline void copy_bits(int a, ac_int<32,true> *b) { *b = ac_int<32,true>(a); } + inline void copy_bits(long long a, long long *b) { *b = a; } + inline void copy_bits(const ac_int<64,true> &a, long long *b) { *b = a.to_int64(); } + inline void copy_bits(long long a, ac_int<64,true> *b) { *b = ac_int<64,true>(a); } + inline void copy_bits(const long long a[2], long long (*b)[2]) { + (*b)[0] = a[0]; + (*b)[1] = a[1]; + } + inline void copy_bits(const ac_int<128,true> &a, long long (*b)[2]) { + (*b)[0] = a.to_int64(); + (*b)[1] = a.slc<64>(64).to_int64(); + } + inline void copy_bits(const long long a[2], ac_int<128,true> *b) { + *b = 0; + b->set_slc(0,ac_int<64,true>(a[0])); + b->set_slc(64,ac_int<64,true>(a[1])); + } + inline void copy_bits(const long long a[4], long long (*b)[4]) { + (*b)[0] = a[0]; + (*b)[1] = a[1]; + (*b)[2] = a[2]; + (*b)[3] = a[3]; + } + inline void copy_bits(const ac_int<256,true> &a, long long (*b)[4]) { + (*b)[0] = a.to_int64(); + (*b)[1] = a.slc<64>(64).to_int64(); + (*b)[2] = a.slc<64>(128).to_int64(); + (*b)[3] = a.slc<64>(192).to_int64(); + } + inline void copy_bits(const long long a[4], ac_int<256,true> *b) { + *b = 0; + b->set_slc(0,ac_int<64,true>(a[0])); + b->set_slc(64,ac_int<64,true>(a[1])); + b->set_slc(128,ac_int<64,true>(a[2])); + b->set_slc(192,ac_int<64,true>(a[3])); + } + inline void copy_bits(float f, int *x); + inline void copy_bits(double f, long long *x); + inline void copy_bits(int x, float *f); + inline void copy_bits(long long x, double *f); + + inline void copy_bits(float f, ac_int<32,true> *x) { + int x_i; + copy_bits(f, &x_i); + *x = x_i; + } + inline void copy_bits(double f, ac_int<64,true> *x) { + long long x_i; + copy_bits(f, &x_i); + *x = x_i; + } + inline void copy_bits(const ac_int<32,true> &x, float *f) { copy_bits(x.to_int(), f); } + inline void copy_bits(const ac_int<64,true> &x, double *f) { copy_bits(x.to_int64(), f); } +} + +enum ac_ieee_float_format { binary16, binary32, binary64, binary128, binary256}; + +// Forward declarations for ac_ieee_float and bfloat16 +template +class ac_ieee_float; +namespace ac { + class bfloat16; +} + +template +class ac_std_float { +__AC_DATA_PRIVATE + ac_int d; +public: + static const int width = W; + static const int e_width = E; + static const int mant_bits = W - E - 1; + static const int exp_bias = (1 << (E-1)) - 1; + static const int min_exp = -exp_bias + 1; + static const int max_exp = exp_bias; + static const int mu_bits = mant_bits + 1; +private: + typedef ac_int mu_t; + typedef ac_int mu1_t; + typedef ac_int mu2_t; + typedef ac_int m_t; // mantissa in two's complement representation +public: + typedef ac_int e_t; + typedef ac_float ac_float_t; + static ac_std_float nan() { + ac_std_float r; + r.d = 0; + r.d.set_slc(mant_bits-1, ac_int(-1)); + return r; + } + static ac_std_float inf() { + ac_std_float r; + r.d = 0; + r.d.set_slc(mant_bits, ac_int(-1)); + return r; + } + static ac_std_float denorm_min() { // smallest positive non zero value (subnorm if supported) + ac_std_float r; + r.d = 1; + return r; + } + static ac_std_float min() { // smallest NORMAL positive non zero value + ac_std_float r; + r.d = 0; + r.d[width-1-e_width] = true; + return r; + } + static ac_std_float max() { // largest pos finite value + ac_std_float r; + r.d = -1; + r.d[width-1] = false; + r.d[width-1-e_width] = false; + return r; + } + static ac_std_float epsilon() { 
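// (Worked numbers for the constants above, taking the binary32 layout W=32, E=8:
// mant_bits = 23, exp_bias = 127, min_exp = -126, max_exp = 127. epsilon() below
// stores the biased exponent exp_bias - mant_bits = 104, i.e. the value 2^-23,
// matching std::numeric_limits<float>::epsilon().)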
+ ac_int exp = -mant_bits + exp_bias; + ac_std_float r; + r.d = 0; + r.d.set_slc(mant_bits, exp); + return r; + } + ac_std_float() {} + ac_std_float(const ac_std_float &f) : d(f.d) {} + template + ac_std_float convert() const { + ac_private::check_supported(); + ac_std_float r; + if(W <= WR) { + r.d = 0; + r.d.set_slc(WR-W, d); + } else { + typedef ac_std_float r_t; + const int r_mant_bits = r_t::mant_bits; + const int r_mu_bits = r_t::mu_bits; + e_t f_e = d.template slc(mant_bits); + bool f_normal = !!f_e; + mu_t mu = d; + mu[r_mant_bits] = f_normal; + ac_fixed r_rnd = mu; + bool rnd_ovf = r_rnd[r_mu_bits]; + ac_int m_r = r_rnd.template slc(0); + e_t e_r = f_e + rnd_ovf; + r.d = m_r; + r.d.set_slc(r_mant_bits, e_r); + r.d[WR-1] = d[W-1]; + } + return r; + } + + template + ac_fixed convert_to_ac_fixed(bool map_inf=false) const { + static const bool rnd = QFX!=AC_TRN && QFX!=AC_TRN_ZERO; + static const bool need_rnd_bit = QFX != AC_TRN; + static const bool need_rem_bits = need_rnd_bit && QFX != AC_RND; + static const bool need_ovf = OFX != AC_WRAP; + static const int t_width = AC_MAX(mu_bits+1, WFX+!SFX) + need_rnd_bit + need_ovf; + + bool f_sign, f_normal, f_zero, f_inf, f_nan; + mu_t f_mu; + e_t f_e; + extract(f_mu, f_e, f_sign, f_normal, f_zero, f_inf, f_nan); + if(map_inf) { + ac_fixed rv; + if(f_sign) + rv.template set_val(); + else + rv.template set_val(); + return rv; + } + AC_ASSERT(!f_inf && !f_nan, "Expects finite float (not Nan or Inf)"); + m_t f_m = f_sign ? m_t(-f_mu) : m_t(f_mu); + typedef ac_int t_t; + typedef ac_int t2_t; + t_t t = f_m; + t <<= need_rnd_bit; + static const int lsb_src = -mant_bits; + static const int lsb_trg = IFX-WFX; + int rshift = lsb_trg - lsb_src - (int)f_e; + + bool sticky_bit_rnd = false; + bool rshift_neg = rshift < 0; + if(need_rem_bits) { + t_t shifted_out_bits = t; + typedef ac_int< ac::template nbits< AC_MAX(lsb_trg - lsb_src - min_exp,1) >::val, false> shift_ut; + shifted_out_bits &= ~(t_t(0).bit_complement() << (shift_ut) rshift); + sticky_bit_rnd = !!shifted_out_bits & !rshift_neg; + } + bool ovf = false; + if(need_ovf) { + t_t shifted_out_bits = t < 0 ? 
t_t(~t) : t; + // shift right by -rshift + 1 + // +1 is OK since added extra MSB + typedef ac_int< ac::template nbits< AC_MAX(-(lsb_trg - lsb_src - max_exp + 1),1) >::val, false> shift_ut; + shifted_out_bits &= ~((t_t(0).bit_complement() >> 2) >> (shift_ut) ~rshift); + ovf = !!shifted_out_bits & rshift_neg; + } + + t >>= rshift; + + t[t_width-1] = t[t_width-1] ^ (ovf & (t[t_width-1] ^ f_sign)); + t[t_width-2] = t[t_width-2] ^ (ovf & (t[t_width-2] ^ !f_sign)); + t2_t t2 = t; + if(need_rem_bits) { + t2 <<= 1; + t2[0] = t2[0] | sticky_bit_rnd; + } + + ac_fixed ri = t2; + ac_fixed r = 0; + r.set_slc(0,ri.template slc(0)); + return r; + } + + template + explicit ac_std_float(const ac_std_float &f) { + *this = f.template convert(); + } + template + ac_std_float convert() const { + ac_private::check_supported(); + typedef ac_std_float r_t; + typedef typename r_t::e_t r_e_t; + int const r_mu_bits = r_t::mu_bits; + int const r_mant_bits = r_t::mant_bits; + int const r_min_exp = r_t::min_exp; + int const r_max_exp = r_t::max_exp; + int const r_exp_bias = r_t::exp_bias; + bool f_sign, f_normal, f_zero, f_inf, f_nan; + mu_t f_mu; + e_t f_e; + r_t r; + extract(f_mu, f_e, f_sign, f_normal, f_zero, f_inf, f_nan); + int exp = f_e; + ac_fixed r_rnd; + if(ER >= E) { + if(ER > E && !f_normal) { + int ls = f_mu.leading_sign(); + int max_shift_left = f_e - r_min_exp + 1; + bool shift_exponent_limited = ls >= max_shift_left; + int shift_l = shift_exponent_limited ? max_shift_left : ls; + f_mu <<= shift_l; + exp -= shift_l; + } + r_rnd = f_mu; + } else { + int shift_r = r_min_exp - f_e; + typedef ac_fixed t_t; + t_t r_t = f_mu; + bool sticky_bit = !!(f_mu & ~((~mu_t(0)) << mant_bits-r_mant_bits-1)); + if(shift_r > 0) { + t_t shifted_out_bits = r_t; + shifted_out_bits &= ~((~t_t(0)) << shift_r); + sticky_bit |= !!shifted_out_bits; + r_t >>= shift_r; + exp += shift_r; + } + ac_fixed r_t2 = r_t; + r_t2[0] = sticky_bit; + r_rnd = r_t2; + } + bool rnd_ovf = r_rnd[r_mu_bits]; + ac_int r_m = r_rnd.template slc(0); + bool r_normal = r_rnd[r_mant_bits] | rnd_ovf; + exp += rnd_ovf; + bool exception = f_inf | f_nan | (exp > r_max_exp); + r_e_t r_e = exception ? -1 : (f_zero | !r_normal) ? 0 : exp + r_exp_bias; + if(exception) { + r_m = 0; + r_m[r_mant_bits-1] = f_nan; + } + r.d = r_m; + r.d.set_slc(r_mant_bits, r_e); + r.d[WR-1] = d[W-1]; + return r; + } + template + explicit ac_std_float(const ac_std_float &f) { + *this = f.template convert(); + } + template + explicit ac_std_float(const ac_ieee_float &f); + + explicit ac_std_float(const ac::bfloat16 &f); + + template + explicit ac_std_float(const ac_float &f) { + bool sign = f.mantissa() < 0; + m_t m_s = f.m.template slc(0); + mu1_t m_u = sign ? 
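// (Review note: this constructor implements the ac_float -> ac_std_float mapping
// described in the header comment. The two's-complement mantissa is converted to
// sign-magnitude below; the most negative mantissa, which has no positive
// counterpart, is absorbed by adding 1 to the exponent, unless the exponent is
// already at its maximum, in which case the magnitude saturates.)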
(mu1_t) -m_s : (mu1_t) m_s; + bool most_neg_m = m_u[mu_bits]; + bool is_max_exp = f.exp() == (1 << (E-1)) - 1; + ac_int e = f.exp() + exp_bias + (most_neg_m & !is_max_exp); + mu_t m = m_u | ac_int<1,true>(most_neg_m & is_max_exp); + m[mant_bits] = m[mant_bits] | most_neg_m; + bool exp_dont_map = !e | e==-1; + m >>= !e; + m >>= 2*(e==-1); + // exp_dont_map guarantees subnormal => e = 0 + e &= ac_int<1,true>(!exp_dont_map & !!m); + d = m.template slc(0); + d.set_slc(mant_bits, e); + d[W-1] = sign; + } + template + void assign_from(const ac_fixed &fx) { + ac_private::check_supported(); + bool sign = fx < 0.0; + ac_fixed x = 0; + x.set_slc(0,fx.template slc(0)); + bool all_sign; + int ls = x.leading_sign(all_sign); + int max_shift_left = IFX-1 - min_exp + 1; + bool shift_exponent_limited = ls >= max_shift_left; + int shift_l = shift_exponent_limited ? max_shift_left : ls; + ac_fixed x_u = sign ? (ac_fixed) -x : (ac_fixed) x; + x_u <<= shift_l; + int exp = IFX-1; + exp -= shift_l; + ac_fixed m_rnd = x_u; + mu1_t m_u = 0; m_u.set_slc(0, m_rnd.template slc(0)); + bool shiftr1 = m_u[mu_bits]; // msb + bool r_normal = m_u[mu_bits] | m_u[mu_bits-1]; + m_u >>= shiftr1; + exp += shiftr1; + bool fx_zero = all_sign & !sign; + bool r_inf = (exp > max_exp) & !fx_zero; + if(Q==AC_TRN_ZERO) { + exp = r_inf ? max_exp + exp_bias : exp; + m_u |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf + r_inf = false; + } + e_t e = r_inf ? -1 : (!r_normal) ? 0 : exp + exp_bias; + m_u &= ac_int<1,true>(!r_inf); + e &= ac_int<1,true>(r_normal); + d = m_u.template slc(0); + d.set_slc(mant_bits, e); + d[W-1] = sign; + } + template + void assign_from(const ac_int &x) { + this->template assign_from(ac_fixed(x)); + } + template + explicit ac_std_float(const ac_fixed &fx) { + assign_from(fx); + } + explicit ac_std_float(float f) { + const int w_bits = sizeof(f)*8; + const int m_bits = std::numeric_limits::digits; + const int e_bits = w_bits - m_bits; + ac_int t_i; + ac::copy_bits(f, &t_i); + ac_std_float t; + t.set_data(t_i); + *this = ac_std_float(t); + } + explicit ac_std_float(double f) { + const int w_bits = sizeof(f)*8; + const int m_bits = std::numeric_limits::digits; + const int e_bits = w_bits - m_bits; + ac_int t_i; + ac::copy_bits(f, &t_i); + ac_std_float t; + t.set_data(t_i); + *this = ac_std_float(t); + } + explicit ac_std_float(int x) { + *this = ac_std_float(ac_fixed<32,32,true>(x)); + } + explicit ac_std_float(long long x) { + *this = ac_std_float(ac_fixed<64,64,true>(x)); + } + const ac_int &data() const { return d; } + void set_data(const ac_int &data, bool assert_on_nan=false, bool assert_on_inf=false) { + d = data; + if(assert_on_nan) + AC_ASSERT(!isnan(), "Float is NaN"); + if(assert_on_inf) + AC_ASSERT(!isinf(), "Float is Inf"); + } + int fpclassify() const { + ac_int e = d.template slc(mant_bits); + if(e) { + if(e == -1) + return !(ac_int)d ? FP_INFINITE : FP_NAN; + else + return FP_NORMAL; + } + else + return !(ac_int)d ? 
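// (Classification implemented here, with e the stored exponent field and m the
// stored mantissa field:
//   e == all ones, m == 0  -> FP_INFINITE
//   e == all ones, m != 0  -> FP_NAN
//   e == 0,        m == 0  -> FP_ZERO
//   e == 0,        m != 0  -> FP_SUBNORMAL
//   otherwise              -> FP_NORMAL)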
FP_ZERO : FP_SUBNORMAL; + } + bool isfinite() const { + ac_int e = d.template slc(mant_bits); + return e != -1; + } + bool isnormal() const { + ac_int e = d.template slc(mant_bits); + return (e || !(ac_int)d)&& e != -1; + } + bool isnan() const { + if(isfinite()) + return false; + ac_int m = d; + return !!m; + } + bool isinf() const { + if(isfinite()) + return false; + ac_int m = d; + return !m; + } + const ac_float to_ac_float() const { + ac_int e = d.template slc(mant_bits); + bool normal = !!e; + bool sign = d[W-1]; + bool inf = e == -1; + ac_int m = d; + ac_int m1 = m; + m1[mant_bits] = normal; + ac_int m_s = sign ? -m1 : (ac_int) m1; + ac_fixed fx = 0; + fx.set_slc(0, m_s); + e -= exp_bias; + // if number is subnormal, e will be MIN_EXP + 1 (10...01), but it needs to be + // MIN_EXP + 2 (10...010) + e[0] = e[0] & normal; + e[1] = e[1] | !normal; + // normalization by at most 2 places + bool shiftl1 = !(fx[mant_bits+1] ^ fx[mant_bits]); + bool shiftl2 = shiftl1 & !(fx[mant_bits+1] ^ fx[mant_bits-1]); + fx <<= shiftl1; + fx <<= shiftl2; + e -= shiftl1 + shiftl2; + e = inf ? value(e) : e; + fx = inf ? (sign ? value(fx) : value(fx)) : fx; + return ac_float(fx, e, false); + } + float to_float() const { + ac_std_float<32,8> t(*this); + float f; + ac::copy_bits(t.d, &f); + return f; + } + double to_double() const { + ac_std_float<64,11> t(*this); + double f; + ac::copy_bits(t.d, &f); + return f; + } +private: + void extract(mu_t &m, e_t &e, bool &sign, bool &normal, bool &zero, bool &inf, bool &nan, bool biased_exp=false, bool no_subnormals=false) const { + e = d.template slc(mant_bits); + bool exception = e == -1; + normal = !!e | no_subnormals; + m = d; + bool m_zero = !m.template slc(0); + zero = (!e) & (no_subnormals | m_zero); + m[mant_bits] = !!e; + if(!biased_exp) { + e -= exp_bias; + e += !normal; + } + sign = d[W-1]; + inf = exception & m_zero; + nan = exception & !m_zero; + } +public: + static ac_std_float zero() { + ac_std_float r; + r.d = 0; + return r; + } + static ac_std_float one() { + ac_std_float r; + r.d = 0; + r.d.set_slc(mant_bits, ac_int(exp_bias)); + return r; + } + template + ac_std_float add_generic(const ac_std_float &op2) const { + ac_private::check_supported(); + // +1 for possible negation, +1 for bit growth due to addition + const int tr_t_iwidth = mu_bits + 1 + 1; + // extra bit for rounding, extra bit for left shift + const int tr_t_width = tr_t_iwidth + 1 + 1; + typedef ac_fixed add_t; + typedef ac_fixed r_un_t; + e_t op1_e, op2_e; + bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero; + bool op1_inf, op1_nan, op2_inf, op2_nan; + mu_t op1_mu, op2_mu; + extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); + m_t op1_m = op1_sign ? m_t(-op1_mu) : m_t(op1_mu); + op1_m &= m_t(No_SubNormals & op1_zero ? 0 : -1); + op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); + m_t op2_m = op2_sign ? m_t(-op2_mu) : m_t(op2_mu); + op2_m &= m_t(No_SubNormals & op2_zero ? 0 : -1); + + unsigned op1_e_b = ac_int(op1_e) + !op1_normal; + unsigned op2_e_b = ac_int(op2_e) + !op2_normal; + int e_dif = op1_e_b - op2_e_b; + bool e1_lt_e2 = e_dif < 0; + e_dif = (op1_zero | op2_zero) ? 0 : e1_lt_e2 ? -e_dif : e_dif; + + add_t op_lshift = e1_lt_e2 ? op1_m : op2_m; + m_t op_no_shift = e1_lt_e2 ? 
op2_m : op1_m; + add_t shifted_out_bits = op_lshift; + shifted_out_bits &= ~((~add_t(0)) << (unsigned) e_dif); + bool sticky_bit = !!shifted_out_bits; + + op_lshift >>= (unsigned) e_dif; + add_t add_r = op_lshift + op_no_shift; + int exp = ( (e1_lt_e2 & !op2_zero) | op1_zero ? op2_e_b : op1_e_b); + bool all_sign; + int ls = add_r.leading_sign(all_sign); + bool r_zero = !add_r[0] & all_sign; + // +1 to account for bit growth of add_r + int max_shift_left = exp + (- min_exp - exp_bias + 1); + bool shift_exponent_limited = ls >= max_shift_left; + int shift_l = shift_exponent_limited ? max_shift_left : ls; + add_r <<= shift_l; + add_r[0] = add_r[0] | sticky_bit; + ac_fixed r_rnd = add_r; + typedef ac_int t_h; + t_h t = add_r.to_ac_int(); + bool rnd_ovf = QR == AC_RND_CONV && t == t_h(-1); + bool r_sign = r_rnd[mu_bits] ^ rnd_ovf; + bool shift_r = rnd_ovf | (r_sign & !r_rnd.template slc(0)); + r_un_t r_un = r_sign ? (r_un_t) -r_rnd : (r_un_t) r_rnd; + // get rid of implied bit, assign to ac_int + bool r_normal = r_un[mant_bits] | shift_r; + r_zero |= No_SubNormals & !r_normal; + ac_int m_r = r_un.template slc(0); + exp = (shift_exponent_limited ? min_exp + exp_bias : exp - ls + 1) + shift_r; + bool r_inf = exp > max_exp + exp_bias; + if(QR==AC_TRN_ZERO) { + exp = r_inf ? max_exp + exp_bias : exp; + m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf + r_inf = false; + } + bool r_nan = op1_nan | op2_nan | ((op1_inf & op2_inf) & (op1_sign ^ op2_sign)); + bool exception = op1_inf | op2_inf | op1_nan | op2_nan | r_inf; + ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; + if(exception | r_zero) { + m_r = 0; + m_r[mant_bits-1] = r_nan; + } + ac_int d_r = m_r; + d_r.set_slc(mant_bits, e_r); + d_r[W-1] = r_sign; + ac_std_float r; + r.set_data(d_r); + return r; + } + template + ac_std_float add(const ac_std_float &op2) const { +#ifndef AC_STD_FLOAT_ADD_OVERRIDE + return add_generic(op2); +#else + return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_ADD_OVERRIDE(*this, op2); +#endif + } + template + ac_std_float sub(const ac_std_float &op2) const { + return add(-op2); + } + template + ac_std_float mult_generic(const ac_std_float &op2) const { + ac_private::check_supported(); + e_t op1_e, op2_e; + bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero; + bool op1_inf, op1_nan, op2_inf, op2_nan; + mu_t op1_mu, op2_mu; + extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); + op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); + bool r_sign = op1_sign ^ op2_sign; + bool r_nan = op1_nan | op2_nan | (op1_inf & op2_zero) | (op1_zero & op2_inf); + bool r_zero = op1_zero | op2_zero; // r_nan takes precedence later on + int exp = ac_int(op1_e) + ac_int(op2_e) + !op1_normal + !op2_normal - exp_bias; + ac_int<2*mu_bits,false> p = op1_mu * op2_mu; + int max_shift_left = exp + (- min_exp - exp_bias + 1); + int shift_l = 0; + bool shift_l_1 = false; + typedef ac_int t_h; + typedef ac_int t_l; + t_h p_h; + t_l p_l = p; + bool r_normal; + bool r_inf; + ac_fixed r_rnd; + ac_int m_r; + if(max_shift_left >= 0) { + r_inf = exp > max_exp + exp_bias; + bool exp_is_max = exp == max_exp + exp_bias; + bool exp_is_max_m1 = exp == max_exp + exp_bias - 1; + unsigned ls = No_SubNormals ? 0 : (unsigned) (op1_normal ? op2_mu : op1_mu).leading_sign(); + bool shift_exponent_limited = ls >= (unsigned) max_shift_left; + shift_l = shift_exponent_limited ? 
(unsigned) max_shift_left : ls; + p <<= (unsigned) shift_l; + exp -= shift_l; + shift_l_1 = !(shift_exponent_limited | p[2*mu_bits-1]); + p = shift_l_1 ? p << 1 : p; + exp += !shift_l_1; + p_h = p >> (mu_bits-1); + p_l &= (t_l(-1) >> shift_l) >> shift_l_1; + ac_int p_bef_rnd = p_h; + p_bef_rnd <<= 1; + p_bef_rnd[0] = !!p_l; + r_rnd = p_bef_rnd; + m_r = r_rnd.template slc(0); + bool rnd_ovf = QR == AC_RND_CONV && p_h == t_h(-1); + exp += rnd_ovf; + r_inf |= (exp_is_max & (!shift_l_1 | rnd_ovf)) | (exp_is_max_m1 & !shift_l_1 & rnd_ovf); + r_normal = r_rnd[mant_bits] | rnd_ovf; + r_zero |= !r_normal & No_SubNormals; + if(QR==AC_TRN_ZERO) { + exp = r_inf ? max_exp + exp_bias : exp; + m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf + r_inf = false; + } + } else { + shift_l = max_shift_left; + exp -= shift_l; + unsigned shift_r_m1 = ~shift_l; + p_h = p >> (mu_bits-1); + t_h shifted_out_bits = p_h; + shifted_out_bits &= ~((~t_h(1)) << shift_r_m1); + p_h >>= shift_r_m1; + p_h >>= 1; + ac_int p_bef_rnd = p_h; + p_bef_rnd <<= 1; + p_bef_rnd[0] = !!p_l | !!shifted_out_bits; + r_rnd = p_bef_rnd; + m_r = r_rnd.template slc(0); + r_normal = false; + r_inf = false; + r_zero |= No_SubNormals; + } + bool exception = op1_inf | op2_inf | op1_nan | op2_nan | r_inf; + ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; + if(exception | r_zero) { + m_r = 0; + m_r[mant_bits-1] = r_nan; + } + ac_int d_r = m_r; + d_r.set_slc(mant_bits, e_r); + d_r[W-1] = r_sign; + ac_std_float r; + r.set_data(d_r); + return r; + } + template + ac_std_float mult(const ac_std_float &op2) const { +#ifndef AC_STD_FLOAT_MULT_OVERRIDE + return mult_generic(op2); +#else + return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_MULT_OVERRIDE(*this, op2); +#endif + } + template + ac_std_float div_generic(const ac_std_float &op2) const { + ac_private::check_supported(); + e_t op1_e, op2_e; + bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero; + bool op1_inf, op1_nan, op2_inf, op2_nan; + mu_t op1_mu, op2_mu; + extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); + op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); + bool r_sign = op1_sign ^ op2_sign; + int ls_op1 = No_SubNormals ? 0 : (unsigned) op1_mu.leading_sign(); + op1_mu <<= ls_op1; + int ls_op2 = No_SubNormals ? 
0 : (unsigned) op2_mu.leading_sign(); + op2_mu <<= ls_op2; + int exp = ac_int(op1_e) - ac_int(op2_e) + !op1_normal - !op2_normal - ls_op1 + ls_op2 + exp_bias; + ac_int q0 = 0; + bool exact = true; + bool div_by_zero = op2_zero; +#ifdef __SYNTHESIS__ + div_by_zero = false; +#endif + if(!div_by_zero) { + AC_STD_FLOAT_FX_DIV_OVERRIDE(op1_mu, op2_mu, q0, exact); + } + ac_int q = q0; + q <<= 1; + int shift_r = min_exp + exp_bias - exp; + bool sticky_bit = !exact; + if(shift_r >= 0) { + typedef ac_int t_t; + t_t shifted_out_bits = q; + shifted_out_bits &= ~((~t_t(0)) << shift_r); + sticky_bit |= !!shifted_out_bits; + q >>= shift_r; + exp += shift_r; + } else { + bool shift_l = !q[mu_bits+2]; + q <<= shift_l; + exp -= shift_l; + } + q[0] = q[0] | sticky_bit; + ac_fixed r_rnd = q; + bool rnd_ovf = r_rnd[mu_bits]; + ac_int m_r = r_rnd.template slc(0); + bool r_normal = r_rnd[mant_bits] | rnd_ovf; + bool r_nan = op1_nan | op2_nan | (op1_zero & op2_zero) | (op1_inf & op2_inf); + bool r_zero = op1_zero | op2_inf; + r_zero |= !r_normal & No_SubNormals; + exp += rnd_ovf; + bool r_inf0 = op1_inf | op2_zero; // this is not affected by rounding + bool r_inf = (!r_zero & (exp > max_exp + exp_bias)) | r_inf0; + if(QR==AC_TRN_ZERO && !r_inf0) { + exp = r_inf ? max_exp + exp_bias : exp; + m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf + r_inf = false; + } + bool exception = r_nan | r_inf; + ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; + if(exception | r_zero) { + m_r = 0; + m_r[mant_bits-1] = r_nan; + } + ac_int d_r = m_r; + d_r.set_slc(mant_bits, e_r); + d_r[W-1] = r_sign; + ac_std_float r; + r.set_data(d_r); + return r; + } + template + ac_std_float div(const ac_std_float &op2) const { +#ifndef AC_STD_FLOAT_DIV_OVERRIDE + return div_generic(op2); +#else + return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_DIV_OVERRIDE(*this, op2); +#endif + } + template + ac_std_float fma_generic(const ac_std_float &op2, const ac_std_float &op3) const { + ac_private::check_supported(); + e_t op1_e, op2_e, op3_e; + bool op1_normal, op1_sign, op1_zero, op2_normal, op2_sign, op2_zero, op3_normal, op3_sign, op3_zero; + bool op1_inf, op1_nan, op2_inf, op2_nan, op3_inf, op3_nan; + mu_t op1_mu, op2_mu, op3_mu; + extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); + op2.extract(op2_mu, op2_e, op2_sign, op2_normal, op2_zero, op2_inf, op2_nan, true, No_SubNormals); + op3.extract(op3_mu, op3_e, op3_sign, op3_normal, op3_zero, op3_inf, op3_nan, true, No_SubNormals); + if(No_SubNormals) + op3_mu &= mu_t(op3_zero ? 0 : -1); + bool mult_sign = (op1_sign ^ op2_sign) | (op1_zero & op2_inf) | (op1_inf & op1_zero); + bool mult_nan = op1_nan | op2_nan | (op1_zero & op2_inf) | (op1_inf & op2_zero); + bool mult_zero = op1_zero | op2_zero; // mult_nan has precedence later on + int mult_exp_b = ac_int(op1_e) + ac_int(op2_e) + !op1_normal + !op2_normal - exp_bias; + mult_exp_b |= ac_int( op1_inf | op2_inf ? -1 : 0 ); + ac_int<2*mu_bits,false> p = op1_mu * op2_mu; + if(No_SubNormals) + p &= ac_int<2*mu_bits,false>(mult_zero ? 0 : -1); + bool mult_inf = op1_inf | op2_inf; + + bool diff_signs = mult_sign ^ op3_sign; + bool toggle_r_sign = mult_sign; + m_t op3_m = diff_signs ? m_t(-op3_mu) : m_t(op3_mu); + unsigned op3_e_b = ac_int(op3_e) + !op3_normal; + + int e_dif = mult_exp_b - op3_e_b; + bool emult_lt_e3 = e_dif < 0; + e_dif = (mult_zero | op3_zero) ? 0 : emult_lt_e3 ? 
-e_dif : e_dif; + + typedef ac_int<2*mu_bits+4,true> add_t; + add_t op3_m_s = op3_m; + op3_m_s <<= mu_bits+1; // mult: ii.ffff, op3: i.ff + add_t p_s = p; + p_s <<= 2; + add_t op_lshift = emult_lt_e3 ? p_s : op3_m_s; + add_t op_no_shift = emult_lt_e3 ? op3_m_s : p_s; + + add_t shifted_out_bits = op_lshift; + shifted_out_bits &= ~((~add_t(0)) << (unsigned) e_dif); + bool sticky_bit = !!shifted_out_bits; + + op_lshift >>= (unsigned) e_dif; + add_t add_r = op_lshift + op_no_shift; + int exp = ( (emult_lt_e3 & !op3_zero) | mult_zero ? op3_e_b : mult_exp_b); + + bool all_sign; + int ls = add_r.leading_sign(all_sign); + // no bit growth of add_r + int max_shift_left = exp + (- min_exp - exp_bias + 2); + bool shift_exponent_limited = ls >= max_shift_left; + int shift_l = shift_exponent_limited ? max_shift_left : ls; + add_r <<= shift_l; + add_r[0] = add_r[0] | sticky_bit; + + ac_fixed r_rnd = add_r; + + typedef ac_int t_h; + t_h t = add_r.template slc(mu_bits+2); + bool rnd_ovf = QR == AC_RND_CONV && !add_r[2*mu_bits+3] && t == t_h(-1); + bool r_neg = r_rnd[mu_bits] ^ rnd_ovf; + bool r_sign = op3_inf ? op3_sign : mult_inf ? mult_sign : r_neg ^ toggle_r_sign; + ac_int r_rnd_i = r_rnd.template slc(0); + bool r_zero = !rnd_ovf & !r_rnd_i; + bool shift_r = rnd_ovf | (r_neg & !r_rnd_i.template slc(0)); + typedef ac_int r_un_t; + r_un_t r_un = r_neg ? (r_un_t) -r_rnd_i : (r_un_t) r_rnd_i; + // get rid of implied bit, assign to ac_int + bool r_normal = r_un[mant_bits] | shift_r; + r_zero |= No_SubNormals & !r_normal; + ac_int m_r = r_un.template slc(0); + exp = (shift_exponent_limited ? min_exp + exp_bias : exp - ls + 2) + shift_r; + bool r_inf = mult_inf | op3_inf | (exp > max_exp + exp_bias); + if(QR==AC_TRN_ZERO) { + exp = r_inf ? max_exp + exp_bias : exp; + m_r |= ac_int<1,true>(-r_inf); // saturate (set all bits to 1) if r_inf + r_inf = false; + } + bool r_nan = op3_nan | mult_nan | ((op3_inf & (op1_inf | op2_inf)) & (op3_sign ^ mult_sign)); + bool exception = op3_inf | mult_inf | op3_nan | mult_nan | r_inf; + ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; + if(exception | r_zero) { + m_r = 0; + m_r[mant_bits-1] = r_nan; + } + ac_int d_r = m_r; + d_r.set_slc(mant_bits, e_r); + d_r[W-1] = r_sign; + ac_std_float r; + r.set_data(d_r); + return r; + } + template + ac_std_float fma(const ac_std_float &op2, const ac_std_float &op3) const { +#ifndef AC_STD_FLOAT_FMA_OVERRIDE + return fma_generic(op2,op3); +#else + return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_FMA_OVERRIDE(*this,op2,op3); +#endif + } + template + ac_std_float sqrt_generic() const { + ac_private::check_supported(); + const bool rnd = QR != AC_TRN_ZERO; // need msb(rounded bits) + const bool rbits = QR != AC_TRN_ZERO; // need bits after msb(rounded bits) + e_t op1_e; + bool op1_normal, op1_sign, op1_zero; + bool op1_inf, op1_nan; + mu_t op1_mu; + extract(op1_mu, op1_e, op1_sign, op1_normal, op1_zero, op1_inf, op1_nan, true, No_SubNormals); + int ls_op1 = No_SubNormals ? 
0 : (unsigned) op1_mu.leading_sign(); + op1_mu <<= ls_op1; + op1_mu[mu_bits-1] = true; // Since it is normalized, zero is captured by op1_zero + + bool exp_odd = (op1_e ^ !op1_normal ^ ls_op1 ^ exp_bias) & 1; + + int exp = ac_int(op1_e) + !op1_normal - ls_op1 - exp_bias; + exp >>= 1; // divide by 2, truncate towards -inf + + ac_int op1_mi = op1_mu; + op1_mi <<= exp_odd; + ac_int sq_rt; + bool sticky_bit = ac::fx_sqrt(op1_mi, sq_rt); + bool r_normal = true; // true for most practical cases on W,E + if(mant_bits > -min_exp) { + int exp_over = min_exp - exp; + if(exp_over > 0) { + if(rbits) { + typedef ac_int t_t; + t_t shifted_out_bits = sq_rt; + shifted_out_bits &= ~((~t_t(0)) << exp_over); + sticky_bit |= !!shifted_out_bits; + } + sq_rt >>= exp_over; + exp = min_exp; + r_normal = false; + } + } + // rounding should not trigger overflow (unless truncate towards +inf which is currently not supported) + ac_fixed sq_rt_rnd = 0; + if(rbits) + sq_rt_rnd[0] = sq_rt_rnd[0] | sticky_bit; + sq_rt_rnd.set_slc(rbits, sq_rt); + ac_fixed sq_rt_fx = sq_rt_rnd; + + ac_int m_r = sq_rt_fx.template slc(0); + bool r_nan = op1_nan | (op1_sign & !op1_zero); + bool r_zero = op1_zero; + r_zero |= !r_normal & No_SubNormals; + bool r_inf = op1_inf; + bool exception = r_nan | r_inf; + exp += exp_bias; + ac_int e_r = exception ? -1 : (r_zero | !r_normal) ? 0 : exp; + if(exception | r_zero) { + m_r = 0; + m_r[mant_bits-1] = r_nan; + } + ac_int d_r = m_r; + d_r.set_slc(mant_bits, e_r); + ac_std_float r; + r.set_data(d_r); + return r; + } + template + ac_std_float sqrt() const { +#ifndef AC_STD_FLOAT_SQRT_OVERRIDE + return sqrt_generic(); +#else + return AC_STD_FLOAT_OVERRIDE_NS AC_STD_FLOAT_SQRT_OVERRIDE(*this); +#endif + } + ac_std_float operator +(const ac_std_float &op2) const { + return add(op2); + } + ac_std_float operator -(const ac_std_float &op2) const { + return sub(op2); + } + ac_std_float operator *(const ac_std_float &op2) const { + return mult(op2); + } + ac_std_float operator /(const ac_std_float &op2) const { + return div(op2); + } + ac_std_float &operator +=(const ac_std_float &op2) { + *this = operator +(op2); + return *this; + } + ac_std_float &operator -=(const ac_std_float &op2) { + *this = operator -(op2); + return *this; + } + ac_std_float &operator *=(const ac_std_float &op2) { + *this = operator *(op2); + } + ac_std_float &operator /=(const ac_std_float &op2) { + *this = operator /(op2); + return *this; + } + bool operator ==(const ac_std_float &op2) const { + return ((d == op2.d) && !isnan()) || (operator !() && op2.operator !()); + } + bool operator !=(const ac_std_float &op2) const { + return !operator ==(op2); + } + bool magnitude_lt(const ac_std_float &op2) const { + return ac_int(d) < ac_int(op2.d); + } + bool neg() const { return d[W-1]; } + bool operator <(const ac_std_float &op2) const { + return + operator !=(op2) && ( (neg() && !op2.neg()) || (!(neg() ^ op2.neg()) && neg() ^ magnitude_lt(op2)) ) + && !isnan() && !op2.isnan(); + } + bool operator >=(const ac_std_float &op2) const { + return + (operator ==(op2) || (!neg() && op2.neg()) || (!(neg() ^ op2.neg()) && !neg() ^ magnitude_lt(op2)) ) + && !isnan() && !op2.isnan(); + } + bool operator >(const ac_std_float &op2) const { + return + operator !=(op2) + && ( (!neg() && op2.neg()) || (!(neg() ^ op2.neg()) && !neg() ^ magnitude_lt(op2)) ) + && !isnan() && !op2.isnan(); + } + bool operator <=(const ac_std_float &op2) const { + return + (operator == (op2) || (neg() && !op2.neg()) || (!neg() ^ op2.neg() && neg() ^ magnitude_lt(op2)) ) + 
&& !isnan() && !op2.isnan(); + } + bool operator !() const { return !ac_int(d); } + ac_std_float operator -() const { + ac_std_float r(*this); + r.d[W-1] = !d[W-1]; + return r; + } + ac_std_float operator +() const { + return ac_std_float(*this); + } + ac_std_float abs() const { + ac_std_float r(*this); + r.d[W-1] = false; + return r; + } + ac_std_float copysign(const ac_std_float &op2) const { + ac_std_float r(*this); + r.d[W-1] = op2.d[W-1]; + return r; + } + bool signbit() const { + return d[W-1]; + } + void set_signbit(bool s) { + d[W-1] = s; + } + ac_std_float ceil() const { + ac_int e = d.template slc(mant_bits); + bool sign = d[W-1]; + if(!d.template slc(0)) + return *this; + if(e < exp_bias) { + return sign ? zero() : one(); + } else { + ac_std_float r(*this); + int e_dif = mant_bits + exp_bias - e; + if((e_dif < 0) | (e == ac_int(-1))) + return r; + else { + typedef ac_int mant_t; + mant_t m = d; + mant_t mask = (~mant_t(0)) << e_dif; + bool non_zero_fractional = !!(m & ~mask); + if(!sign) { + m |= ~mask; + mu_t mu = m + mant_t(non_zero_fractional); + e += mu[mant_bits]; + r.d.set_slc(mant_bits, e); + m = mu; + } + m &= mask; // truncate fractional bits + r.d.set_slc(0, m); + return r; + } + } + } + ac_std_float floor() const { + ac_int e = d.template slc(mant_bits); + bool sign = d[W-1]; + if(!d.template slc(0)) + return *this; + if(e < exp_bias) { + return sign ? -one() : zero(); + } else { + ac_std_float r(*this); + int e_dif = mant_bits + exp_bias - e; + if((e_dif < 0) | (e == ac_int(-1))) + return r; + else { + typedef ac_int mant_t; + mant_t m = d; + mant_t mask = (~mant_t(0)) << e_dif; + bool non_zero_fractional = !!(m & ~mask); + if(sign) { + m |= ~mask; + mu_t mu = m + mant_t(non_zero_fractional); + e += mu[mant_bits]; + r.d.set_slc(mant_bits, e); + m = mu; + } + m &= mask; // truncate fractional bits + r.d.set_slc(0, m); + return r; + } + } + } + ac_std_float trunc() const { + ac_int e = d.template slc(mant_bits); + if(e < exp_bias) { + return zero(); + } else { + ac_std_float r(*this); + int e_dif = mant_bits + exp_bias - e; + if((e_dif < 0) | (e == ac_int(-1))) + return r; + else { + typedef ac_int mant_t; + mant_t m = d; + mant_t mask = (~mant_t(0)) << e_dif; + m &= mask; // truncate fractional bits + r.d.set_slc(0, m); + return r; + } + } + } + ac_std_float round() const { + ac_int e = d.template slc(mant_bits); + if(e < exp_bias-1) { + return zero(); + } else { + ac_std_float r(*this); + int e_dif = mant_bits + exp_bias -1 - e; + if((e_dif < 0) | (e == ac_int(-1))) + return r; + else { + typedef ac_int mant_t; + mant_t m = d; + mant_t mask = (~mant_t(0)) << e_dif; + m |= ~mask; + mu_t mu = m + mant_t(1); + e += mu[mant_bits]; + r.d.set_slc(mant_bits, e); + m = mu; + m &= mask << 1; // truncate fractional bits + r.d.set_slc(0, m); + return r; + } + } + } +}; + +template +inline std::ostream& operator << (std::ostream &os, const ac_std_float &x) { + // for now just print the raw ac_int for it + os << x.data().to_string(AC_HEX); + return os; +} + +namespace ac { + // Type punning: using memcpy to avoid strict aliasing + inline void copy_bits(float f, int *x) { + std::memcpy(x, &f, sizeof(int)); + } + inline void copy_bits(double f, long long *x) { + std::memcpy(x, &f, sizeof(long long)); + } + inline void copy_bits(int x, float *f) { + std::memcpy(f, &x, sizeof(float)); + } + inline void copy_bits(long long x, double *f) { + std::memcpy(f, &x, sizeof(double)); + } + + inline void copy_bits(const ac_std_float<32,8> &x, float *f) { + copy_bits(x.data().to_int(), f); + } 
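  // A short sketch of the copy_bits helpers above, assuming IEEE-754 binary32:
  // the memcpy-based copies are well-defined where a pointer cast such as
  // *(int*)&f would break strict aliasing.
  //   float f = 1.0f;
  //   int bits;
  //   ac::copy_bits(f, &bits); // bits == 0x3f800000
  //   float g;
  //   ac::copy_bits(bits, &g); // g is bit-exact 1.0f again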
+ inline void copy_bits(const ac_std_float<64,11> &x, double *f) { + copy_bits(x.data().to_int64(), f); + } +} + +template +class ac_ieee_float_base { +public: + static const int width = 1 << ((int)Format + 4); + // exponents are {5,8,11,15,19}, but the first three are specialized elsewhere + static const int e_width = 11 + ((int)Format - binary64)*4; // 11, 15, 19 + static const int lls = width >> 6; + typedef long long (data_t)[lls]; + typedef ac_std_float ac_std_float_t; + typedef ac_std_float helper_t; + typedef ac_float ac_float_t; + data_t d; + ac_ieee_float_base() {} + ac_ieee_float_base(const ac_ieee_float_base &f) { + ac::copy_bits(f.d, &d); + } + explicit ac_ieee_float_base(const helper_t &op) { + ac::copy_bits(op.data(), &d); + } + explicit ac_ieee_float_base(double f); +protected: + helper_t to_helper_t() const { + ac_int dat; + ac::copy_bits(d, &dat); + helper_t x; + x.set_data(dat); + return x; + } +public: + void set_data(const data_t &op) { ac::copy_bits(op, &d); } + void set_data(const ac_int &op) { ac::copy_bits(op, &d); } + const data_t &data() const { return d; } + ac_int data_ac_int() const { + ac_int x; + ac::copy_bits(d, &x); + return x; + } + bool signbit() const { return d[lls-1] < 0; } + void set_signbit(bool s) { + ac_int<64,true> t(d[lls-1]); + t[63] = s; + d[lls-1] = t.to_int64(); + } +}; + +template +inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { + // for now print the 128 and 256 as raw ac_int + os << x.data_ac_int().to_string(AC_HEX); + return os; +} + +template<> class ac_ieee_float_base { +public: + static const int width = 16; + static const int e_width = 5; + typedef ac_std_float ac_std_float_t; + typedef short data_t; + typedef ac_std_float helper_t; + typedef ac_float ac_float_t; + data_t d; + ac_ieee_float_base() {} + ac_ieee_float_base(const ac_ieee_float_base &f) : d(f.d) {} + explicit ac_ieee_float_base(const helper_t &op) : d(op.data()) {} + explicit ac_ieee_float_base(float f) : d((short)ac_std_float(f).data().to_int()) {} +protected: + helper_t to_helper_t() const { + helper_t x; + x.set_data(d); + return x; + } +public: + float to_float() const { + ac_std_float_t t; + t.set_data(this->data_ac_int()); + return t.to_float(); + } +#if __cplusplus > 199711L + explicit operator float() const { return to_float(); } +#endif + void set_data(short op) { ac::copy_bits(op, &d); } + void set_data(const ac_int &op) { ac::copy_bits(op, &d); } + const data_t &data() const { return d; } + ac_int data_ac_int() const { + ac_int x; + ac::copy_bits(d, &x); + return x; + } + bool signbit() const { return d < 0; } + void set_signbit(bool s) { + ac_int t(d); + t[width-1] = s; + d = t; + } +}; + +inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { + os << x.to_float(); + return os; +} + +struct float_helper { + float d; + float_helper() {} + float_helper(float f) { d = f; } + float_helper(const float_helper &f) { d = f.d; } + float_helper(const float_helper &f, bool no_subnormals) { + d = no_subnormals && f.fpclassify() == FP_SUBNORMAL ? std::signbit(f.d) ? 
-0.0 : 0.0 : f.d; + } + float_helper(const ac_std_float<32,8> &f) { set_data(f.data().to_int()); } + template + float_helper(const ac_float<25,2,8,Q> &f) : d(f.to_float()) {} + const float &data() const { return d; } + void set_data(int data) { ac::copy_bits(data, &d); } + void set_data(float data) { d = data; } + operator float() const { return d; } + float to_float() const { return d; } + int fpclassify() const { return std::fpclassify(d); } + bool isfinite() const { return std::isfinite(d); } + bool isnormal() const { return std::isnormal(d); } + bool isinf() const { return std::isinf(d); } + bool isnan() const { return std::isnan(d); } + static float nan() { return ac_std_float<32,8>::nan().to_float(); } + static float inf() { return ac_std_float<32,8>::inf().to_float(); } + static float denorm_min() { return ac_std_float<32,8>::denorm_min().to_float(); } + static float min() { return ac_std_float<32,8>::min().to_float(); } + static float max() { return ac_std_float<32,8>::max().to_float(); } + static float epsilon() { return ac_std_float<32,8>::epsilon().to_float(); } + template + float_helper add(const float_helper &op2) const { + ac_private::check_supported2(); + return float_helper( float_helper(*this, No_SubNormals) + float_helper(op2, No_SubNormals), No_SubNormals); + } + template + float_helper sub(const float_helper &op2) const { + ac_private::check_supported2(); + return float_helper( float_helper(*this, No_SubNormals) - float_helper(op2, No_SubNormals), No_SubNormals); + } + template + float_helper mult(const float_helper &op2) const { + ac_private::check_supported2(); + return float_helper( float_helper(*this, No_SubNormals) * float_helper(op2, No_SubNormals), No_SubNormals); + } + template + float_helper div(const float_helper &op2) const { + ac_private::check_supported2(); + return float_helper( float_helper(*this, No_SubNormals) / float_helper(op2, No_SubNormals), No_SubNormals); + } + template + float_helper fma(const float_helper &op2, const float_helper &op3) const { + ac_private::check_supported2(); + return float_helper( ::fmaf(float_helper(*this, No_SubNormals), float_helper(op2, No_SubNormals), float_helper(op3, No_SubNormals)), No_SubNormals); + } + template + float_helper sqrt() const { + ac_private::check_supported2(); + return float_helper( ::sqrtf(float_helper(*this, No_SubNormals)), No_SubNormals); + } + float_helper ceil() const { return float_helper(std::ceil(d)); } + float_helper floor() const { return float_helper(std::floor(d)); } + float_helper trunc() const { return float_helper(::truncf(d)); } + float_helper round() const { return float_helper(::roundf(d)); } +}; + +template<> class ac_ieee_float_base { +public: + static const int width = 32; + static const int e_width = 8; + typedef ac_std_float ac_std_float_t; +#ifdef AC_IEEE_FLOAT_USE_BUILTIN + typedef float data_t; + typedef float_helper helper_t; +#else + typedef int data_t; + typedef ac_std_float helper_t; +#endif + typedef ac_float ac_float_t; + data_t d; + ac_ieee_float_base() {} + ac_ieee_float_base(const ac_ieee_float_base &f) : d(f.d) {} + explicit ac_ieee_float_base(const helper_t &op) : d(op.data()) {} + explicit ac_ieee_float_base(float f) { ac::copy_bits(f, &d); } +protected: + helper_t to_helper_t() const { + helper_t x; + x.set_data(d); + return x; + } +public: +#if __cplusplus > 199711L + explicit operator float() const { + float f; + ac::copy_bits(d, &f); + return f; + } +#endif + float to_float() const { + float f; + ac::copy_bits(d, &f); + return f; + } + void set_data(int op) { 
ac::copy_bits(op, &d); } + void set_data(float op) { ac::copy_bits(op, &d); } + void set_data(const ac_int &op) { ac::copy_bits(op, &d); } + const data_t &data() const { return d; } + ac_int data_ac_int() const { + ac_int x; + ac::copy_bits(d, &x); + return x; + } + bool signbit() const { + int x; ac::copy_bits(d, &x); + return x < 0; + } + void set_signbit(bool s) { + ac_int t; + ac::copy_bits(d, &t); + t[width-1] = s; + ac::copy_bits(t, &d); + } +}; + +inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { + os << x.to_float(); + return os; +} + +struct double_helper { + double d; + double_helper() {} + double_helper(double f) { d = f; } + double_helper(const float_helper &f) { d = f.d; } + double_helper(const double_helper &f, bool no_subnormals) { + d = no_subnormals && f.fpclassify() == FP_SUBNORMAL ? std::signbit(f.d) ? -0.0 : 0.0 : f.d; + } + double_helper(const ac_std_float<64,11> &f) { set_data(f.data().to_int64()); } + template + double_helper(const ac_float<54,2,11,Q> &f) : d(f.to_double()) {} + const double &data() const { return d; } + void set_data(long long data) { + ac::copy_bits(data, &d); + } + void set_data(double data) { d = data; } + operator double() const { return d; } + double to_double() const { return d; } + int fpclassify() const { return std::fpclassify(d); } + bool isfinite() const { return std::isfinite(d); } + bool isnormal() const { return std::isnormal(d); } + bool isinf() const { return std::isinf(d); } + bool isnan() const { return std::isnan(d); } + static double nan() { return ac_std_float<64,11>::nan().to_double(); } + static double inf() { return ac_std_float<64,11>::inf().to_double(); } + static double denorm_min() { return ac_std_float<64,11>::denorm_min().to_double(); } + static double min() { return ac_std_float<64,11>::min().to_double(); } + static double max() { return ac_std_float<64,11>::max().to_double(); } + static double epsilon() { return ac_std_float<64,11>::epsilon().to_double(); } + template + double_helper add(const double_helper &op2) const { + ac_private::check_supported2(); + return double_helper( double_helper(*this, No_SubNormals) + double_helper(op2, No_SubNormals), No_SubNormals); + } + template + double_helper sub(const double_helper &op2) const { + ac_private::check_supported2(); + return double_helper( double_helper(*this, No_SubNormals) - double_helper(op2, No_SubNormals), No_SubNormals); + } + template + double_helper mult(const double_helper &op2) const { + ac_private::check_supported2(); + return double_helper( double_helper(*this, No_SubNormals) * double_helper(op2, No_SubNormals), No_SubNormals); + } + template + double_helper div(const double_helper &op2) const { + ac_private::check_supported2(); + return double_helper( double_helper(*this, No_SubNormals) / double_helper(op2, No_SubNormals), No_SubNormals); + } + template + double_helper fma(const double_helper &op2, const double_helper &op3) const { + ac_private::check_supported2(); + return double_helper( ::fma((double) double_helper(*this, No_SubNormals), (double) double_helper(op2, No_SubNormals), (double) double_helper(op3, No_SubNormals)), No_SubNormals); + } + template + double_helper sqrt() const { + ac_private::check_supported2(); + return double_helper( ::sqrt((double) double_helper(*this, No_SubNormals)), No_SubNormals); + } + double_helper ceil() const { return double_helper(std::ceil(d)); } + double_helper floor() const { return double_helper(std::floor(d)); } + double_helper trunc() const { return double_helper(::trunc(d)); } + 
double_helper round() const { return double_helper(::round(d)); } +}; + +template<> class ac_ieee_float_base { +public: + static const int width = 64; + static const int e_width = 11; + typedef ac_std_float ac_std_float_t; +#ifdef AC_IEEE_FLOAT_USE_BUILTIN + typedef double data_t; + typedef double_helper helper_t; +#else + typedef long long data_t; + typedef ac_std_float helper_t; +#endif + typedef ac_float ac_float_t; + data_t d; + ac_ieee_float_base() {} + ac_ieee_float_base(const ac_ieee_float_base &f) : d(f.d) {} + explicit ac_ieee_float_base(const helper_t &op) : d(op.data()) {} + explicit ac_ieee_float_base(double f) { ac::copy_bits(f, &d); } +protected: + helper_t to_helper_t() const { + helper_t x; + x.set_data(d); + return x; + } +public: +#if __cplusplus > 199711L + explicit operator double() const { + double f; + ac::copy_bits(d, &f); + return f; + } +#endif + double to_double() const { + double f; + ac::copy_bits(d, &f); + return f; + } + void set_data(long long op) { ac::copy_bits(op, &d); } + void set_data(double op) { ac::copy_bits(op, &d); } + void set_data(const ac_int &op) { ac::copy_bits(op, &d); } + const data_t &data() const { return d; } + ac_int data_ac_int() const { + ac_int x; + ac::copy_bits(d, &x); + return x; + } + bool signbit() const { + long long x; ac::copy_bits(d, &x); + return x < 0; + } + void set_signbit(bool s) { + ac_int t; + ac::copy_bits(d, &t); + t[width-1] = s; + ac::copy_bits(t, &d); + } +}; + +inline std::ostream& operator << (std::ostream &os, const ac_ieee_float_base &x) { + os << x.to_double(); + return os; +} + +namespace ac_private { + template + struct ac_ieee_float_constructor {}; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; + template<> struct ac_ieee_float_constructor { + typedef int type_explicit; + }; +} + +template +class ac_ieee_float : public ac_ieee_float_base { +public: + typedef ac_ieee_float_base Base; + template + struct rt_T { + typedef typename ac_private::rt_closed_T::type mult; + typedef typename ac_private::rt_closed_T::type plus; + typedef typename ac_private::rt_closed_T::type minus; + typedef typename ac_private::rt_closed_T::type minus2; + typedef typename ac_private::rt_closed_T::type logic; + typedef typename ac_private::rt_closed_T::type div; + typedef typename ac_private::rt_closed_T::type div2; + }; + struct rt_unary { + typedef ac_ieee_float neg; + typedef ac_ieee_float mag_sqr; + typedef ac_ieee_float mag; + }; + static const int width = Base::width; + static const int e_width = Base::e_width; + static const int lls = width >> 6; + typedef typename Base::data_t data_t; + typedef typename Base::helper_t helper_t; + typedef typename Base::ac_float_t ac_float_t; + typedef ac_std_float ac_std_float_t; +public: + static ac_ieee_float nan() { return ac_ieee_float(helper_t::nan()); } + static ac_ieee_float inf() { return ac_ieee_float(helper_t::inf()); } + 
static ac_ieee_float denorm_min() { return ac_ieee_float(helper_t::denorm_min()); } + static ac_ieee_float min() { return ac_ieee_float(helper_t::min()); } + static ac_ieee_float max() { return ac_ieee_float(helper_t::max()); } + static ac_ieee_float epsilon() { return ac_ieee_float(helper_t::epsilon()); } + static ac_ieee_float zero() { return ac_ieee_float(ac_std_float_t::zero()); } + static ac_ieee_float one() { return ac_ieee_float(ac_std_float_t::one()); } + ac_ieee_float() {} +private: + ac_ieee_float(const Base &f) : Base(f) {} +public: + ac_ieee_float(const ac_std_float &f) : Base(f) {} + ac_ieee_float(const ac_ieee_float &f) : Base(f) {} + template + explicit ac_ieee_float(const ac_ieee_float &f) : Base(ac_std_float_t(f.to_ac_std_float())) {} + template + explicit ac_ieee_float(const ac_std_float &f) : Base(ac_std_float_t(f)) {} + explicit ac_ieee_float(const ac::bfloat16 &f); + explicit ac_ieee_float(const ac_float_t &f) : Base(ac_std_float_t(f)) {} + template + explicit ac_ieee_float(const ac_fixed &fx) : Base(ac_std_float_t(fx)) {} + template + explicit ac_ieee_float(const ac_float &f) : Base(ac_std_float_t(f)) {} + template + ac_ieee_float to_ac_ieee_float() const { return ac_ieee_float(*this); } + const ac_float_t to_ac_float() const { + return to_ac_std_float().to_ac_float(); + } + const ac_std_float to_ac_std_float() const { + ac_std_float r; + r.set_data(data_ac_int()); + return r; + } + template + ac_fixed convert_to_ac_fixed(bool map_inf=false) const { + return to_ac_std_float().template convert_to_ac_fixed(map_inf); + } + void set_data(const data_t &data) { + Base::set_data(data); + } + const ac_int data_ac_int() const { return Base::data_ac_int(); } + const data_t &data() const { return Base::d; } + template + ac_ieee_float(const T &f, typename ac_private::template ac_ieee_float_constructor::type d = 0) : Base(ac_std_float_t(f)) {} + template + explicit ac_ieee_float(const T &f, typename ac_private::template ac_ieee_float_constructor::type_explicit d = 0) : Base(ac_std_float_t(f)) {} + explicit ac_ieee_float(int x) { + *this = ac_ieee_float(ac_fixed<32,32,true>(x)); + } + explicit ac_ieee_float(long long x) { + *this = ac_ieee_float(ac_fixed<64,64,true>(x)); + } + int fpclassify() const { return Base::to_helper_t().fpclassify(); } + bool isfinite() const { return Base::to_helper_t().isfinite(); } + bool isnormal() const { return Base::to_helper_t().isnormal(); } + bool isinf() const { return Base::to_helper_t().isinf(); } + bool isnan() const { return Base::to_helper_t().isnan(); } + + template + ac_ieee_float add(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t().template add(op2.Base::to_helper_t()))); + } + template + ac_ieee_float sub(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t().template sub(op2.Base::to_helper_t()))); + } + template + ac_ieee_float mult(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t().template mult(op2.Base::to_helper_t()))); + } + template + ac_ieee_float div(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t().template div(op2.Base::to_helper_t()))); + } + template + ac_ieee_float fma(const ac_ieee_float &op2, const ac_ieee_float &op3) const { + return ac_ieee_float(Base(Base::to_helper_t().template fma(op2.Base::to_helper_t(), op3.Base::to_helper_t()))); + } + template + ac_ieee_float sqrt() const { + return ac_ieee_float(Base(Base::to_helper_t().template sqrt())); + } + + ac_ieee_float operator +(const ac_ieee_float 
&op2) const { + return ac_ieee_float(Base(Base::to_helper_t() + op2.Base::to_helper_t())); + } + ac_ieee_float operator -(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t() - op2.Base::to_helper_t())); + } + ac_ieee_float operator *(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t() * op2.Base::to_helper_t())); + } + ac_ieee_float operator /(const ac_ieee_float &op2) const { + return ac_ieee_float(Base(Base::to_helper_t() / op2.Base::to_helper_t())); + } + + ac_ieee_float &operator +=(const ac_ieee_float &op2) { + return *this = operator +(op2); + } + ac_ieee_float &operator -=(const ac_ieee_float &op2) { + return *this = operator -(op2); + } + ac_ieee_float &operator *=(const ac_ieee_float &op2) { + return *this = operator *(op2); + } + ac_ieee_float &operator /=(const ac_ieee_float &op2) { + return *this = operator /(op2); + } + + bool operator ==(const ac_ieee_float &op2) const { + return Base::to_helper_t() == op2.Base::to_helper_t(); + } + bool operator !=(const ac_ieee_float &op2) const { + return Base::to_helper_t() != op2.Base::to_helper_t(); + } + bool operator <(const ac_ieee_float &op2) const { + return Base::to_helper_t() < op2.Base::to_helper_t(); + } + bool operator >=(const ac_ieee_float &op2) const { + return Base::to_helper_t() >= op2.Base::to_helper_t(); + } + bool operator >(const ac_ieee_float &op2) const { + return Base::to_helper_t() > op2.Base::to_helper_t(); + } + bool operator <=(const ac_ieee_float &op2) const { + return Base::to_helper_t() <= op2.Base::to_helper_t(); + } + + ac_ieee_float operator -() const { + ac_ieee_float r(*this); + r.set_signbit(!this->signbit()); + return r; + } + ac_ieee_float operator +() const { + return ac_ieee_float(*this); + } + ac_ieee_float abs() const { + ac_ieee_float r(*this); + r.set_signbit(false); + return r; + } + ac_ieee_float copysign(const ac_ieee_float &op2) const { + ac_ieee_float r(*this); + r.set_signbit(this->signbit()); + return r; + } + bool signbit() const { return Base::signbit(); } + ac_ieee_float add(const ac_ieee_float &op1, const ac_ieee_float &op2) { + return *this = op1 + op2; + } + ac_ieee_float ceil() const { + return ac_ieee_float(Base(Base::to_helper_t().ceil())); + } + ac_ieee_float floor() const { + return ac_ieee_float(Base(Base::to_helper_t().floor())); + } + ac_ieee_float trunc() const { + return ac_ieee_float(Base(Base::to_helper_t().trunc())); + } + ac_ieee_float round() const { + return ac_ieee_float(Base(Base::to_helper_t().round())); + } + ac_ieee_float sub(const ac_ieee_float &op1, const ac_ieee_float &op2) { + return *this = op1 - op2; + } + ac_ieee_float mult(const ac_ieee_float &op1, const ac_ieee_float &op2) { + return *this = op1 * op2; + } + ac_ieee_float div(const ac_ieee_float &op1, const ac_ieee_float &op2) { + return *this = op1 / op2; + } +}; + +template +inline std::ostream& operator << (std::ostream &os, const ac_ieee_float &x) { + os << (const ac_ieee_float_base&) x; + return os; +} + +namespace ac { +class bfloat16 { +public: + template + struct rt_T { + typedef typename ac_private::rt_closed_T::type mult; + typedef typename ac_private::rt_closed_T::type plus; + typedef typename ac_private::rt_closed_T::type minus; + typedef typename ac_private::rt_closed_T::type minus2; + typedef typename ac_private::rt_closed_T::type logic; + typedef typename ac_private::rt_closed_T::type div; + typedef typename ac_private::rt_closed_T::type div2; + }; + struct rt_unary { + typedef bfloat16 neg; + typedef bfloat16 mag_sqr; + typedef 
bfloat16 mag; + }; + static const int width = 16; + static const int e_width = 8; + static bfloat16 nan() { return bfloat16(helper_t::nan()); } + static bfloat16 inf() { return bfloat16(helper_t::inf()); } + static bfloat16 denorm_min() { return bfloat16(helper_t::denorm_min()); } + static bfloat16 min() { return bfloat16(helper_t::min()); } + static bfloat16 max() { return bfloat16(helper_t::max()); } + static bfloat16 epsilon() { return bfloat16(helper_t::epsilon()); } + static bfloat16 zero() { return bfloat16(ac_std_float_t::zero()); } + static bfloat16 one() { return bfloat16(ac_std_float_t::one()); } + typedef ac_std_float helper_t; + typedef short data_t; + typedef ac_float ac_float_t; + typedef ac_std_float ac_std_float_t; + data_t d; + bfloat16() {} + bfloat16(const bfloat16 &f) : d(f.d) {} + bfloat16(const ac_std_float_t &op) : d(op.data()) {} + bfloat16(float f) { int x; ac::copy_bits(f, &x); d = (short) (x >> 16); } + template + explicit bfloat16(const ac_std_float &f) { + *this = f.template convert(); + } + template + explicit bfloat16(const ac_std_float &f) { + *this = f.template convert(); + } + template + explicit bfloat16(const ac_ieee_float &f) { + *this = f.to_ac_std_float().template convert(); + } + template + explicit bfloat16(const ac_fixed &fx) { + ac_std_float_t x; + x.assign_from(fx); + *this = x; + } +private: + const helper_t to_helper_t() const { + helper_t x; + x.set_data(d); + return x; + } +public: + const ac_std_float_t to_ac_std_float() const { + ac_std_float_t x; + x.set_data(d); + return x; + } + const ac_float_t to_ac_float() const { + return ac_std_float_t().to_ac_float(); + } + template + ac_fixed convert_to_ac_fixed(bool map_inf=false) const { + return to_ac_std_float().template convert_to_ac_fixed(map_inf); + } + float to_float() const { + return to_ac_std_float().to_float(); + } + double to_double() const { + return to_ac_std_float().to_double(); + } + // operator is efficient since E is identical and mantissa is longer +#if __cplusplus > 199711L + explicit operator float() const { return to_float(); } +#endif + int fpclassify() const { return to_helper_t().fpclassify(); } + bool isfinite() const { return to_helper_t().isfinite(); } + bool isnormal() const { return to_helper_t().isnormal(); } + bool isinf() const { return to_helper_t().isinf(); } + bool isnan() const { return to_helper_t().isnan(); } + void set_data(short op) { ac::copy_bits(op, &d); } + void set_data(const ac_int &op) { ac::copy_bits(op, &d); } + const data_t &data() const { return d; } + ac_int<16,true> data_ac_int() const { return ac_int<16,true>(d); } + + // mirroed most constructors in tensorflow implementation (except template version) + // tensorflow uses static_cast + // this implementation goes through ac_std_float so there is no dependency on rounding mode +// template +// explicit bfloat16(const T& val) { *this = bfloat16(static_cast(val)); } + explicit bfloat16(unsigned short val) { + ac_std_float_t t; + t.assign_from( ac_int<16,false>(val) ); + *this = t; + } + explicit bfloat16(int val) { + ac_std_float_t t; + t.assign_from( ac_int<32,true>(val) ); + *this = t; + } + explicit bfloat16(unsigned int val) { + ac_std_float_t t; + t.assign_from( ac_int<32,false>(val) ); + *this = t; + } + explicit bfloat16(long val) { + const int long_w = ac_private::long_w; + ac_std_float_t t; + t.assign_from( ac_int(val) ); + *this = t; + } + explicit bfloat16(long long val) { + ac_std_float_t t; + t.assign_from( ac_int<64,false>(val) ); + *this = t; + } + explicit bfloat16(double val) { 
*this = bfloat16(ac_ieee_float(val)); } + + template + bfloat16 add(const bfloat16 &op2) const { + return bfloat16(to_helper_t().add(op2.to_helper_t())); + } + template + bfloat16 sub(const bfloat16 &op2) const { + return bfloat16(to_helper_t().sub(op2.to_helper_t())); + } + template + bfloat16 mult(const bfloat16 &op2) const { + return bfloat16(to_helper_t().mult(op2.to_helper_t())); + } + template + bfloat16 div(const bfloat16 &op2) const { + return bfloat16(to_helper_t().div(op2.to_helper_t())); + } + template + bfloat16 fma(const bfloat16 &op2, const bfloat16 &op3) const { + return bfloat16(to_helper_t().fma(op2.to_helper_t(), op3.to_helper_t())); + } + template + bfloat16 sqrt() const { + return bfloat16(to_helper_t().sqrt()); + } + + bfloat16 operator +(const bfloat16 &op2) const { + return bfloat16(to_helper_t().add(op2.to_helper_t())); + } + bfloat16 operator -(const bfloat16 &op2) const { + return bfloat16(to_helper_t().sub(op2.to_helper_t())); + } + bfloat16 operator *(const bfloat16 &op2) const { + return bfloat16(to_helper_t().mult(op2.to_helper_t())); + } + bfloat16 operator /(const bfloat16 &op2) const { + return bfloat16(to_helper_t().div(op2.to_helper_t())); + } + bfloat16 &operator +=(const bfloat16 &op2) { + return *this = operator +(op2); + } + bfloat16 &operator -=(const bfloat16 &op2) { + return *this = operator -(op2); + } + bfloat16 &operator *=(const bfloat16 &op2) { + return *this = operator *(op2); + } + bfloat16 &operator /=(const bfloat16 &op2) { + return *this = operator /(op2); + } + + bool operator ==(const bfloat16 &op2) const { + return to_helper_t() == op2.to_helper_t(); + } + bool operator !=(const bfloat16 &op2) const { + return to_helper_t() != op2.to_helper_t(); + } + bool operator <(const bfloat16 &op2) const { + return to_helper_t() < op2.to_helper_t(); + } + bool operator >=(const bfloat16 &op2) const { + return to_helper_t() >= op2.to_helper_t(); + } + bool operator >(const bfloat16 &op2) const { + return to_helper_t() > op2.to_helper_t(); + } + bool operator <=(const bfloat16 &op2) const { + return to_helper_t() <= op2.to_helper_t(); + } + + bfloat16 operator -() const { + bfloat16 r(*this); + r.set_signbit(!this->signbit()); + return r; + } + bfloat16 operator +() const { + return bfloat16(*this); + } + bfloat16 abs() const { + bfloat16 r(*this); + r.set_signbit(false); + return r; + } + bfloat16 copysign(const bfloat16 &op2) const { + bfloat16 r(*this); + r.set_signbit(this->signbit()); + return r; + } + bool signbit() const { return d < 0; } + void set_signbit(bool s) { + ac_int t(d); + t[width-1] = s; + d = t; + } + bfloat16 ceil() const { return to_helper_t().ceil(); } + bfloat16 floor() const { return to_helper_t().floor(); } + bfloat16 trunc() const { return to_helper_t().trunc(); } + bfloat16 round() const { return to_helper_t().round(); } +}; + +inline std::ostream& operator << (std::ostream &os, const ac::bfloat16 &x) { + os << x.to_float(); + return os; +} + +} + +template +template +inline ac_std_float::ac_std_float(const ac_ieee_float &f) { + *this = ac_std_float(f.to_ac_std_float()); +} + +template +inline ac_std_float::ac_std_float(const ac::bfloat16 &f) { + *this = ac_std_float(f.to_ac_std_float()); +} + +template +inline ac_ieee_float::ac_ieee_float(const ac::bfloat16 &f) { + *this = ac_ieee_float(f.to_ac_std_float()); +} + +typedef ac_ieee_float ac_ieee_float16; +typedef ac_ieee_float ac_ieee_float32; +typedef ac_ieee_float ac_ieee_float64; +typedef ac_ieee_float ac_ieee_float128; +typedef ac_ieee_float ac_ieee_float256; + + 
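// As a quick orientation on the typedefs above (a sketch, assuming the
// definitions earlier in this header): ac_ieee_float32 is 32 bits wide with
// an 8-bit exponent, and ac_std_float<32,8> is its helper type, e.g.
//   ac_ieee_float32 x(0.25f);
//   ac_std_float<32,8> s = x.to_ac_std_float();
//   float back = s.to_float(); // bit-exact round trip to 0.25f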
+#ifdef __AC_NAMESPACE
+}
+#endif
+
+// Global functions for ac_ieee_float
+namespace std {
+#ifdef __AC_NAMESPACE
+using namespace __AC_NAMESPACE;
+#endif
+template<ac_ieee_float_format Format>
+inline ac_ieee_float<Format> abs(const ac_ieee_float<Format> &x) { return x.abs(); }
+template<ac_ieee_float_format Format>
+inline ac_ieee_float<Format> fabs(const ac_ieee_float<Format> &x) { return x.abs(); }
+
+template<ac_ieee_float_format Format>
+inline ac_ieee_float<Format> copysign(const ac_ieee_float<Format> &x, const ac_ieee_float<Format> &y) { return x.copysign(y); }
+
+template<ac_ieee_float_format Format>
+inline int fpclassify(const ac_ieee_float<Format> &x) { return x.fpclassify(); }
+template<ac_ieee_float_format Format>
+inline bool isfinite(const ac_ieee_float<Format> &x) { return x.isfinite(); }
+template<ac_ieee_float_format Format>
+inline bool isnormal(const ac_ieee_float<Format> &x) { return x.isnormal(); }
+template<ac_ieee_float_format Format>
+inline bool isinf(const ac_ieee_float<Format> &x) { return x.isinf(); }
+template<ac_ieee_float_format Format>
+inline bool isnan(const ac_ieee_float<Format> &x) { return x.isnan(); }
+
+// Don't do "long double" versions since they are 80 bits, i.e. an extended precision format
+// TODO: fmod, fmodf, fmodl
+// TODO: fmod, remainder, remquo, fma, fmax, fmin, fdim
+// remainder(x,y), x - n*y, where n = x/y rounded to the nearest integer (RND_CONV)
+// remquo(x,y, int *quo): returns same as remainder; unclear what quo is; also NaN, inf, etc.
+// fmax, fmin: if one number is NaN, the other is returned
+// fdim(x,y) returns max(x-y,0); if x or y is NaN, a NaN is returned; if the result overflows, HUGE_VAL is returned
+// TODO: ceil, floor, trunc, round, lround, nearbyint, rint, lrint, llround, llrint
+// if x is +0, -0, NaN or Inf, x is returned
+// ceil(x), floor(x), trunc(x)
+// round(x) : RND_INF
+// nearbyint: depends on rounding mode
+// rint, same as nearbyint, but may raise inexact exception (FE_INEXACT)
+// TODO: frexp, ldexp, modf, nextafter, nexttoward, copysign
+// modf(x, *iptr), modff: break into integral (*iptr) and fractional (returned) values
+// Don't cause exception: isgreater, isgreaterequal, isless, islessequal, islessgreater, isunordered
+// isunordered: x or y is NaN
+template<ac_ieee_float_format Format>
+inline bool signbit(const ac_ieee_float<Format> &x) { return x.signbit(); }
+
+// Global functions for bfloat16
+inline bool signbit(const ac::bfloat16 &x) { return x.signbit(); }
+
+inline int fpclassify(const ac::bfloat16 &x) { return x.fpclassify(); }
+inline bool isfinite(const ac::bfloat16 &x) { return x.isfinite(); }
+inline bool isnormal(const ac::bfloat16 &x) { return x.isnormal(); }
+inline bool isinf(const ac::bfloat16 &x) { return x.isinf(); }
+inline bool isnan(const ac::bfloat16 &x) { return x.isnan(); }
+}
+
+#undef __AC_DATA_PRIVATE
+#undef AC_STD_FLOAT_FX_DIV_OVERRIDE
+
+#endif
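The std:: overloads above give generated code the usual <cmath>-style entry points for the AC float types. A minimal host-side sketch (assuming this header is ac_std_float.h, as in the upstream AC datatypes library):

    #include "ac_std_float.h"

    int main() {
        ac_ieee_float32 x(1.5f);
        ac_ieee_float32 y = x * x; // 2.25, computed through the helper_t arithmetic above
        ac::bfloat16 b(0.5f);      // bfloat16 keeps float's 8-bit exponent, 7-bit stored mantissa
        bool ok = std::isfinite(y) && !std::signbit(y) && std::isnormal(b);
        return ok ? 0 : 1;
    }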
diff --git a/hls4ml/templates/quartus/ac_types/stream.h b/hls4ml/templates/quartus/ac_types/stream.h
index 7084644994..b19ad74d66 100644
--- a/hls4ml/templates/quartus/ac_types/stream.h
+++ b/hls4ml/templates/quartus/ac_types/stream.h
@@ -1,36 +1,36 @@
-#ifndef NNET_STREAM_H
-#define NNET_STREAM_H
-
-#include <deque>
-
-namespace nnet {
-
-/*
-* A struct with the same high-level functionality as Intel's HLS ihc::stream
-* This struct is used during GCC compilation / hls4ml model.predict(...)
-* This is because GCC does not have access to HLS source files (ihc::stream)
-* Software-wise, this struct behaves like a first-in, first-out (FIFO) buffer
-* However, it cannot be used for HLS synthesis, since it uses dynamic memory allocation (deque)
-*/
-template<typename T>
-struct stream {
-  private:
-    std::deque<T> _data;
-
-  public:
-    stream() {}
-
-    T read() {
-        T element = _data.front();
-        _data.pop_front();
-        return element;
-    }
-
-    void write(const T& element) {
-        _data.push_back(element);
-    }
-};
-
-}
-
+#ifndef NNET_STREAM_H
+#define NNET_STREAM_H
+
+#include <deque>
+
+namespace nnet {
+
+/*
+* A struct with the same high-level functionality as Intel's HLS ihc::stream
+* This struct is used during GCC compilation / hls4ml model.predict(...)
+* This is because GCC does not have access to HLS source files (ihc::stream)
+* Software-wise, this struct behaves like a first-in, first-out (FIFO) buffer
+* However, it cannot be used for HLS synthesis, since it uses dynamic memory allocation (deque)
+*/
+template<typename T>
+struct stream {
+  private:
+    std::deque<T> _data;
+
+  public:
+    stream() {}
+
+    T read() {
+        T element = _data.front();
+        _data.pop_front();
+        return element;
+    }
+
+    void write(const T& element) {
+        _data.push_back(element);
+    }
+};
+
+}
+
 #endif
\ No newline at end of file
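Because nnet::stream is a plain std::deque-backed FIFO, its software semantics are easy to pin down; a minimal CPU-side sketch (this type exists only for emulation and is never synthesized):

    #include "stream.h"

    int main() {
        nnet::stream<int> s; // software stand-in for ihc::stream
        s.write(1);
        s.write(2);
        int a = s.read();    // 1 - elements leave in the order they arrived
        int b = s.read();    // 2
        return (a == 1 && b == 2) ? 0 : 1;
    }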
diff --git a/hls4ml/templates/quartus/firmware/defines.h b/hls4ml/templates/quartus/firmware/defines.h
index 49781dc963..c3fe4ec402 100644
--- a/hls4ml/templates/quartus/firmware/defines.h
+++ b/hls4ml/templates/quartus/firmware/defines.h
@@ -1,47 +1,47 @@
-#ifndef DEFINES_H_
-#define DEFINES_H_
-
-/*
- * Intel HLS makes use of three streaming interfaces:
- * (1) stream_in - used as the main input to a component
- * (2) stream_out - used as the main output of a component
- * (3) stream - allows both reading and writing; used for inter-component connections
- * ihc::stream has an implicitly deleted constructor and therefore cannot be used as the output of a function/component
- * Therefore, variables of type 'stream' are always passed by reference
- */
-
-#ifndef __INTELFPGA_COMPILER__
-
-#include "ac_fixed.h"
-#include "ac_int.h"
-#define hls_register
-
-#include "stream.h"
-template<typename T> using stream = nnet::stream<T>;
-template<typename T> using stream_in = nnet::stream<T>;
-template<typename T> using stream_out = nnet::stream<T>;
-
-#else
-
-#include "HLS/ac_fixed.h"
-#include "HLS/ac_int.h"
-#include "HLS/hls.h"
-
-template<typename T> using stream = ihc::stream<T>;
-template<typename T> using stream_in = ihc::stream_in<T>;
-template<typename T> using stream_out = ihc::stream_out<T>;
-
-#endif
-
-// Include nnet::array - a custom array-like struct, mainly used with io_stream
-#include "nnet_utils/nnet_types.h"
-
-// hls-fpga-machine-learning insert numbers
-
-// hls-fpga-machine-learning insert layer-precision
-
-#define DIV_ROUNDUP(n, d) ((n + d - 1) / d)
-#define MIN(n, d) (n > d ? d : n)
-#define MAX(n, d) (n < d ? d : n)
-
-#endif
+#ifndef DEFINES_H_
+#define DEFINES_H_
+
+/*
+ * Intel HLS makes use of three streaming interfaces:
+ * (1) stream_in - used as the main input to a component
+ * (2) stream_out - used as the main output of a component
+ * (3) stream - allows both reading and writing; used for inter-component connections
+ * ihc::stream has an implicitly deleted constructor and therefore cannot be used as the output of a function/component
+ * Therefore, variables of type 'stream' are always passed by reference
+ */
+
+#ifndef __INTELFPGA_COMPILER__
+
+#include "ac_fixed.h"
+#include "ac_int.h"
+#define hls_register
+
+#include "stream.h"
+template<typename T> using stream = nnet::stream<T>;
+template<typename T> using stream_in = nnet::stream<T>;
+template<typename T> using stream_out = nnet::stream<T>;
+
+#else
+
+#include "HLS/ac_fixed.h"
+#include "HLS/ac_int.h"
+#include "HLS/hls.h"
+
+template<typename T> using stream = ihc::stream<T>;
+template<typename T> using stream_in = ihc::stream_in<T>;
+template<typename T> using stream_out = ihc::stream_out<T>;
+
+#endif
+
+// Include nnet::array - a custom array-like struct, mainly used with io_stream
+#include "nnet_utils/nnet_types.h"
+
+// hls-fpga-machine-learning insert numbers
+
+// hls-fpga-machine-learning insert layer-precision
+
+#define DIV_ROUNDUP(n, d) ((n + d - 1) / d)
+#define MIN(n, d) (n > d ? d : n)
+#define MAX(n, d) (n < d ? d : n)
+
+#endif
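The helper macros at the end of defines.h are used by the generated layers for loop and buffer sizing; DIV_ROUNDUP is a ceiling division, for example:

    DIV_ROUNDUP(10, 3) // (10 + 3 - 1) / 3 = 4
    MIN(10, 3)         // 3
    MAX(10, 3)         // 10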
goes here -* An important distinction is made between io_stream and io_parallel: -* (1) io_parallel: - - Top-level function takes a struct containing an array as function argument - - Returns a struct containing an array - the prediction - (2) io_stream: - - Top-level function is 'void' - no return value - - Instead, both the input and output are passed by reference - - This is due the HLS Streaming Interfaces; stream cannot be copied (implicitly deleted copy constructor) -* This distinction is handled in quartus_writer.py -*/ -// hls-fpga-machine-learning instantiate GCC top-level -#else -// Maximum initiation interval, concurrency and frequency for HLS syntheis are defined here -// hls-fpga-machine-learning insert cpragmas - -/* - * The top-level function used during HLS Synthesis goes here - * In a similar manner to GCC, there is a distinction between io_stream & io_parallel - */ -// hls-fpga-machine-learning instantiate HLS top-level -#endif -// If using io_parallel, the output needs to be initialised and returned at the end of this function -// If using io_stream, no output is initialised, as it is passed by reference to the top-level function -// hls-fpga-machine-learning initialize input/output - -// **************************************** -// NETWORK INSTANTIATION -// **************************************** - -// hls-fpga-machine-learning insert layers - -// hls-fpga-machine-learning return +#include "myproject.h" +#include "parameters.h" + +// hls-fpga-machine-learning insert weights + +/* + * Intel HLS requires that all 'stream' types are: + * (1) Passed by reference to the top-level entity or + * (2) Declared as global variables, outside of the main function + * Therefore, layer inputs/outputs (connections between individual layers) are declared here + */ +// hls-fpga-machine-learning insert inter-task streams + +#ifndef __INTELFPGA_COMPILER__ +/* +* The top-level function used during GCC compilation / hls4ml.predict(...)
goes here +* An important distinction is made between io_stream and io_parallel: +* (1) io_parallel: + - Top-level function takes a struct containing an array as function argument + - Returns a struct containing an array - the prediction + (2) io_stream: + - Top-level function is 'void' - no return value + - Instead, both the input and output are passed by reference + - This is due to the HLS Streaming Interfaces; stream cannot be copied (implicitly deleted copy constructor) +* This distinction is handled in quartus_writer.py +*/ +// hls-fpga-machine-learning instantiate GCC top-level +#else +// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here +// hls-fpga-machine-learning insert cpragmas + +/* + * The top-level function used during HLS Synthesis goes here + * In a similar manner to GCC, there is a distinction between io_stream & io_parallel + */ +// hls-fpga-machine-learning instantiate HLS top-level +#endif +// If using io_parallel, the output needs to be initialised and returned at the end of this function +// If using io_stream, no output is initialised, as it is passed by reference to the top-level function +// hls-fpga-machine-learning initialize input/output + +// **************************************** +// NETWORK INSTANTIATION +// **************************************** + +// hls-fpga-machine-learning insert layers + +// hls-fpga-machine-learning return diff --git a/hls4ml/templates/quartus/firmware/myproject.h b/hls4ml/templates/quartus/firmware/myproject.h index afb7020671..d0f577d14d 100644 --- a/hls4ml/templates/quartus/firmware/myproject.h +++ b/hls4ml/templates/quartus/firmware/myproject.h @@ -1,48 +1,48 @@ -#ifndef MYPROJECT_H_ -#define MYPROJECT_H_ - -#ifndef __INTELFPGA_COMPILER__ -#include "ac_fixed.h" -#include "ac_int.h" -#define hls_register -#else -#include "HLS/ac_fixed.h" -#include "HLS/ac_int.h" -#include "HLS/hls.h" -#endif - -// Streams are explicitly defined in defines.h, which are included for parameters.h -// Defining them again in this file will cause compile-time errors -#include "defines.h" - -// If using io_parallel, inputs and output need to be initialised before calling the top-level function -// If using io_stream, no inputs/outputs are initialised, as they are passed by reference to the top-level function -// hls-fpga-machine-learning insert inputs -// hls-fpga-machine-learning insert outputs - -#ifndef __INTELFPGA_COMPILER__ -/* -* The top-level function used during GCC compilation / hls4ml.predic(...)
goes here -* An important distinction is made between io_stream and io_parallel: -* (1) io_parallel: - - Top-level function takes a struct containing an array as function argument - - Returns a struct containing an array - the prediction - (2) io_stream: - - Top-level function is 'void' - no return value - - Instead, both the input and output are passed by reference - - This is due the HLS Streaming Interfaces; stream cannot be copied (implicitly deleted copy constructor) -* This distinction is handled in quartus_writer.py -*/ -// hls-fpga-machine-learning instantiate GCC top-level -#else -// Maximum initiation interval, concurrency and frequency for HLS syntheis are defined here -// hls-fpga-machine-learning insert cpragmas - -/* - * The top-level function used during HLS Synthesis goes here - * In a similar manner to GCC, there is a distinction between io_stream & io_parallel - */ -// hls-fpga-machine-learning instantiate HLS top-level -#endif - -#endif +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#ifndef __INTELFPGA_COMPILER__ +#include "ac_fixed.h" +#include "ac_int.h" +#define hls_register +#else +#include "HLS/ac_fixed.h" +#include "HLS/ac_int.h" +#include "HLS/hls.h" +#endif + +// Streams are explicitly defined in defines.h, which is included via parameters.h +// Defining them again in this file will cause compile-time errors +#include "defines.h" + +// If using io_parallel, inputs and outputs need to be initialised before calling the top-level function +// If using io_stream, no inputs/outputs are initialised, as they are passed by reference to the top-level function +// hls-fpga-machine-learning insert inputs +// hls-fpga-machine-learning insert outputs + +#ifndef __INTELFPGA_COMPILER__ +/* +* The top-level function used during GCC compilation / hls4ml.predict(...)
goes here +* An important distinction is made between io_stream and io_parallel: +* (1) io_parallel: + - Top-level function takes a struct containing an array as function argument + - Returns a struct containing an array - the prediction + (2) io_stream: + - Top-level function is 'void' - no return value + - Instead, both the input and output are passed by reference + - This is due to the HLS Streaming Interfaces; stream cannot be copied (implicitly deleted copy constructor) +* This distinction is handled in quartus_writer.py +*/ +// hls-fpga-machine-learning instantiate GCC top-level +#else +// Maximum initiation interval, concurrency and frequency for HLS synthesis are defined here +// hls-fpga-machine-learning insert cpragmas + +/* + * The top-level function used during HLS Synthesis goes here + * In a similar manner to GCC, there is a distinction between io_stream & io_parallel + */ +// hls-fpga-machine-learning instantiate HLS top-level +#endif + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h index cda8e748a1..7b84a9c0f2 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h @@ -1,104 +1,104 @@ -#ifndef NNET_BATCHNORM_H_ -#define NNET_BATCHNORM_H_ - -#include "nnet_common.h" -#include "nnet_helpers.h" -#include "nnet_mult.h" - -namespace nnet { - -struct batchnorm_config { - // Internal data type definitions - typedef float bias_t; - typedef float scale_t; - - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_filt = -1; - static const unsigned n_scale_bias = 10; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; - // partitioning arrays cyclically to go with roll factors? - - // Default multiplication - template using product = nnet::product::mult; -}; - -template -void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], - const typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], - const typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { -// Calcuate result -Result: - #pragma unroll - for (int ires = 0; ires < CONFIG_T::n_in; ires++) { - if (CONFIG_T::n_filt == -1) { - res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + - bias[ires]; - } else { - int norm_index = ires % CONFIG_T::n_filt; - res[ires] = - CONFIG_T::template product::product(data[ires], scale[norm_index]) + - bias[norm_index]; - } - } -} - -// **************************************************** -// Merged Batch Normalization and Quantized Tanh -// **************************************************** -struct batchnorm_quantized_tanh_config { - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_filt = -1; - static const unsigned n_scale_bias = 10; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const unsigned n_zeros = 0; -}; - -template -void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in], - const data_T threshold[CONFIG_T::n_scale_bias]) { - #pragma unroll - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - ac_int<1, false> cache; - data_T datareg = data[ii]; - int norm_index = CONFIG_T::n_filt == -1 ?
ii : ii % CONFIG_T::n_filt; - if (datareg >= threshold[norm_index]) - cache = 1; - else - cache = 0; - - res[ii] = cache; - } -} - -template -void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ac_int<2, true> res[CONFIG_T::n_in], - const data_T threshold_hi[CONFIG_T::n_scale_bias], - const data_T threshold_lo[CONFIG_T::n_scale_bias]) { - #pragma unroll - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - ac_int<2, true> cache; - data_T datareg = data[ii]; - int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; - if (datareg > threshold_hi[norm_index]) - cache = 1; - else if (datareg <= threshold_lo[norm_index]) - cache = -1; - else - cache = 0; - res[ii] = cache; - } -} - -} // namespace nnet - -#endif +#ifndef NNET_BATCHNORM_H_ +#define NNET_BATCHNORM_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" + +namespace nnet { + +struct batchnorm_config { + // Internal data type definitions + typedef float bias_t; + typedef float scale_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? + + // Default multiplication + template using product = nnet::product::mult; +}; + +template +void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], + const typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + const typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { +// Calculate result +Result: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_in; ires++) { + if (CONFIG_T::n_filt == -1) { + res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + + bias[ires]; + } else { + int norm_index = ires % CONFIG_T::n_filt; + res[ires] = - CONFIG_T::template product::product(data[ires], scale[norm_index]) + + bias[norm_index]; + } + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +struct batchnorm_quantized_tanh_config { + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_filt = -1; + static const unsigned n_scale_bias = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; +}; + +template +void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ac_int<1, false> res[CONFIG_T::n_in], + const data_T threshold[CONFIG_T::n_scale_bias]) { + #pragma unroll + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + ac_int<1, false> cache; + data_T datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ?
ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h index 5e5c1fa24d..1af60ab0c5 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_common.h @@ -1,71 +1,71 @@ -#ifndef NNET_COMMON_H_ -#define NNET_COMMON_H_ - -#ifndef __INTELFPGA_COMPILER__ -#include "ac_fixed.h" -#include "ac_int.h" -#include "math.h" -#else -#include "HLS/ac_fixed.h" -#include "HLS/ac_int.h" -#include "HLS/math.h" -#endif - -#include "nnet_helpers.h" - -typedef ac_fixed<16, 6> table_default_t; - -namespace nnet { - -// Common type definitions -enum io_type { io_parallel = 0, io_stream }; - -// Default data types (??) TODO: Deprecate -typedef ac_fixed<16, 4> weight_t_def; -typedef ac_fixed<16, 4> bias_t_def; -typedef ac_fixed<32, 10> accum_t_def; - -template void merge(data_T data1[NIN1], data_T data2[NIN2], data_T res[NIN1 + NIN2]) { - #pragma unroll - for (int ii = 0; ii < NIN1; ii++) { - res[ii] = data1[ii]; - } - #pragma unroll - for (int ii = 0; ii < NIN2; ii++) { - res[NIN1 + ii] = data2[ii]; - } -} - -/* --- - * Balanced tree reduce implementation. - * For use in scenarios where Quartus cannot expression balance - * Reduces an array of inputs to a single value using the template binary operator 'Op', - * for example summing all elements with Op_add, or finding the maximum with Op_max - * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section - * before applying and accumulate the result over the rolled dimension. - * --- */ -template T reduce(const T *x, Op op) { - static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; - static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; - if (N == 1) { - return x[0]; - } - if (N == 2) { - return op(x[0], x[1]); - } - return op(reduce(x, op), reduce(x + leftN, op)); -} - -template class Op_add { - public: - T operator()(T a, T b) { return a + b; } -}; - -template class Op_max { - public: - T operator()(T a, T b) { return a >= b ? a : b; } -}; - -} // namespace nnet - -#endif +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + +#ifndef __INTELFPGA_COMPILER__ +#include "ac_fixed.h" +#include "ac_int.h" +#include "math.h" +#else +#include "HLS/ac_fixed.h" +#include "HLS/ac_int.h" +#include "HLS/math.h" +#endif + +#include "nnet_helpers.h" + +typedef ac_fixed<16, 6> table_default_t; + +namespace nnet { + +// Common type definitions +enum io_type { io_parallel = 0, io_stream }; + +// Default data types (??) TODO: Deprecate +typedef ac_fixed<16, 4> weight_t_def; +typedef ac_fixed<16, 4> bias_t_def; +typedef ac_fixed<32, 10> accum_t_def; + +template void merge(data_T data1[NIN1], data_T data2[NIN2], data_T res[NIN1 + NIN2]) { + #pragma unroll + for (int ii = 0; ii < NIN1; ii++) { + res[ii] = data1[ii]; + } + #pragma unroll + for (int ii = 0; ii < NIN2; ii++) { + res[NIN1 + ii] = data2[ii]; + } +} + +/* --- + * Balanced tree reduce implementation. + * For use in scenarios where Quartus cannot balance expressions itself + * Reduces an array of inputs to a single value using the template binary operator 'Op', + * for example summing all elements with Op_add, or finding the maximum with Op_max + * Use only when the input array is fully unrolled.
Or, slice out a fully unrolled section + * before applying and accumulate the result over the rolled dimension. + * --- */ +template T reduce(const T *x, Op op) { + static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; + static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; + if (N == 1) { + return x[0]; + } + if (N == 2) { + return op(x[0], x[1]); + } + return op(reduce(x, op), reduce(x + leftN, op)); +} + +template class Op_add { + public: + T operator()(T a, T b) { return a + b; } +}; + +template class Op_max { + public: + T operator()(T a, T b) { return a >= b ? a : b; } +}; + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h index 579606519f..8897e13150 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_conv1d.h @@ -1,64 +1,64 @@ -#ifndef NNET_CONV1D_H_ -#define NNET_CONV1D_H_ - -#include "nnet_common.h" -#include "nnet_conv1d_resource.h" - -namespace nnet { - -struct conv1d_config { - // I/O sizes - static const unsigned in_width = 10; - static const unsigned out_width = 10; - - // Number of channels, filters - static const unsigned n_chan = 1; - static const unsigned n_filt = 1; - - // Original filter size - static const unsigned filt_width = 1; - static const unsigned kernel_size = filt_width; - - // Modified filter size (post-Wionograd transformation, if applied) - static const unsigned impl_filt_height = 1; - static const unsigned impl_filt_width = 1; - - // Padding, stride, dilation - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const unsigned stride_width = 1; - static const unsigned dilation = 1; - - // Run-time Configuration - static const unsigned n_zeros = 0; - static const unsigned reuse_factor = 1; - static const unsigned parallelisation_factor = 1; - - // TODO: BRAM Storage on Quartus - static const bool store_weights_in_bram = false; - - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; -}; - -template -void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - conv_1d_resource_cl(data, res, weights, biases); -} - -template -void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - pointwise_conv_1d_resource_cl(data, res, weights, biases); -} - -} // namespace nnet - -#endif +#ifndef NNET_CONV1D_H_ +#define NNET_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d_resource.h" + +namespace nnet { + +struct conv1d_config { + // I/O sizes + static const unsigned in_width = 10; + static const unsigned out_width = 10; + + // Number of channels, filters + static const unsigned n_chan = 1; + static const unsigned n_filt = 1; + + // Original filter size + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + + // Modified filter size (post-Winograd transformation, if applied) + static const unsigned impl_filt_height = 1; + static const
unsigned impl_filt_width = 1; + + // Padding, stride, dilation + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + + // Run-time Configuration + static const unsigned n_zeros = 0; + static const unsigned reuse_factor = 1; + static const unsigned parallelisation_factor = 1; + + // TODO: BRAM Storage on Quartus + static const bool store_weights_in_bram = false; + + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; +}; + +template +void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + conv_1d_resource_cl(data, res, weights, biases); +} + +template +void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + pointwise_conv_1d_resource_cl(data, res, weights, biases); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h index d969403c3e..aba0803989 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense.h @@ -1,169 +1,169 @@ -#ifndef NNET_DENSE_LARGE_H_ -#define NNET_DENSE_LARGE_H_ - -#include "nnet_common.h" -#include "nnet_helpers.h" -#include "nnet_mult.h" - -namespace nnet { - -struct dense_config { - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; - - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_out = 10; - - static const unsigned reuse_factor = 1; - static const unsigned block_factor = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); - static const unsigned multiplier_limit = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor) - static const unsigned multiplier_factor = 1; // min n_in, rf - static const unsigned multiplier_scale = 1; // M_LIMIT/CONFIG_T::n_out; - static const unsigned reciprocal = 1; // 2^35 / 25 - static const unsigned rf_pad = 0; - static const unsigned bf_pad = 0; - // Resource reuse info - static const unsigned io_type = io_parallel; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; - // partitioning arrays cyclically to go with roll factors? 
- - // Default multiplication - template using product = nnet::product::mult; -}; - -template -void dense_rf_gt(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && - "The current Reuse Factor is not allowed"); - assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); - //#pragma ii CONFIG_T::reuse_factor - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; -Load: - #pragma unroll - for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; - hls_register int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; - - #pragma unroll - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - #pragma unroll - for (int im = 0; im < CONFIG_T::block_factor; im++) { - uint32 w_index = ir + CONFIG_T::reuse_factor * im; - out_index[ir][im] = (w_index / CONFIG_T::multiplier_factor).to_int(); - d_index[ir][im] = w_index % CONFIG_T::n_in; - } - } -Product1: - #pragma nofusion - #pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; - Product2: - #pragma unroll - for (int im = 0; im < CONFIG_T::block_factor; im++) { - uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; - if (w_index >= CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded) - continue; - int data_index = d_index[ir][im]; - // Modified this - tmp_acc[im] = - CONFIG_T::template product::product(data[data_index], weights[w_index]); - } - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; - ResetMult: - #pragma unroll - for (int imult = 0; imult < CONFIG_T::multiplier_limit; imult++) { - mult[imult] = 0; - } - AccumLoop1: - #pragma unroll - for (int im = 0; im < CONFIG_T::block_factor; im++) { - int o_index = out_index[ir][im]; - if (o_index >= CONFIG_T::n_out) - continue; // check out of bounds - mult[o_index] += tmp_acc[im]; - } - AccumLoop2: - #pragma unroll - for (int im = 0; im < CONFIG_T::multiplier_limit; im++) { - acc[im] += mult[im]; - } - } -Store: - #pragma unroll - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - res[ires] = cast(acc[ires]); // acc[jj]; - } -} -template -void dense_rf_lt(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && - "The current Reuse Factor is not allowed"); - assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN"); - - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; -InitAccum: - #pragma unroll - for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } -ReuseLoop: - #pragma nofusion - #pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; - MultLoop: - #pragma unroll - for (int im = 
0, in_index = ir; im < CONFIG_T::block_factor; im++) { - uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; - if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in * CONFIG_T::n_out) - continue; - // Modified this - mult[im] = - CONFIG_T::template product::product(data[in_index], weights[w_index]); - in_index += CONFIG_T::reuse_factor; - if (in_index >= CONFIG_T::n_in) - in_index = ir; - } - AccumLoop: - #pragma unroll - for (int im = 0, out_index = 0, acc_step = 0; im < CONFIG_T::block_factor; im++) { - acc[out_index] += mult[im]; - if (acc_step + 1 >= CONFIG_T::multiplier_scale) { - acc_step = 0; - out_index++; - } else { - acc_step++; - } - } - } -// Cast to "res_t" type -Result: - #pragma unroll - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - res[ires] = cast(acc[ires]); - } -} -template -void dense_resource( - data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { - dense_rf_lt(data, res, weights, biases); - } else { - dense_rf_gt(data, res, weights, biases); - } -} -} // namespace nnet -#endif +#ifndef NNET_DENSE_LARGE_H_ +#define NNET_DENSE_LARGE_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + static const unsigned reuse_factor = 1; + static const unsigned block_factor = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); + static const unsigned multiplier_limit = 1; // DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor) + static const unsigned multiplier_factor = 1; // min n_in, rf + static const unsigned multiplier_scale = 1; // M_LIMIT/CONFIG_T::n_out; + static const unsigned reciprocal = 1; // 2^35 / 25 + static const unsigned rf_pad = 0; + static const unsigned bf_pad = 0; + // Resource reuse info + static const unsigned io_type = io_parallel; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? 
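// A minimal sketch of how the derived constants above interact, using assumed example values (n_in = 10, n_out = 10, reuse_factor = 4 are illustrative, not values generated by hls4ml):
//
//   constexpr unsigned n_in = 10, n_out = 10, reuse_factor = 4;
//   constexpr unsigned block_factor = DIV_ROUNDUP(n_in * n_out, reuse_factor);   // = 25 products issued per iteration
//   constexpr unsigned multiplier_factor = MIN(n_in, reuse_factor);              // = 4 inputs sharing one multiplier
//
// With reuse_factor <= n_in, dense_resource below dispatches to dense_rf_lt, whose ReuseLoop runs reuse_factor times and issues block_factor multiplications per trip, so a larger reuse factor trades latency for fewer parallel multipliers.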
+ + // Default multiplication + template using product = nnet::product::mult; +}; + +template +void dense_rf_gt(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::reuse_factor > CONFIG_T::n_in) && "This function is correct only for RF > N_IN"); + //#pragma ii CONFIG_T::reuse_factor + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +Load: + #pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + hls_register int d_index[CONFIG_T::reuse_factor][CONFIG_T::block_factor]; + + #pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32 w_index = ir + CONFIG_T::reuse_factor * im; + out_index[ir][im] = (w_index / CONFIG_T::multiplier_factor).to_int(); + d_index[ir][im] = w_index % CONFIG_T::n_in; + } + } +Product1: + #pragma nofusion + #pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::block_factor]; + Product2: + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (w_index >= CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded) + continue; + int data_index = d_index[ir][im]; + // Modified this + tmp_acc[im] = + CONFIG_T::template product::product(data[data_index], weights[w_index]); + } + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::multiplier_limit]; + ResetMult: + #pragma unroll + for (int imult = 0; imult < CONFIG_T::multiplier_limit; imult++) { + mult[imult] = 0; + } + AccumLoop1: + #pragma unroll + for (int im = 0; im < CONFIG_T::block_factor; im++) { + int o_index = out_index[ir][im]; + if (o_index >= CONFIG_T::n_out) + continue; // check out of bounds + mult[o_index] += tmp_acc[im]; + } + AccumLoop2: + #pragma unroll + for (int im = 0; im < CONFIG_T::multiplier_limit; im++) { + acc[im] += mult[im]; + } + } +Store: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); // acc[jj]; + } +} +template +void dense_rf_lt(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + assert((CONFIG_T::multiplier_limit % CONFIG_T::n_out == 0 || CONFIG_T::reuse_factor >= CONFIG_T::n_in) && + "The current Reuse Factor is not allowed"); + assert((CONFIG_T::multiplier_limit == CONFIG_T::block_factor) && "This function is correct only for RF <= N_IN"); + + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; +InitAccum: + #pragma unroll + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } +ReuseLoop: + #pragma nofusion + #pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::block_factor]; + MultLoop: + #pragma unroll + for (int im = 
0, in_index = ir; im < CONFIG_T::block_factor; im++) { + uint32 w_index = ir + (CONFIG_T::reuse_factor_rounded)*im; + if (ir + CONFIG_T::reuse_factor * im >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; + // Modified this + mult[im] = + CONFIG_T::template product::product(data[in_index], weights[w_index]); + in_index += CONFIG_T::reuse_factor; + if (in_index >= CONFIG_T::n_in) + in_index = ir; + } + AccumLoop: + #pragma unroll + for (int im = 0, out_index = 0, acc_step = 0; im < CONFIG_T::block_factor; im++) { + acc[out_index] += mult[im]; + if (acc_step + 1 >= CONFIG_T::multiplier_scale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } +// Cast to "res_t" type +Result: + #pragma unroll + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + res[ires] = cast(acc[ires]); + } +} +template +void dense_resource( + data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::reuse_factor_rounded * CONFIG_T::block_factor_rounded], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_rf_lt(data, res, weights, biases); + } else { + dense_rf_gt(data, res, weights, biases); + } +} +} // namespace nnet +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h index ff261482ba..5619e299fb 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_dense_compressed.h @@ -1,80 +1,80 @@ -#ifndef NNET_COMPRESSED_LAYER_H_ -#define NNET_COMPRESSED_LAYER_H_ - -#include "nnet_common.h" -#include "nnet_dense.h" - -namespace nnet { - -template -void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - const typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], - const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - -InitAccum: - #pragma unroll - for (int i = 0; i < CONFIG_T::n_out; i++) { - acc[i] = (typename CONFIG_T::accum_t)(biases[i]); - } - - hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; - hls_register data_T inputs[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; - - #pragma unroll - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - #pragma unroll - for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { - uint32 w = ir + CONFIG_T::reuse_factor * im; - inputs[ir][im] = data[weights[w].row_index]; - out_index[ir][im] = weights[w].col_index; - } - } -ReuseLoop: - #pragma nofusion - #pragma speculated_iterations 0 - for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; - CompressedMultLoop: - #pragma unroll - for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { - uint32 w = ir + CONFIG_T::reuse_factor * im; - // if (w >= CONFIG_T::reuse_factor*CONFIG_T::compressed_block_factor) continue; - typename CONFIG_T::accum_t prod = mult[im] = - CONFIG_T::template product::product(inputs[0][im], weights[w].weight); - #pragma unroll - for (int is = 0; is < CONFIG_T::reuse_factor - 1; is++) { - inputs[is][im] = inputs[is + 1][im]; - } - } - hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::n_out]; - ResetMult: - #pragma unroll - for (int tacc = 0; tacc < CONFIG_T::n_out; tacc++) { - tmp_acc[tacc] = 0; - } - AccumLoop1: - #pragma unroll - 
for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { - int col = out_index[ir][im]; - tmp_acc[col] += mult[im]; - } - AccumLoop2: - #pragma unroll - for (int im = 0; im < CONFIG_T::n_out; im++) { - acc[im] += tmp_acc[im]; - } - } - -// Cast to "res_t" type -ResultLoop: - #pragma unroll - for (unsigned i = 0; i < CONFIG_T::n_out; i++) { - res[i] = cast(acc[i]); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "nnet_common.h" +#include "nnet_dense.h" + +namespace nnet { + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + const typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + const typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + hls_register typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + +InitAccum: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_out; i++) { + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + hls_register int out_index[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + hls_register data_T inputs[CONFIG_T::reuse_factor][CONFIG_T::compressed_block_factor]; + + #pragma unroll + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + #pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + uint32 w = ir + CONFIG_T::reuse_factor * im; + inputs[ir][im] = data[weights[w].row_index]; + out_index[ir][im] = weights[w].col_index; + } + } +ReuseLoop: + #pragma nofusion + #pragma speculated_iterations 0 + for (int ir = 0; ir < CONFIG_T::reuse_factor; ir++) { + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::compressed_block_factor]; + CompressedMultLoop: + #pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + uint32 w = ir + CONFIG_T::reuse_factor * im; + // if (w >= CONFIG_T::reuse_factor*CONFIG_T::compressed_block_factor) continue; + typename CONFIG_T::accum_t prod = mult[im] = + CONFIG_T::template product::product(inputs[0][im], weights[w].weight); + #pragma unroll + for (int is = 0; is < CONFIG_T::reuse_factor - 1; is++) { + inputs[is][im] = inputs[is + 1][im]; + } + } + hls_register typename CONFIG_T::accum_t tmp_acc[CONFIG_T::n_out]; + ResetMult: + #pragma unroll + for (int tacc = 0; tacc < CONFIG_T::n_out; tacc++) { + tmp_acc[tacc] = 0; + } + AccumLoop1: + #pragma unroll + for (int im = 0; im < CONFIG_T::compressed_block_factor; im++) { + int col = out_index[ir][im]; + tmp_acc[col] += mult[im]; + } + AccumLoop2: + #pragma unroll + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += tmp_acc[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + #pragma unroll + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h index 775303e267..3bd78c7a84 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_helpers.h @@ -1,140 +1,140 @@ -#ifndef NNET_HELPERS_H -#define NNET_HELPERS_H - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace nnet { - -template void convert_data(srcType *src, dstType *dst) { - for (size_t i = 0; i < SIZE; i++) { - dst[i] = dstType(src[i]); - } -} - -template void convert_data_back(srcType *src, dstType *dst) { - for (size_t i = 0; i < SIZE; i++) { - dst[i] = static_cast(src[i].to_double()); - } -} - -template void 
convert_data(srcType *src, stream_in &dst) { - for (size_t i = 0; i < SIZE / dstType::size; i++) { - dstType ctype; - for (size_t j = 0; j < dstType::size; j++) { - ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); - } - dst.write(ctype); - } -} - -template void convert_data_back(stream_out &src, dstType *dst) { - for (size_t i = 0; i < SIZE / srcType::size; i++) { - srcType ctype = src.read(); - for (size_t j = 0; j < srcType::size; j++) { - dst[i * srcType::size + j] = dstType(ctype[j].to_double()); - } - } -} - -extern bool trace_enabled; -extern std::map *trace_outputs; -extern size_t trace_type_size; - -constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } - -constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } - -constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } - -template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { - for (int i = 0; i < layer_size; i++) { - ptr[i] = static_cast(data[i].to_double()); - } -} - -template void save_output_array(stream &data, save_T *ptr, size_t layer_size) { - for (size_t i = 0; i < layer_size / data_T::size; i++) { - data_T ctype = data.read(); - for (size_t j = 0; j < data_T::size; j++) { - ptr[i * data_T::size + j] = static_cast(ctype[j].to_double()); - } - data.write(ctype); - } -} - -// We don't want to include save_T in this function because it will be inserted into myproject.cpp -// so a workaround with element size is used -template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { - if (!trace_enabled) - return; - - if (trace_outputs) { - if (trace_outputs->count(layer_name) > 0) { - if (trace_type_size == 4) { - save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); - } else if (trace_type_size == 8) { - save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); - } else { - std::cout << "Unknown trace type!" << std::endl; - } - } else { - std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; - } - } else { - std::ostringstream filename; - filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data - std::fstream out; - out.open(filename.str(), std::ios::app); - assert(out.is_open()); - for (int i = 0; i < layer_size; i++) { - out << data[i] << " "; // We don't care about precision in text files - } - out << std::endl; - out.close(); - } -} - -template void save_layer_output(stream &data, const char *layer_name, size_t layer_size) { - if (!trace_enabled) - return; - - if (trace_outputs) { - if (trace_outputs->count(layer_name) > 0) { - if (trace_type_size == 4) { - save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); - } else if (trace_type_size == 8) { - save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); - } else { - std::cout << "Unknown trace type!" << std::endl; - } - } else { - std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
<< std::endl; - } - } else { - std::ostringstream filename; - filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data - std::fstream out; - out.open(filename.str(), std::ios::app); - assert(out.is_open()); - for (size_t i = 0; i < layer_size / data_T::size; i++) { - data_T ctype = data.read(); - for (size_t j = 0; j < data_T::size; j++) { - out << ctype[j] << " "; - } - data.write(ctype); - } - out << std::endl; - out.close(); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = dstType(src[i]); + } +} + +template void convert_data_back(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = static_cast(src[i].to_double()); + } +} + +template void convert_data(srcType *src, stream_in &dst) { + for (size_t i = 0; i < SIZE / dstType::size; i++) { + dstType ctype; + for (size_t j = 0; j < dstType::size; j++) { + ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); + } + dst.write(ctype); + } +} + +template void convert_data_back(stream_out &src, dstType *dst) { + for (size_t i = 0; i < SIZE / srcType::size; i++) { + srcType ctype = src.read(); + for (size_t j = 0; j < srcType::size; j++) { + dst[i * srcType::size + j] = dstType(ctype[j].to_double()); + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } + +constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } + +constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = static_cast(data[i].to_double()); + } +} + +template void save_output_array(stream &data, save_T *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = static_cast(ctype[j].to_double()); + } + data.write(ctype); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
<< std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << data[i] << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +template void save_layer_output(stream &data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + out << ctype[j] << " "; + } + data.write(ctype); + } + out << std::endl; + out.close(); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h index dc27de99ff..766ef2e208 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_merge.h @@ -1,249 +1,249 @@ -#ifndef NNET_MERGE_H_ -#define NNET_MERGE_H_ - -#include "nnet_mult.h" - -namespace nnet { - -struct merge_config { - static const unsigned n_elem = 10; -}; - -struct dot_config { - static const unsigned n_in = 10; - static const unsigned n_out = 1; - - static const unsigned reuse_factor = 1; - - typedef float accum_t; - - template using product = nnet::product::mult; -}; - -struct concat_config { - static const unsigned n_elem1_0 = 10; - static const unsigned n_elem1_1 = 10; - static const unsigned n_elem1_2 = 10; - static const unsigned n_elem2_0 = 10; - static const unsigned n_elem2_1 = 10; - static const unsigned n_elem2_2 = 10; - - static const unsigned axis = -1; -}; - -template -void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = static_cast(data1[i] + data2[i]); - } -} - -template -void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = static_cast(data1[i] - data2[i]); - } -} - -template -void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = static_cast(data1[i] * data2[i]); - } -} - -template -void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = static_cast((data1[i] + data2[i]) / (res_T)2); - } -} - -template -void maximum(input1_T 
data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = (data1[i] > data2[i]) ? static_cast(data1[i]) : static_cast(data2[i]); - } -} - -template -void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem; i++) { - res[i] = (data1[i] < data2[i]) ? static_cast(data1[i]) : static_cast(data2[i]); - } -} - -template -void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { - constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); - - hls_register typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; -Product: - #pragma unroll multiplier_limit - for (int i = 0; i < CONFIG_T::n_in; i++) { - mult[i] = CONFIG_T::template product::product(data1[i], data2[i]); - } - - hls_register typename CONFIG_T::accum_t acc = 0; -Accum: - #pragma unroll - for (int i = 0; i < CONFIG_T::n_in; i++) { - acc += mult[i]; - } - - res[0] = static_cast(acc); -} - -template -void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], - res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - res[i] = static_cast(data1[i]); - } - - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { - res[CONFIG_T::n_elem1_0 + i] = static_cast(data2[i]); - } -} - -template -void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; i++) { - res[i] = static_cast(data1[i]); - } - - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; i++) { - res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + i] = static_cast(data2[i]); - } -} - -template -void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - #pragma unroll - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + j] = - static_cast(data1[i * CONFIG_T::n_elem1_1 + j]); - } - - #pragma unroll - for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { - res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + j] = - static_cast(data2[i * CONFIG_T::n_elem2_1 + j]); - } - } -} - -template -void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { - concatenate2d_1(data1, data2, res); - } else { - concatenate2d_0(data1, data2, res); - } -} - -template -void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - 
#pragma unroll - for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; i++) { - res[i] = static_cast(data1[i]); - } - - #pragma unroll - for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; i++) { - res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + i] = static_cast(data2[i]); - } -} - -template -void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { - int res_idx = - i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; - int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; - res[res_idx] = static_cast(data1[data_idx]); - } - } - - for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_elem2_2; k++) { - int res_idx = i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + - (j + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + k; - int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; - res[res_idx] = static_cast(data2[data_idx]); - } - } - } -} - -template -void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - - #pragma unroll - for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { - int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + - j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k; - int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; - res[res_idx] = static_cast(data1[data_idx]); - } - - #pragma unroll - for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { - int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + - j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k + CONFIG_T::n_elem1_2; - int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; - res[res_idx] = static_cast(data2[data_idx]); - } - } - } -} - -template -void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { - concatenate3d_2(data1, data2, res); - } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { - concatenate3d_1(data1, data2, res); - } else { - concatenate3d_0(data1, data2, res); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_MERGE_H_ +#define NNET_MERGE_H_ + +#include "nnet_mult.h" + +namespace nnet { + +struct merge_config { + static const unsigned n_elem = 10; +}; + +struct 
dot_config { + static const unsigned n_in = 10; + static const unsigned n_out = 1; + + static const unsigned reuse_factor = 1; + + typedef float accum_t; + + template using product = nnet::product::mult; +}; + +struct concat_config { + static const unsigned n_elem1_0 = 10; + static const unsigned n_elem1_1 = 10; + static const unsigned n_elem1_2 = 10; + static const unsigned n_elem2_0 = 10; + static const unsigned n_elem2_1 = 10; + static const unsigned n_elem2_2 = 10; + + static const unsigned axis = -1; +}; + +template +void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] + data2[i]); + } +} + +template +void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] - data2[i]); + } +} + +template +void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast(data1[i] * data2[i]); + } +} + +template +void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = static_cast((data1[i] + data2[i]) / (res_T)2); + } +} + +template +void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = (data1[i] > data2[i]) ? static_cast(data1[i]) : static_cast(data2[i]); + } +} + +template +void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem; i++) { + res[i] = (data1[i] < data2[i]) ? 
static_cast(data1[i]) : static_cast(data2[i]); + } +} + +template +void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { + constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor); + + hls_register typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; +Product: + #pragma unroll multiplier_limit + for (int i = 0; i < CONFIG_T::n_in; i++) { + mult[i] = CONFIG_T::template product::product(data1[i], data2[i]); + } + + hls_register typename CONFIG_T::accum_t acc = 0; +Accum: + #pragma unroll + for (int i = 0; i < CONFIG_T::n_in; i++) { + acc += mult[i]; + } + + res[0] = static_cast(acc); +} + +template +void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], + res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + res[CONFIG_T::n_elem1_0 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + i] = static_cast(data2[i]); + } +} + +template +void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + j] = + static_cast(data1[i * CONFIG_T::n_elem1_1 + j]); + } + + #pragma unroll + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + res[i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + j] = + static_cast(data2[i * CONFIG_T::n_elem2_1 + j]); + } + } +} + +template +void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; i++) { + res[i] = static_cast(data1[i]); + } + + #pragma unroll + for (int i = 0; i < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; i++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + i] = static_cast(data2[i]); + } +} + +template +void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * 
CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = + i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + res[res_idx] = static_cast(data1[data_idx]); + } + } + + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem2_2; k++) { + int res_idx = i * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + + (j + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + k; + int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; + res[res_idx] = static_cast(data2[data_idx]); + } + } + } +} + +template +void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k; + int data_idx = i * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + j * CONFIG_T::n_elem1_2 + k; + res[res_idx] = static_cast(data1[data_idx]); + } + + #pragma unroll + for (int k = 0; k < CONFIG_T::n_elem1_2; k++) { + int res_idx = i * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + j * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + k + CONFIG_T::n_elem1_2; + int data_idx = i * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + j * CONFIG_T::n_elem2_2 + k; + res[res_idx] = static_cast(data2[data_idx]); + } + } + } +} + +template +void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h index 6819684f2a..5be7728323 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_mult.h @@ -1,113 +1,113 @@ -#ifndef NNET_MULT_H_ -#define NNET_MULT_H_ - -#include "nnet_common.h" -#include "nnet_helpers.h" -#include - -namespace nnet { - -// Different methods to perform the product of input and weight, depending on their types. 
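// [Editor's note] Before this file's product classes: a hedged, host-side
// sketch of how the nnet_merge.h kernels that end above are instantiated.
// The config struct name (toy_merge_config) and the plain float types are
// illustrative stand-ins; in the generated firmware the types are ap_fixed
// variants and CONFIG_T is emitted per layer by the backend.
#include <cassert>

struct toy_merge_config {
    static const unsigned n_elem = 4;
};

// Same shape as nnet::add above (the '#pragma unroll' is dropped for a host build).
template <class input1_T, class input2_T, class res_T, typename CONFIG_T>
void add_ref(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) {
    for (unsigned i = 0; i < CONFIG_T::n_elem; i++) {
        res[i] = static_cast<res_T>(data1[i] + data2[i]);
    }
}

int main() {
    float a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1}, r[4];
    add_ref<float, float, float, toy_merge_config>(a, b, r);
    for (int i = 0; i < 4; i++)
        assert(r[i] == 5.0f); // elementwise merge; result width follows res_T
    return 0;
}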
-namespace product { - -class Product { - public: - static void limit(unsigned multiplier_limit) {} -}; - -template class both_binary : public Product { - public: - inline static x_T product(x_T a, w_T w) { - // specialisation for 1-bit weights and incoming data - return a == w; - } -}; - -template class weight_binary : public Product { - public: - inline static auto product(x_T a, w_T w) -> decltype(-a) { - // Specialisation for 1-bit weights, arbitrary data - if (w == 0) - return -a; - else - return a; - } -}; - -template class data_binary : public Product { - public: - inline static auto product(x_T a, w_T w) -> decltype(-w) { - // Specialisation for 1-bit data, arbitrary weight - if (a == 0) - return -w; - else - return w; - } -}; - -template class weight_ternary : public Product { - public: - inline static auto product(x_T a, w_T w) -> decltype(-a) { - // Specialisation for 2-bit weights, arbitrary data - if (w == 0) - return 0; - else if (w == -1) - return -a; - else - return a; // if(w == 1) - } -}; - -template class mult : public Product { - public: - inline static auto product(x_T a, w_T w) -> decltype(a * w) { - // 'Normal' product - return a * w; - } - static void limit(unsigned multiplier_limit) { - // TODO: Implement for Quartus - // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation > Vivado-only, replace with Intel HLS - // pragma - } -}; - -template class weight_exponential : public Product { - public: - using r_T = ac_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width), true>; - inline static r_T product(x_T a, w_T w) { - // Shift product for exponential weights - // Shift by the exponent. Negative weights shift right - r_T y = static_cast(a) << w.weight; - - // Negate or not depending on weight sign - return w.sign == 1 ? y : static_cast(-y); - } -}; -} // namespace product - -// TO-DO: These may need extra variants if ac_int types are used in more places -template -inline typename std::enable_if>::value && - std::is_same>::value, - ac_int>::type -cast(typename CONFIG_T::accum_t x) { - return static_cast>(((x - CONFIG_T::n_in / 2) * 2).to_ac_int()); -} - -template -inline typename std::enable_if>::value && - !std::is_same>::value, - res_T>::type -cast(typename CONFIG_T::accum_t x) { - return static_cast(x); -} - -template -inline typename std::enable_if<(!std::is_same>::value), res_T>::type -cast(typename CONFIG_T::accum_t x) { - return static_cast(x); -} - -} // namespace nnet - -#endif +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "nnet_common.h" +#include "nnet_helpers.h" +#include + +namespace nnet { + +// Different methods to perform the product of input and weight, depending on their types. 
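// [Editor's note] A hedged reference model of the specializations declared
// below: weight_binary maps w == 0 to -a (a 1-bit weight where 0 encodes -1),
// and weight_ternary adds a true zero for 2-bit weights. These *_ref names
// are illustrative plain-C++ stand-ins, not part of the patch itself.
#include <cassert>

template <class x_T, class w_T> x_T weight_binary_ref(x_T a, w_T w) {
    return (w == 0) ? static_cast<x_T>(-a) : a;
}

template <class x_T, class w_T> x_T weight_ternary_ref(x_T a, w_T w) {
    if (w == 0)
        return static_cast<x_T>(0);
    return (w == -1) ? static_cast<x_T>(-a) : a;
}

int main() {
    assert(weight_binary_ref(3.0f, 0) == -3.0f);  // 1-bit weight: sign select
    assert(weight_binary_ref(3.0f, 1) == 3.0f);
    assert(weight_ternary_ref(3.0f, 0) == 0.0f);  // 2-bit weight adds a zero
    assert(weight_ternary_ref(3.0f, -1) == -3.0f);
    return 0;
}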
+namespace product { + +class Product { + public: + static void limit(unsigned multiplier_limit) {} +}; + +template class both_binary : public Product { + public: + inline static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + return a == w; + } +}; + +template class weight_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + inline static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + return a * w; + } + static void limit(unsigned multiplier_limit) { + // TODO: Implement for Quartus + // #pragma HLS ALLOCATION instances=mul limit=multiplier_limit operation > Vivado-only, replace with Intel HLS + // pragma + } +}; + +template class weight_exponential : public Product { + public: + using r_T = ac_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width), true>; + inline static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + + // Negate or not depending on weight sign + return w.sign == 1 ? 
y : static_cast(-y); + } +}; +} // namespace product + +// TO-DO: These may need extra variants if ac_int types are used in more places +template +inline typename std::enable_if>::value && + std::is_same>::value, + ac_int>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast>(((x - CONFIG_T::n_in / 2) * 2).to_ac_int()); +} + +template +inline typename std::enable_if>::value && + !std::is_same>::value, + res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type +cast(typename CONFIG_T::accum_t x) { + return static_cast(x); +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h index 498cebf520..a95f9ab003 100644 --- a/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h +++ b/hls4ml/templates/quartus/firmware/nnet_utils/nnet_padding.h @@ -1,99 +1,99 @@ -#ifndef NNET_PADDING_H_ -#define NNET_PADDING_H_ - -namespace nnet { - -struct padding1d_config { - static const unsigned in_width = 10; - static const unsigned out_width = 10; - static const unsigned n_chan = 10; - - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; -}; - -template -void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { - for (int i = 0; i < CONFIG_T::pad_left; i++) { - #pragma unroll - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = 0; - } - } - - for (int i = 0; i < CONFIG_T::in_width; i++) { - #pragma unroll - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = (res_T) * (data++); - } - } - - for (int i = 0; i < CONFIG_T::pad_right; i++) { - #pragma unroll - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = 0; - } - } -} - -struct padding2d_config { - static const unsigned in_height = 10; - static const unsigned in_width = 10; - - static const unsigned out_height = 10; - static const unsigned out_width = 10; - - static const unsigned n_chan = 10; - - static const unsigned pad_top = 0; - static const unsigned pad_bottom = 0; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; -}; - -template -void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], - res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { - for (int i = 0; i < CONFIG_T::pad_top; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } - - for (int i = 0; i < CONFIG_T::in_height; i++) { - for (int j = 0; j < CONFIG_T::pad_left; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - for (int j = 0; j < CONFIG_T::in_width; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = (res_T) * (data++); - } - } - for (int j = 0; j < CONFIG_T::pad_right; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } - - for (int i = 0; i < CONFIG_T::pad_bottom; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - #pragma unroll - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } -} - -} // namespace nnet - -#endif +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +namespace nnet { + +struct padding1d_config { + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned 
n_chan = 10; + + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + for (int i = 0; i < CONFIG_T::pad_left; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = (res_T) * (data++); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + #pragma unroll + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned in_height = 10; + static const unsigned in_width = 10; + + static const unsigned out_height = 10; + static const unsigned out_width = 10; + + static const unsigned n_chan = 10; + + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = (res_T) * (data++); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + #pragma unroll + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/quartus/myproject_test_parallel.cpp b/hls4ml/templates/quartus/myproject_test_parallel.cpp index 4de819eb49..5e3dd96c12 100644 --- a/hls4ml/templates/quartus/myproject_test_parallel.cpp +++ b/hls4ml/templates/quartus/myproject_test_parallel.cpp @@ -1,112 +1,112 @@ -#include -#include -#include -#include -#include -#include - -#include "firmware/myproject.h" -#include "firmware/parameters.h" - -// hls-fpga-machine-learning insert bram - -#define CHECKPOINT 5000 - -// This function is written to avoid stringstream, which is -// not supported in cosim 20.1, and because strtok -// requires a const_cast or allocation to use with std::strings. -// This function returns the next float (by argument) at position pos, -// updating pos. True is returned if conversion done, false if the string -// has ended, and std::invalid_argument exception if the sting was bad. 
-bool nextToken(const std::string &str, std::size_t &pos, float &val) {
-    while (pos < str.size() && std::isspace(static_cast<unsigned char>(str[pos]))) {
-        pos++;
-    }
-    if (pos >= str.size()) {
-        return false;
-    }
-    std::size_t offset = 0;
-    val = std::stof(str.substr(pos), &offset);
-    pos += offset;
-    return true;
-}
-
-int main(int argc, char **argv) {
-    // load input data from text file
-    std::ifstream fin("tb_data/tb_input_features.dat");
-    // load predictions from text file
-    std::ifstream fpr("tb_data/tb_output_predictions.dat");
-
-    std::string RESULTS_LOG = "tb_data/results.log";
-    std::ofstream fout(RESULTS_LOG);
-
-    std::string iline;
-    std::string pline;
-
-    std::vector<float> inputs;
-    std::vector<float> outputs;
-
-    if (fin.is_open() && fpr.is_open()) {
-        std::vector<std::vector<float>> predictions;
-        unsigned int num_iterations = 0;
-        for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) {
-            if (num_iterations % CHECKPOINT == 0) {
-                std::cout << "Processing input " << num_iterations << std::endl;
-            }
-
-            std::vector<float> in;
-            std::vector<float> pr;
-            float current;
-
-            std::size_t pos = 0;
-            while (nextToken(iline, pos, current)) {
-                in.push_back(current);
-            }
-
-            pos = 0;
-            while (nextToken(pline, pos, current)) {
-                pr.push_back(current);
-            }
-
-            // hls-fpga-machine-learning insert data
-            predictions.push_back(std::move(pr));
-        }
-
-        // Do this separately to avoid vector reallocation
-        // hls-fpga-machine-learning insert top-level-function
-
-        // hls-fpga-machine-learning insert run
-
-        for (int j = 0; j < num_iterations; j++) {
-            // hls-fpga-machine-learning insert tb-output
-            if (j % CHECKPOINT == 0) {
-                std::cout << "Predictions" << std::endl;
-                // hls-fpga-machine-learning insert predictions
-                std::cout << "Quantized predictions" << std::endl;
-                // hls-fpga-machine-learning insert quantized
-            }
-        }
-        fin.close();
-        fpr.close();
-    } else {
-        const unsigned int num_iterations = 10;
-        std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations
-                  << " invocations." << std::endl;
-        // hls-fpga-machine-learning insert zero
-
-        // hls-fpga-machine-learning insert top-level-function
-
-        // hls-fpga-machine-learning insert run
-
-        for (int j = 0; j < num_iterations; j++) {
-            // hls-fpga-machine-learning insert output
-
-            // hls-fpga-machine-learning insert tb-output
-        }
-    }
-
-    fout.close();
-    std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl;
-
-    return 0;
-}
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "firmware/myproject.h"
+#include "firmware/parameters.h"
+
+// hls-fpga-machine-learning insert bram
+
+#define CHECKPOINT 5000
+
+// This function is written to avoid stringstream, which is
+// not supported in cosim 20.1, and because strtok
+// requires a const_cast or allocation to use with std::strings.
+// This function returns the next float (by argument) at position pos,
+// updating pos. True is returned if a conversion was done, false if the
+// string has ended; std::stof throws std::invalid_argument if the token
+// is not a valid float.
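// [Editor's note] A hedged, self-contained usage sketch of the tokenizer whose
// definition follows. next_token_ref mirrors nextToken from this file; the
// test string and variable names are illustrative only.
#include <cassert>
#include <cctype>
#include <cstddef>
#include <string>
#include <vector>

static bool next_token_ref(const std::string &str, std::size_t &pos, float &val) {
    while (pos < str.size() && std::isspace(static_cast<unsigned char>(str[pos]))) {
        pos++;
    }
    if (pos >= str.size()) {
        return false;
    }
    std::size_t offset = 0;
    val = std::stof(str.substr(pos), &offset);
    pos += offset;
    return true;
}

int main() {
    std::string line = "  1.5 -2  3e-1 ";
    std::vector<float> vals;
    std::size_t pos = 0;
    float v;
    while (next_token_ref(line, pos, v)) {
        vals.push_back(v); // one testbench feature per whitespace-separated token
    }
    assert(vals.size() == 3 && vals[1] == -2.0f);
    return 0;
}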
+bool nextToken(const std::string &str, std::size_t &pos, float &val) { + while (pos < str.size() && std::isspace(static_cast(str[pos]))) { + pos++; + } + if (pos >= str.size()) { + return false; + } + std::size_t offset = 0; + val = std::stof(str.substr(pos), &offset); + pos += offset; + return true; +} + +int main(int argc, char **argv) { + // load input data from text file + std::ifstream fin("tb_data/tb_input_features.dat"); + // load predictions from text file + std::ifstream fpr("tb_data/tb_output_predictions.dat"); + + std::string RESULTS_LOG = "tb_data/results.log"; + std::ofstream fout(RESULTS_LOG); + + std::string iline; + std::string pline; + + std::vector inputs; + std::vector outputs; + + if (fin.is_open() && fpr.is_open()) { + std::vector> predictions; + unsigned int num_iterations = 0; + for (; std::getline(fin, iline) && std::getline(fpr, pline); num_iterations++) { + if (num_iterations % CHECKPOINT == 0) { + std::cout << "Processing input " << num_iterations << std::endl; + } + + std::vector in; + std::vector pr; + float current; + + std::size_t pos = 0; + while (nextToken(iline, pos, current)) { + in.push_back(current); + } + + pos = 0; + while (nextToken(pline, pos, current)) { + pr.push_back(current); + } + + // hls-fpga-machine-learning insert data + predictions.push_back(std::move(pr)); + } + + // Do this separately to avoid vector reallocation + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert run + + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert tb-output + if (j % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + } + } + fin.close(); + fpr.close(); + } else { + const unsigned int num_iterations = 10; + std::cout << "INFO: Unable to open input/predictions file, using default input with " << num_iterations + << " invocations." << std::endl; + // hls-fpga-machine-learning insert zero + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert run + + for (int j = 0; j < num_iterations; j++) { + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output + } + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} diff --git a/hls4ml/templates/vivado/ap_types/ap_common.h b/hls4ml/templates/vivado/ap_types/ap_common.h index 02575e87c1..4d2886cbde 100644 --- a/hls4ml/templates/vivado/ap_types/ap_common.h +++ b/hls4ml/templates/vivado/ap_types/ap_common.h @@ -1,376 +1,376 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_COMMON_H__ -#define __AP_COMMON_H__ - -// ---------------------------------------------------------------------- - -// Forward declaration of all AP types. 
-#include - - -#ifdef __SYNTHESIS__ -#error "The open-source version of AP types does not support synthesis." -#endif // ifdef __SYNTHESIS__ -#define _AP_ENABLE_HALF_ 0 - - -#if _AP_ENABLE_HALF_ == 1 -// Before ap_private definition. -#ifdef __SYNTHESIS__ -#define _HLS_HALF_DEFINED_ -typedef __fp16 half; -#else -class half; -#endif // __SYNTHESIS__ -#endif // _AP_ENABLE_HALF_ - -// ---------------------------------------------------------------------- - -// Macro functions -#define AP_MAX(a, b) ((a) > (b) ? (a) : (b)) -#define AP_MIN(a, b) ((a) < (b) ? (a) : (b)) -#define AP_ABS(a) ((a) >= 0 ? (a) : -(a)) - -#ifndef AP_ASSERT -#ifndef __SYNTHESIS__ -#include -#define AP_ASSERT(cond, msg) assert((cond) && (msg)) -#else -#define AP_ASSERT(cond, msg) -#endif // ifndef __SYNTHESIS__ -#endif // ifndef AP_ASSERT - -#ifndef __SYNTHESIS__ -// for fprintf messages. -#include -// for exit on error. -#include -#endif - -// same disable condition as assert. -#if !defined(__SYNTHESIS__) && !defined(NDEBUG) - -#define _AP_DEBUG(cond, ...) \ - do { \ - if ((cond)) { \ - fprintf(stderr, "DEBUG: " __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - } \ - } while (0) -#define _AP_WARNING(cond, ...) \ - do { \ - if ((cond)) { \ - fprintf(stderr, "WARNING: " __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - } \ - } while (0) -#define _AP_ERROR(cond, ...) \ - do { \ - if ((cond)) { \ - fprintf(stderr, "ERROR: " __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - abort(); \ - } \ - } while (0) - -#else // if !defined(__SYNTHESIS__) && !defined(NDEBUG) - -#define __AP_VOID_CAST static_cast -#define _AP_DEBUG(cond, ...) (__AP_VOID_CAST(0)) -#define _AP_WARNING(cond, ...) (__AP_VOID_CAST(0)) -#define _AP_ERROR(cond, ...) (__AP_VOID_CAST(0)) - -#endif // if !defined(__SYNTHESIS__) && !defined(NDEBUG) else - -// ---------------------------------------------------------------------- - -// Attribute only for synthesis -#ifdef __SYNTHESIS__ -#define INLINE inline __attribute__((always_inline)) -//#define INLINE inline __attribute__((noinline)) -#else -#define INLINE inline -#endif - -#define AP_WEAK -// __attribute__((weak)) - -#ifndef AP_INT_MAX_W -#define AP_INT_MAX_W 1024 -#endif - -#define BIT_WIDTH_UPPER_LIMIT (1 << 15) -#if AP_INT_MAX_W > BIT_WIDTH_UPPER_LIMIT -#error "Bitwidth exceeds 32768 (1 << 15), the maximum allowed value" -#endif - -#define MAX_MODE(BITS) ((BITS + 1023) / 1024) - -// ---------------------------------------------------------------------- - -// XXX apcc cannot handle global std::ios_base::Init() brought in by -#ifndef AP_AUTOCC -#ifndef __SYNTHESIS__ -// for overload operator<< -#include -#endif -#endif // ifndef AP_AUTOCC - -#ifndef __SYNTHESIS__ -// for string format. -#include -// for string. -#include -#endif - -// for detecting if char is signed. -enum { CHAR_IS_SIGNED = (char)-1 < 0 }; - -// TODO we have similar traits in x_hls_utils.h, should consider unify. 
-namespace _ap_type { -template -struct is_signed { - static const bool value = _Tp(-1) < _Tp(1); -}; - -template -struct is_integral { - static const bool value = false; -}; -#define DEF_IS_INTEGRAL(CTYPE) \ - template <> \ - struct is_integral { \ - static const bool value = true; \ - }; -DEF_IS_INTEGRAL(bool) -DEF_IS_INTEGRAL(char) -DEF_IS_INTEGRAL(signed char) -DEF_IS_INTEGRAL(unsigned char) -DEF_IS_INTEGRAL(short) -DEF_IS_INTEGRAL(unsigned short) -DEF_IS_INTEGRAL(int) -DEF_IS_INTEGRAL(unsigned int) -DEF_IS_INTEGRAL(long) -DEF_IS_INTEGRAL(unsigned long) -DEF_IS_INTEGRAL(ap_slong) -DEF_IS_INTEGRAL(ap_ulong) -#undef DEF_IS_INTEGRAL - -template -struct enable_if {}; -// partial specialization for true -template -struct enable_if { - typedef _Tp type; -}; - -template -struct remove_const { - typedef _Tp type; -}; - -template -struct remove_const<_Tp const> { - typedef _Tp type; -}; -} // namespace _ap_type - -// ---------------------------------------------------------------------- - -// Define ssdm_int and _ssdm_op. -// XXX deleted in open-source version - -#ifndef NON_C99STRING -#define _AP_C99 true -#else -#define _AP_C99 false -#endif - -static inline unsigned char guess_radix(const char* s) { - unsigned char rd = 10; ///< default radix - const char* p = s; - // skip neg sign if it exists - if (p[0] == '-' || p[0] == '+') ++p; - // guess based on following two bits. - if (p[0] == '0') { - if (p[1] == 'b' || p[1] == 'B') { - rd = 2; - } else if (p[1] == 'o' || p[1] == 'O') { - rd = 8; - } else if (p[1] == 'x' || p[1] == 'X') { - rd = 16; - } else if (p[1] == 'd' || p[1] == 'D') { - rd = 10; - } - } - return rd; -} - -// ---------------------------------------------------------------------- - -// Basic integral struct upon which ap_int and ap_fixed are defined. -#ifdef __SYNTHESIS__ -// Use ssdm_int, a compiler dependent, attribute constrained integeral type as -// basic data type. -#define _AP_ROOT_TYPE ssdm_int -// Basic ops. -#define _AP_ROOT_op_concat(Ret, X, Y) _ssdm_op_concat(Ret, X, Y) -#define _AP_ROOT_op_get_bit(Val, Bit) _ssdm_op_get_bit(Val, Bit) -#define _AP_ROOT_op_set_bit(Val, Bit, Repl) _ssdm_op_set_bit(Val, Bit, Repl) -#define _AP_ROOT_op_get_range(Val, Lo, Hi) _ssdm_op_get_range(Val, Lo, Hi) -#define _AP_ROOT_op_set_range(Val, Lo, Hi, Repl) \ - _ssdm_op_set_range(Val, Lo, Hi, Repl) -#define _AP_ROOT_op_reduce(Op, Val) _ssdm_op_reduce(Op, Val) -#else // ifdef __SYNTHESIS__ -// Use ap_private for compiler-independent basic data type -template -class ap_private; -/// model ssdm_int in standard C++ for simulation. -template -struct ssdm_int_sim { - /// integral type with template-specified width and signedness. - ap_private<_AP_W, _AP_S> V; - ssdm_int_sim() {} -}; -#define _AP_ROOT_TYPE ssdm_int_sim -// private's ref uses _AP_ROOT_TYPE. -#include -// XXX The C-sim model cannot use GCC-extension -// Basic ops. Ret and Val are ap_private. -template -inline _Tp1 _AP_ROOT_op_concat(const _Tp1& Ret, const _Tp2& X, const _Tp3& Y) { - _Tp1 r = (X).operator,(Y); - return r; -} -#define _AP_ROOT_op_get_bit(Val, Bit) (Val).get_bit((Bit)) -template -inline _Tp1& _AP_ROOT_op_set_bit(_Tp1& Val, const _Tp2& Bit, const _Tp3& Repl) { - (Val).set_bit((Bit), (Repl)); - return Val; -} -// notice the order of high and low index is different in ssdm call and -// ap_private.range()... 
-#define _AP_ROOT_op_get_range(Val, Lo, Hi) (Val).range((Hi), (Lo)) -template -inline _Tp1& _AP_ROOT_op_set_range(_Tp1& Val, const _Tp2& Lo, const _Tp3& Hi, - const _Tp4& Repl) { - (Val).range((Hi), (Lo)) = Repl; - return (Val); -} -#define _AP_ROOT_op_and_reduce(Val) (Val).and_reduce() -#define _AP_ROOT_op_nand_reduce(Val) (Val).nand_reduce() -#define _AP_ROOT_op_or_reduce(Val) (Val).or_reduce() -#define _AP_ROOT_op_xor_reduce(Val) (Val).xor_reduce() -// ## is the concatenation in preprocessor: -#define _AP_ROOT_op_reduce(Op, Val) _AP_ROOT_op_##Op##_reduce(Val) -#endif // ifdef __SYNTHESIS__ else - -// ---------------------------------------------------------------------- - -// Constants for half, single, double pricision floating points -#define HALF_MAN 10 -#define FLOAT_MAN 23 -#define DOUBLE_MAN 52 - -#define HALF_EXP 5 -#define FLOAT_EXP 8 -#define DOUBLE_EXP 11 - -#define BIAS(e) ((1L << (e - 1L)) - 1L) -#define HALF_BIAS BIAS(HALF_EXP) -#define FLOAT_BIAS BIAS(FLOAT_EXP) -#define DOUBLE_BIAS BIAS(DOUBLE_EXP) - -#define APFX_IEEE_DOUBLE_E_MAX DOUBLE_BIAS -#define APFX_IEEE_DOUBLE_E_MIN (-DOUBLE_BIAS + 1) - -INLINE ap_ulong doubleToRawBits(double pf) { - union { - ap_ulong __L; - double __D; - } LD; - LD.__D = pf; - return LD.__L; -} - -INLINE unsigned int floatToRawBits(float pf) { - union { - unsigned int __L; - float __D; - } LD; - LD.__D = pf; - return LD.__L; -} - -#if _AP_ENABLE_HALF_ == 1 -INLINE unsigned short halfToRawBits(half pf) { -#ifdef __SYNTHESIS__ - union { - unsigned short __L; - half __D; - } LD; - LD.__D = pf; - return LD.__L; -#else - return pf.get_bits(); -#endif -} -#endif - -// usigned long long is at least 64-bit -INLINE double rawBitsToDouble(ap_ulong pi) { - union { - ap_ulong __L; - double __D; - } LD; - LD.__L = pi; - return LD.__D; -} - -// long is at least 32-bit -INLINE float rawBitsToFloat(unsigned long pi) { - union { - unsigned int __L; - float __D; - } LD; - LD.__L = pi; - return LD.__D; -} - -#if _AP_ENABLE_HALF_ == 1 -// short is at least 16-bit -INLINE half rawBitsToHalf(unsigned short pi) { -#ifdef __SYNTHESIS__ - union { - unsigned short __L; - half __D; - } LD; - LD.__L = pi; - return LD.__D; -#else - // sim model of half has a non-trivial constructor - half __D; - __D.set_bits(pi); - return __D; -#endif -} -#endif - -#endif // ifndef __AP_COMMON_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_COMMON_H__ +#define __AP_COMMON_H__ + +// ---------------------------------------------------------------------- + +// Forward declaration of all AP types. +#include + + +#ifdef __SYNTHESIS__ +#error "The open-source version of AP types does not support synthesis." +#endif // ifdef __SYNTHESIS__ +#define _AP_ENABLE_HALF_ 0 + + +#if _AP_ENABLE_HALF_ == 1 +// Before ap_private definition. 
+#ifdef __SYNTHESIS__ +#define _HLS_HALF_DEFINED_ +typedef __fp16 half; +#else +class half; +#endif // __SYNTHESIS__ +#endif // _AP_ENABLE_HALF_ + +// ---------------------------------------------------------------------- + +// Macro functions +#define AP_MAX(a, b) ((a) > (b) ? (a) : (b)) +#define AP_MIN(a, b) ((a) < (b) ? (a) : (b)) +#define AP_ABS(a) ((a) >= 0 ? (a) : -(a)) + +#ifndef AP_ASSERT +#ifndef __SYNTHESIS__ +#include +#define AP_ASSERT(cond, msg) assert((cond) && (msg)) +#else +#define AP_ASSERT(cond, msg) +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_ASSERT + +#ifndef __SYNTHESIS__ +// for fprintf messages. +#include +// for exit on error. +#include +#endif + +// same disable condition as assert. +#if !defined(__SYNTHESIS__) && !defined(NDEBUG) + +#define _AP_DEBUG(cond, ...) \ + do { \ + if ((cond)) { \ + fprintf(stderr, "DEBUG: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ + } while (0) +#define _AP_WARNING(cond, ...) \ + do { \ + if ((cond)) { \ + fprintf(stderr, "WARNING: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + } \ + } while (0) +#define _AP_ERROR(cond, ...) \ + do { \ + if ((cond)) { \ + fprintf(stderr, "ERROR: " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + abort(); \ + } \ + } while (0) + +#else // if !defined(__SYNTHESIS__) && !defined(NDEBUG) + +#define __AP_VOID_CAST static_cast +#define _AP_DEBUG(cond, ...) (__AP_VOID_CAST(0)) +#define _AP_WARNING(cond, ...) (__AP_VOID_CAST(0)) +#define _AP_ERROR(cond, ...) (__AP_VOID_CAST(0)) + +#endif // if !defined(__SYNTHESIS__) && !defined(NDEBUG) else + +// ---------------------------------------------------------------------- + +// Attribute only for synthesis +#ifdef __SYNTHESIS__ +#define INLINE inline __attribute__((always_inline)) +//#define INLINE inline __attribute__((noinline)) +#else +#define INLINE inline +#endif + +#define AP_WEAK +// __attribute__((weak)) + +#ifndef AP_INT_MAX_W +#define AP_INT_MAX_W 1024 +#endif + +#define BIT_WIDTH_UPPER_LIMIT (1 << 15) +#if AP_INT_MAX_W > BIT_WIDTH_UPPER_LIMIT +#error "Bitwidth exceeds 32768 (1 << 15), the maximum allowed value" +#endif + +#define MAX_MODE(BITS) ((BITS + 1023) / 1024) + +// ---------------------------------------------------------------------- + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +// for overload operator<< +#include +#endif +#endif // ifndef AP_AUTOCC + +#ifndef __SYNTHESIS__ +// for string format. +#include +// for string. +#include +#endif + +// for detecting if char is signed. +enum { CHAR_IS_SIGNED = (char)-1 < 0 }; + +// TODO we have similar traits in x_hls_utils.h, should consider unify. 
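// [Editor's note] Hedged sketch of what the _ap_type traits below compute:
// is_signed relies on _Tp(-1) < _Tp(1) holding only for signed types (for
// unsigned types, _Tp(-1) wraps to the maximum value). Host-only mimic;
// is_signed_ref is an illustrative name.
#include <cassert>

template <typename _Tp> struct is_signed_ref {
    static const bool value = _Tp(-1) < _Tp(1);
};

int main() {
    assert(is_signed_ref<int>::value);           // -1 < 1
    assert(!is_signed_ref<unsigned int>::value); // UINT_MAX < 1 is false
    return 0;
}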
+namespace _ap_type {
+template <typename _Tp>
+struct is_signed {
+  static const bool value = _Tp(-1) < _Tp(1);
+};
+
+template <typename _Tp>
+struct is_integral {
+  static const bool value = false;
+};
+#define DEF_IS_INTEGRAL(CTYPE) \
+  template <> \
+  struct is_integral<CTYPE> { \
+    static const bool value = true; \
+  };
+DEF_IS_INTEGRAL(bool)
+DEF_IS_INTEGRAL(char)
+DEF_IS_INTEGRAL(signed char)
+DEF_IS_INTEGRAL(unsigned char)
+DEF_IS_INTEGRAL(short)
+DEF_IS_INTEGRAL(unsigned short)
+DEF_IS_INTEGRAL(int)
+DEF_IS_INTEGRAL(unsigned int)
+DEF_IS_INTEGRAL(long)
+DEF_IS_INTEGRAL(unsigned long)
+DEF_IS_INTEGRAL(ap_slong)
+DEF_IS_INTEGRAL(ap_ulong)
+#undef DEF_IS_INTEGRAL
+
+template <bool, typename _Tp = void>
+struct enable_if {};
+// partial specialization for true
+template <typename _Tp>
+struct enable_if<true, _Tp> {
+  typedef _Tp type;
+};
+
+template <typename _Tp>
+struct remove_const {
+  typedef _Tp type;
+};
+
+template <typename _Tp>
+struct remove_const<_Tp const> {
+  typedef _Tp type;
+};
+} // namespace _ap_type
+
+// ----------------------------------------------------------------------
+
+// Define ssdm_int and _ssdm_op.
+// XXX deleted in open-source version
+
+#ifndef NON_C99STRING
+#define _AP_C99 true
+#else
+#define _AP_C99 false
+#endif
+
+static inline unsigned char guess_radix(const char* s) {
+  unsigned char rd = 10; ///< default radix
+  const char* p = s;
+  // skip neg sign if it exists
+  if (p[0] == '-' || p[0] == '+') ++p;
+  // guess based on following two bits.
+  if (p[0] == '0') {
+    if (p[1] == 'b' || p[1] == 'B') {
+      rd = 2;
+    } else if (p[1] == 'o' || p[1] == 'O') {
+      rd = 8;
+    } else if (p[1] == 'x' || p[1] == 'X') {
+      rd = 16;
+    } else if (p[1] == 'd' || p[1] == 'D') {
+      rd = 10;
+    }
+  }
+  return rd;
+}
+
+// ----------------------------------------------------------------------
+
+// Basic integral struct upon which ap_int and ap_fixed are defined.
+#ifdef __SYNTHESIS__
+// Use ssdm_int, a compiler-dependent, attribute-constrained integral type as
+// basic data type.
+#define _AP_ROOT_TYPE ssdm_int
+// Basic ops.
+#define _AP_ROOT_op_concat(Ret, X, Y) _ssdm_op_concat(Ret, X, Y)
+#define _AP_ROOT_op_get_bit(Val, Bit) _ssdm_op_get_bit(Val, Bit)
+#define _AP_ROOT_op_set_bit(Val, Bit, Repl) _ssdm_op_set_bit(Val, Bit, Repl)
+#define _AP_ROOT_op_get_range(Val, Lo, Hi) _ssdm_op_get_range(Val, Lo, Hi)
+#define _AP_ROOT_op_set_range(Val, Lo, Hi, Repl) \
+  _ssdm_op_set_range(Val, Lo, Hi, Repl)
+#define _AP_ROOT_op_reduce(Op, Val) _ssdm_op_reduce(Op, Val)
+#else // ifdef __SYNTHESIS__
+// Use ap_private for compiler-independent basic data type
+template <int _AP_W, bool _AP_S, bool _AP_C = _AP_W <= 64>
+class ap_private;
+/// model ssdm_int in standard C++ for simulation.
+template <int _AP_W, bool _AP_S>
+struct ssdm_int_sim {
+  /// integral type with template-specified width and signedness.
+  ap_private<_AP_W, _AP_S> V;
+  ssdm_int_sim() {}
+};
+#define _AP_ROOT_TYPE ssdm_int_sim
+// private's ref uses _AP_ROOT_TYPE.
+#include <etc/ap_private.h>
+// XXX The C-sim model cannot use GCC-extension
+// Basic ops. Ret and Val are ap_private.
+template <typename _Tp1, typename _Tp2, typename _Tp3>
+inline _Tp1 _AP_ROOT_op_concat(const _Tp1& Ret, const _Tp2& X, const _Tp3& Y) {
+  _Tp1 r = (X).operator,(Y);
+  return r;
+}
+#define _AP_ROOT_op_get_bit(Val, Bit) (Val).get_bit((Bit))
+template <typename _Tp1, typename _Tp2, typename _Tp3>
+inline _Tp1& _AP_ROOT_op_set_bit(_Tp1& Val, const _Tp2& Bit, const _Tp3& Repl) {
+  (Val).set_bit((Bit), (Repl));
+  return Val;
+}
+// notice the order of high and low index is different in ssdm call and
+// ap_private.range()...
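// [Editor's note] Hedged illustration of the index-order remark above: the
// ssdm-style macro takes (low, high) while .range() takes (high, low).
// Assumes ap_int.h from this same ap_types tree is on the include path.
#include <ap_int.h>

int main() {
    ap_uint<8> v = 0xA5;           // bits 7..0 = 1010 0101
    ap_uint<4> lo = v.range(3, 0); // .range(high, low) -> 0x5
    ap_uint<4> hi = v.range(7, 4); // -> 0xA
    return (lo == 0x5 && hi == 0xA) ? 0 : 1;
}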
+#define _AP_ROOT_op_get_range(Val, Lo, Hi) (Val).range((Hi), (Lo)) +template +inline _Tp1& _AP_ROOT_op_set_range(_Tp1& Val, const _Tp2& Lo, const _Tp3& Hi, + const _Tp4& Repl) { + (Val).range((Hi), (Lo)) = Repl; + return (Val); +} +#define _AP_ROOT_op_and_reduce(Val) (Val).and_reduce() +#define _AP_ROOT_op_nand_reduce(Val) (Val).nand_reduce() +#define _AP_ROOT_op_or_reduce(Val) (Val).or_reduce() +#define _AP_ROOT_op_xor_reduce(Val) (Val).xor_reduce() +// ## is the concatenation in preprocessor: +#define _AP_ROOT_op_reduce(Op, Val) _AP_ROOT_op_##Op##_reduce(Val) +#endif // ifdef __SYNTHESIS__ else + +// ---------------------------------------------------------------------- + +// Constants for half, single, double pricision floating points +#define HALF_MAN 10 +#define FLOAT_MAN 23 +#define DOUBLE_MAN 52 + +#define HALF_EXP 5 +#define FLOAT_EXP 8 +#define DOUBLE_EXP 11 + +#define BIAS(e) ((1L << (e - 1L)) - 1L) +#define HALF_BIAS BIAS(HALF_EXP) +#define FLOAT_BIAS BIAS(FLOAT_EXP) +#define DOUBLE_BIAS BIAS(DOUBLE_EXP) + +#define APFX_IEEE_DOUBLE_E_MAX DOUBLE_BIAS +#define APFX_IEEE_DOUBLE_E_MIN (-DOUBLE_BIAS + 1) + +INLINE ap_ulong doubleToRawBits(double pf) { + union { + ap_ulong __L; + double __D; + } LD; + LD.__D = pf; + return LD.__L; +} + +INLINE unsigned int floatToRawBits(float pf) { + union { + unsigned int __L; + float __D; + } LD; + LD.__D = pf; + return LD.__L; +} + +#if _AP_ENABLE_HALF_ == 1 +INLINE unsigned short halfToRawBits(half pf) { +#ifdef __SYNTHESIS__ + union { + unsigned short __L; + half __D; + } LD; + LD.__D = pf; + return LD.__L; +#else + return pf.get_bits(); +#endif +} +#endif + +// usigned long long is at least 64-bit +INLINE double rawBitsToDouble(ap_ulong pi) { + union { + ap_ulong __L; + double __D; + } LD; + LD.__L = pi; + return LD.__D; +} + +// long is at least 32-bit +INLINE float rawBitsToFloat(unsigned long pi) { + union { + unsigned int __L; + float __D; + } LD; + LD.__L = pi; + return LD.__D; +} + +#if _AP_ENABLE_HALF_ == 1 +// short is at least 16-bit +INLINE half rawBitsToHalf(unsigned short pi) { +#ifdef __SYNTHESIS__ + union { + unsigned short __L; + half __D; + } LD; + LD.__L = pi; + return LD.__D; +#else + // sim model of half has a non-trivial constructor + half __D; + __D.set_bits(pi); + return __D; +#endif +} +#endif + +#endif // ifndef __AP_COMMON_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_decl.h b/hls4ml/templates/vivado/ap_types/ap_decl.h index ddb8dd4a76..ddd00f1c7f 100644 --- a/hls4ml/templates/vivado/ap_types/ap_decl.h +++ b/hls4ml/templates/vivado/ap_types/ap_decl.h @@ -1,212 +1,212 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_DECL_H__ -#define __AP_DECL_H__ - -// ---------------------------------------------------------------------- - -#if !defined(__AP_FIXED_H__) && !defined(__AP_INT_H__) && !defined(__AUTOPILOT_CBE_H__) && !defined(__HLS_HALF_H__) -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." 
-#endif - -// Test __SYNTHESIS__ only for mode -#if !defined(__SYNTHESIS__) && (defined(AESL_SYN) || defined(__HLS_SYN__)) -//#pragma message "AESL_SYN and __HLS_SYN__ should be replaced by __SYNTHESIS__" -#define __SYNTHESIS__ -#endif - -/* for safety*/ -#if (defined(_AP_N) || defined(_AP_C)) -#error One or more of the following is defined: _AP_N, _AP_C. Definition conflicts with their usage as template parameters. -#endif - -/* for safety*/ -#if (defined(_AP_W) || defined(_AP_I) || defined(_AP_S) || defined(_AP_Q) || \ - defined(_AP_O) || defined(_AP_W2) || defined(_AP_I2) || \ - defined(_AP_S2) || defined(_AP_Q2) || defined(_AP_O2) || \ - defined(_AP_N) || defined(_AP_N2)) -#error \ - "One or more of the following is defined: _AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N, _AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2. Definition conflicts with their usage as template parameters." -#endif - -/*for safety*/ -#if (defined(_AP_W3) || defined(_AP_S3) || defined(_AP_W4) || defined(_AP_S4)) -#error \ - "One or more of the following is defined: _AP_W3, _AP_S3, _AP_W4,_AP_S4. Definition conflicts with their usage as template parameters." -#endif - -#if (defined(_AP_W1) || defined(_AP_S1) || defined(_AP_T) || \ - defined(_AP_T1) || defined(_AP_T2) || defined(_AP_T3) || defined(_AP_T4)) -#error \ - "One or more of the following is defined: _AP_W1, _AP_S1, _AP_T, _AP_T1, _AP_T2, _AP_T3, _AP_T4. Definition conflicts with their usage as template parameters." -#endif - -#ifndef __cplusplus -#error "AP data type can only be used in C++" -#endif - -// ---------------------------------------------------------------------- - -#ifndef __SC_COMPATIBLE__ -/// ap_fixed quantification mode -enum ap_q_mode { - AP_RND, //< rounding to plus infinity - AP_RND_ZERO, //< rounding to zero - AP_RND_MIN_INF, //< rounding to minus infinity - AP_RND_INF, //< rounding to infinity - AP_RND_CONV, //< convergent rounding - AP_TRN, //< truncation - AP_TRN_ZERO, //< truncation to zero -}; - -// FIXME for legacy code -#ifndef SYSTEMC_INCLUDED -#define SC_RND AP_RND -#define SC_RND_ZERO AP_RND_ZERO -#define SC_RND_MIN_INF AP_RND_MIN_INF -#define SC_RND_INF AP_RND_INF -#define SC_RND_CONV AP_RND_CONV -#define SC_TRN AP_TRN -#define SC_TRN_ZERO AP_TRN_ZERO -#endif // !defined(SYSTEMC_INCLUDED) - -/// ap_fixed saturation mode -enum ap_o_mode { - AP_SAT, //< saturation - AP_SAT_ZERO, //< saturation to zero - AP_SAT_SYM, //< symmetrical saturation - AP_WRAP, //< wrap-around (*) - AP_WRAP_SM, //< sign magnitude wrap-around (*) -}; - -// FIXME for legacy code -#ifndef SYSTEMC_INCLUDED -#define SC_SAT AP_SAT -#define SC_SAT_ZERO AP_SAT_ZERO -#define SC_SAT_SYM AP_SAT_SYM -#define SC_WRAP AP_WRAP -#define SC_WRAP_SM AP_WRAP_SM -#endif // !defined(SYSTEMC_INCLUDED) - -#else // defined(__SC_COMPATIBLE__) - -// There will not be sc_fxdefs.h, and the emu should be defined by ap_fixed. 
- -/// ap_fixed quantification mode -enum ap_q_mode { - SC_RND, //< rounding to plus infinity - SC_RND_ZERO, //< rounding to zero - SC_RND_MIN_INF, //< rounding to minus infinity - SC_RND_INF, //< rounding to infinity - SC_RND_CONV, //< convergent rounding - SC_TRN, //< truncation - SC_TRN_ZERO, //< truncation to zero -}; - -#define AP_RND SC_RND -#define AP_RND_ZERO SC_RND_ZERO -#define AP_RND_MIN_INF SC_RND_MIN_INF -#define AP_RND_INF SC_RND_INF -#define AP_RND_CONV SC_RND_CONV -#define AP_TRN SC_TRN -#define AP_TRN_ZERO SC_TRN_ZERO - -/// ap_fixed saturation mode -enum ap_o_mode { - SC_SAT, //< saturation - SC_SAT_ZERO, //< saturation to zero - SC_SAT_SYM, //< symmetrical saturation - SC_WRAP, //< wrap-around (*) - SC_WRAP_SM, //< sign magnitude wrap-around (*) -}; - -#define AP_SAT SC_SAT -#define AP_SAT_ZERO SC_SAT_ZERO -#define AP_SAT_SYM SC_SAT_SYM -#define AP_WRAP SC_WRAP -#define AP_WRAP_SM SC_WRAP_SM - -#endif // defined(__SC_COMPATIBLE__) - -template -struct ap_int_base; - -template -struct ap_int; - -template -struct ap_uint; - -template -struct ap_range_ref; - -template -struct ap_bit_ref; - -template -struct ap_concat_ref; - -template -struct ap_fixed_base; - -template -struct ap_fixed; - -template -struct ap_ufixed; - -template -struct af_range_ref; - -template -struct af_bit_ref; - -/// string base mode -enum BaseMode { AP_BIN = 2, AP_OCT = 8, AP_DEC = 10, AP_HEX = 16 }; - -#ifndef SYSTEMC_INCLUDED -#define SC_BIN 2 -#define SC_OCT 8 -#define SC_DEC 10 -#define SC_HEX 16 -#endif // !defined(SYSTEMC_INCLUDED) - -// Alias C data types -#ifdef _MSC_VER -typedef signed __int64 ap_slong; -typedef unsigned __int64 ap_ulong; -#else // !defined(_MSC_VER) -typedef signed long long ap_slong; -typedef unsigned long long ap_ulong; -#endif // !defined(_MSC_VER) - -enum { - _AP_SIZE_char = 8, - _AP_SIZE_short = sizeof(short) * 8, - _AP_SIZE_int = sizeof(int) * 8, - _AP_SIZE_long = sizeof(long) * 8, - _AP_SIZE_ap_slong = sizeof(ap_slong) * 8 -}; - -#endif // !defined(__AP_DECL_H__) - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_DECL_H__ +#define __AP_DECL_H__ + +// ---------------------------------------------------------------------- + +#if !defined(__AP_FIXED_H__) && !defined(__AP_INT_H__) && !defined(__AUTOPILOT_CBE_H__) && !defined(__HLS_HALF_H__) +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +// Test __SYNTHESIS__ only for mode +#if !defined(__SYNTHESIS__) && (defined(AESL_SYN) || defined(__HLS_SYN__)) +//#pragma message "AESL_SYN and __HLS_SYN__ should be replaced by __SYNTHESIS__" +#define __SYNTHESIS__ +#endif + +/* for safety*/ +#if (defined(_AP_N) || defined(_AP_C)) +#error One or more of the following is defined: _AP_N, _AP_C. Definition conflicts with their usage as template parameters. 
+#endif + +/* for safety*/ +#if (defined(_AP_W) || defined(_AP_I) || defined(_AP_S) || defined(_AP_Q) || \ + defined(_AP_O) || defined(_AP_W2) || defined(_AP_I2) || \ + defined(_AP_S2) || defined(_AP_Q2) || defined(_AP_O2) || \ + defined(_AP_N) || defined(_AP_N2)) +#error \ + "One or more of the following is defined: _AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N, _AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2. Definition conflicts with their usage as template parameters." +#endif + +/*for safety*/ +#if (defined(_AP_W3) || defined(_AP_S3) || defined(_AP_W4) || defined(_AP_S4)) +#error \ + "One or more of the following is defined: _AP_W3, _AP_S3, _AP_W4,_AP_S4. Definition conflicts with their usage as template parameters." +#endif + +#if (defined(_AP_W1) || defined(_AP_S1) || defined(_AP_T) || \ + defined(_AP_T1) || defined(_AP_T2) || defined(_AP_T3) || defined(_AP_T4)) +#error \ + "One or more of the following is defined: _AP_W1, _AP_S1, _AP_T, _AP_T1, _AP_T2, _AP_T3, _AP_T4. Definition conflicts with their usage as template parameters." +#endif + +#ifndef __cplusplus +#error "AP data type can only be used in C++" +#endif + +// ---------------------------------------------------------------------- + +#ifndef __SC_COMPATIBLE__ +/// ap_fixed quantification mode +enum ap_q_mode { + AP_RND, //< rounding to plus infinity + AP_RND_ZERO, //< rounding to zero + AP_RND_MIN_INF, //< rounding to minus infinity + AP_RND_INF, //< rounding to infinity + AP_RND_CONV, //< convergent rounding + AP_TRN, //< truncation + AP_TRN_ZERO, //< truncation to zero +}; + +// FIXME for legacy code +#ifndef SYSTEMC_INCLUDED +#define SC_RND AP_RND +#define SC_RND_ZERO AP_RND_ZERO +#define SC_RND_MIN_INF AP_RND_MIN_INF +#define SC_RND_INF AP_RND_INF +#define SC_RND_CONV AP_RND_CONV +#define SC_TRN AP_TRN +#define SC_TRN_ZERO AP_TRN_ZERO +#endif // !defined(SYSTEMC_INCLUDED) + +/// ap_fixed saturation mode +enum ap_o_mode { + AP_SAT, //< saturation + AP_SAT_ZERO, //< saturation to zero + AP_SAT_SYM, //< symmetrical saturation + AP_WRAP, //< wrap-around (*) + AP_WRAP_SM, //< sign magnitude wrap-around (*) +}; + +// FIXME for legacy code +#ifndef SYSTEMC_INCLUDED +#define SC_SAT AP_SAT +#define SC_SAT_ZERO AP_SAT_ZERO +#define SC_SAT_SYM AP_SAT_SYM +#define SC_WRAP AP_WRAP +#define SC_WRAP_SM AP_WRAP_SM +#endif // !defined(SYSTEMC_INCLUDED) + +#else // defined(__SC_COMPATIBLE__) + +// There will not be sc_fxdefs.h, and the emu should be defined by ap_fixed. 
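// [Editor's note] Hedged sketch of the quantization (ap_q_mode) and overflow
// (ap_o_mode) behaviour these enums select, exercised through ap_fixed from
// this tree. ap_fixed<8, 4> has lsb 2^-4 = 0.0625 and range [-8, 8); the
// printed values follow from the mode definitions around here.
#include <ap_fixed.h>
#include <iostream>

int main() {
    ap_fixed<8, 4, AP_TRN> t = 0.03125;      // half an lsb: truncates to 0
    ap_fixed<8, 4, AP_RND> r = 0.03125;      // rounds to plus infinity: 0.0625
    ap_fixed<8, 4, AP_RND, AP_SAT> s = 9.5;  // saturates at 7.9375
    ap_fixed<8, 4, AP_RND, AP_WRAP> w = 9.5; // wraps around to -6.5
    std::cout << t << " " << r << " " << s << " " << w << std::endl;
    return 0;
}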
+ +/// ap_fixed quantification mode +enum ap_q_mode { + SC_RND, //< rounding to plus infinity + SC_RND_ZERO, //< rounding to zero + SC_RND_MIN_INF, //< rounding to minus infinity + SC_RND_INF, //< rounding to infinity + SC_RND_CONV, //< convergent rounding + SC_TRN, //< truncation + SC_TRN_ZERO, //< truncation to zero +}; + +#define AP_RND SC_RND +#define AP_RND_ZERO SC_RND_ZERO +#define AP_RND_MIN_INF SC_RND_MIN_INF +#define AP_RND_INF SC_RND_INF +#define AP_RND_CONV SC_RND_CONV +#define AP_TRN SC_TRN +#define AP_TRN_ZERO SC_TRN_ZERO + +/// ap_fixed saturation mode +enum ap_o_mode { + SC_SAT, //< saturation + SC_SAT_ZERO, //< saturation to zero + SC_SAT_SYM, //< symmetrical saturation + SC_WRAP, //< wrap-around (*) + SC_WRAP_SM, //< sign magnitude wrap-around (*) +}; + +#define AP_SAT SC_SAT +#define AP_SAT_ZERO SC_SAT_ZERO +#define AP_SAT_SYM SC_SAT_SYM +#define AP_WRAP SC_WRAP +#define AP_WRAP_SM SC_WRAP_SM + +#endif // defined(__SC_COMPATIBLE__) + +template +struct ap_int_base; + +template +struct ap_int; + +template +struct ap_uint; + +template +struct ap_range_ref; + +template +struct ap_bit_ref; + +template +struct ap_concat_ref; + +template +struct ap_fixed_base; + +template +struct ap_fixed; + +template +struct ap_ufixed; + +template +struct af_range_ref; + +template +struct af_bit_ref; + +/// string base mode +enum BaseMode { AP_BIN = 2, AP_OCT = 8, AP_DEC = 10, AP_HEX = 16 }; + +#ifndef SYSTEMC_INCLUDED +#define SC_BIN 2 +#define SC_OCT 8 +#define SC_DEC 10 +#define SC_HEX 16 +#endif // !defined(SYSTEMC_INCLUDED) + +// Alias C data types +#ifdef _MSC_VER +typedef signed __int64 ap_slong; +typedef unsigned __int64 ap_ulong; +#else // !defined(_MSC_VER) +typedef signed long long ap_slong; +typedef unsigned long long ap_ulong; +#endif // !defined(_MSC_VER) + +enum { + _AP_SIZE_char = 8, + _AP_SIZE_short = sizeof(short) * 8, + _AP_SIZE_int = sizeof(int) * 8, + _AP_SIZE_long = sizeof(long) * 8, + _AP_SIZE_ap_slong = sizeof(ap_slong) * 8 +}; + +#endif // !defined(__AP_DECL_H__) + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_fixed.h b/hls4ml/templates/vivado/ap_types/ap_fixed.h index a25913a3c8..cd0192bcb9 100644 --- a/hls4ml/templates/vivado/ap_types/ap_fixed.h +++ b/hls4ml/templates/vivado/ap_types/ap_fixed.h @@ -1,360 +1,360 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_FIXED_H__ -#define __AP_FIXED_H__ - -#include -#include -#include - -//--------------------------------------------------------------- - -/// Signed Arbitrary Precision Fixed-Point Type. -// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h -template -struct ap_fixed : ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> { - typedef ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> Base; - // Constructor - /// default ctor - INLINE ap_fixed() : Base() {} - - /// default copy ctor - INLINE ap_fixed(const ap_fixed& op) { Base::V = op.V; } - - /// copy ctor from ap_fixed_base. 
- template - INLINE ap_fixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, - _AP_O2, _AP_N2>& op) - : Base(op) {} - - template - INLINE ap_fixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, - _AP_O2, _AP_N2>& op) - : Base(op) {} - - //// from ap_fixed - //template - //INLINE ap_fixed( - // const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} - - //template - //INLINE ap_fixed( - // const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} - - //// from ap_ufixed. - //template - //INLINE ap_fixed( - // const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { - //} - - //template - //INLINE ap_fixed( - // const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { - //} - - /// copy ctor from ap_int_base. - template - INLINE ap_fixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} - - template - INLINE ap_fixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} - - //// from ap_int. - //template - //INLINE ap_fixed(const ap_int<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, true>(op)) {} - - //template - //INLINE ap_fixed(const volatile ap_int<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, true>(op)) {} - - //// from ap_uint. - //template - //INLINE ap_fixed(const ap_uint<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, false>(op)) {} - - //template - //INLINE ap_fixed(const volatile ap_uint<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, false>(op)) {} - - // from ap_bit_ref. - template - INLINE ap_fixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} - - // from ap_range_ref. - template - INLINE ap_fixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} - - // from ap_concat_ref. - template - INLINE ap_fixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) - : Base(op) {} - - // from af_bit_ref. - template - INLINE ap_fixed( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - - // from af_range_ref. - template - INLINE ap_fixed( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - -// from c types. -#define CTOR(TYPE) \ - INLINE ap_fixed(TYPE v) : Base(v) {} - - CTOR(bool) - CTOR(char) - CTOR(signed char) - CTOR(unsigned char) - CTOR(short) - CTOR(unsigned short) - CTOR(int) - CTOR(unsigned int) - CTOR(long) - CTOR(unsigned long) - CTOR(ap_slong) - CTOR(ap_ulong) -#if _AP_ENABLE_HALF_ == 1 - CTOR(half) -#endif - CTOR(float) - CTOR(double) -#undef CTOR - - INLINE ap_fixed(const char* s) : Base(s) {} - - INLINE ap_fixed(const char* s, signed char rd) : Base(s, rd) {} - - // Assignment - // The assignment operator is technically inherited; however, it is always - // hidden by an explicitly or implicitly defined assignment operator for the - // derived class. - /* XXX ctor will be used when right is not of proper type. 
*/ - INLINE ap_fixed& operator=( - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { - Base::V = op.V; - return *this; - } - - INLINE void operator=( - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { - Base::V = op.V; - } - - INLINE ap_fixed& operator=( - const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { - Base::V = op.V; - return *this; - } - - INLINE void operator=( - const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { - Base::V = op.V; - } -}; // struct ap_fixed. - -//------------------------------------------------------------------- - -// Unsigned Arbitrary Precision Fixed-Point Type. -// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h -template -struct ap_ufixed : ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> { - typedef ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> Base; - // Constructor - /// default ctor - INLINE ap_ufixed() : Base() {} - - /// default copy ctor - INLINE ap_ufixed(const ap_ufixed& op) { Base::V = op.V; } - - /// copy ctor from ap_fixed_base - template - INLINE ap_ufixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, - _AP_O2, _AP_N2>& op) - : Base(op) {} - - /// copy ctor from ap_fixed_base - template - INLINE ap_ufixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, - _AP_O2, _AP_N2>& op) - : Base(op) {} - - //template - //INLINE ap_ufixed( - // const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} - - //template - //INLINE ap_ufixed( - // const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} - - //template - //INLINE ap_ufixed( - // const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { - //} - - //template - //INLINE ap_ufixed( - // const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { - //} - - /// copy ctor from ap_int_base. 
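The constructor overloads shown here (C types via the CTOR macro, strings with an explicit radix, other widths via ap_fixed_base) all funnel through ap_fixed_base's converting assignment, which applies the target's quantization and overflow modes. A short usage sketch with hypothetical widths and values:

#include "ap_fixed.h"

void ctor_sketch() {
  ap_fixed<16, 8> a = 42;        // CTOR(int)
  ap_fixed<16, 8> b = -2.75;     // CTOR(double)
  ap_fixed<16, 8> c("1.5", 10);  // string ctor, radix 10
  ap_fixed<12, 6> d(a);          // re-quantized through ap_fixed_base
  ap_ufixed<16, 8> u(b);         // signed-to-unsigned; wraps under AP_WRAP
  (void)c; (void)d; (void)u;
}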
- template - INLINE ap_ufixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} - - template - INLINE ap_ufixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} - - //template - //INLINE ap_ufixed(const ap_int<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, true>(op)) {} - - //template - //INLINE ap_ufixed(const volatile ap_int<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, true>(op)) {} - - //template - //INLINE ap_ufixed(const ap_uint<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, false>(op)) {} - - //template - //INLINE ap_ufixed(const volatile ap_uint<_AP_W2>& op) - // : Base(ap_int_base<_AP_W2, false>(op)) {} - - template - INLINE ap_ufixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} - - template - INLINE ap_ufixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} - - template - INLINE ap_ufixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) - : Base(op) {} - - template - INLINE ap_ufixed( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - - template - INLINE ap_ufixed( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - -#define CTOR(TYPE) \ - INLINE ap_ufixed(TYPE v) : Base(v) {} - - CTOR(bool) - CTOR(char) - CTOR(signed char) - CTOR(unsigned char) - CTOR(short) - CTOR(unsigned short) - CTOR(int) - CTOR(unsigned int) - CTOR(long) - CTOR(unsigned long) - CTOR(ap_slong) - CTOR(ap_ulong) -#if _AP_ENABLE_HALF_ == 1 - CTOR(half) -#endif - CTOR(float) - CTOR(double) -#undef CTOR - - INLINE ap_ufixed(const char* s) : Base(s) {} - - INLINE ap_ufixed(const char* s, signed char rd) : Base(s, rd) {} - - // Assignment - INLINE ap_ufixed& operator=( - const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { - Base::V = op.V; - return *this; - } - - INLINE void operator=( - const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { - Base::V = op.V; - } - - INLINE ap_ufixed& operator=( - const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { - Base::V = op.V; - return *this; - } - - INLINE void operator=(const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, - _AP_N>& op) volatile { - Base::V = op.V; - } -}; // struct ap_ufixed - - -#if !defined(__SYNTHESIS__) && (defined(SYSTEMC_H) || defined(SYSTEMC_INCLUDED)) -// XXX sc_trace overload for ap_fixed is already included in -// "ap_sysc/ap_sc_extras.h", so do not define in synthesis. -template -INLINE void sc_trace(sc_core::sc_trace_file* tf, - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op, - const std::string& name) { - tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); -} - -template -INLINE void sc_trace(sc_core::sc_trace_file* tf, - const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op, - const std::string& name) { - tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); -} -#endif // System C sim - -// Specialization of std containers, so that std::complex can have its -// image part automatically zero-initialized when only real part is provided. -#include - -#endif // ifndef __AP_FIXED_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_H__ +#define __AP_FIXED_H__ + +#include +#include +#include + +//--------------------------------------------------------------- + +/// Signed Arbitrary Precision Fixed-Point Type. +// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h +template +struct ap_fixed : ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> { + typedef ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> Base; + // Constructor + /// default ctor + INLINE ap_fixed() : Base() {} + + /// default copy ctor + INLINE ap_fixed(const ap_fixed& op) { Base::V = op.V; } + + /// copy ctor from ap_fixed_base. + template + INLINE ap_fixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_fixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + //// from ap_fixed + //template + //INLINE ap_fixed( + // const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //template + //INLINE ap_fixed( + // const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //// from ap_ufixed. + //template + //INLINE ap_fixed( + // const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + //template + //INLINE ap_fixed( + // const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + /// copy ctor from ap_int_base. + template + INLINE ap_fixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_fixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + //// from ap_int. + //template + //INLINE ap_fixed(const ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //template + //INLINE ap_fixed(const volatile ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //// from ap_uint. + //template + //INLINE ap_fixed(const ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + //template + //INLINE ap_fixed(const volatile ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + // from ap_bit_ref. + template + INLINE ap_fixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + // from ap_range_ref. + template + INLINE ap_fixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + // from ap_concat_ref. + template + INLINE ap_fixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) + : Base(op) {} + + // from af_bit_ref. + template + INLINE ap_fixed( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + // from af_range_ref. + template + INLINE ap_fixed( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +// from c types. 
+#define CTOR(TYPE) \ + INLINE ap_fixed(TYPE v) : Base(v) {} + + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + CTOR(half) +#endif + CTOR(float) + CTOR(double) +#undef CTOR + + INLINE ap_fixed(const char* s) : Base(s) {} + + INLINE ap_fixed(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + // The assignment operator is technically inherited; however, it is always + // hidden by an explicitly or implicitly defined assignment operator for the + // derived class. + /* XXX ctor will be used when right is not of proper type. */ + INLINE ap_fixed& operator=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } + + INLINE ap_fixed& operator=( + const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=( + const volatile ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } +}; // struct ap_fixed. + +//------------------------------------------------------------------- + +// Unsigned Arbitrary Precision Fixed-Point Type. +// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h +template +struct ap_ufixed : ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> { + typedef ap_fixed_base<_AP_W, _AP_I, false, _AP_Q, _AP_O, _AP_N> Base; + // Constructor + /// default ctor + INLINE ap_ufixed() : Base() {} + + /// default copy ctor + INLINE ap_ufixed(const ap_ufixed& op) { Base::V = op.V; } + + /// copy ctor from ap_fixed_base + template + INLINE ap_ufixed(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + /// copy ctor from ap_fixed_base + template + INLINE ap_ufixed(const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, + _AP_O2, _AP_N2>& op) + : Base(op) {} + + //template + //INLINE ap_ufixed( + // const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //template + //INLINE ap_ufixed( + // const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>(op)) {} + + //template + //INLINE ap_ufixed( + // const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + //template + //INLINE ap_ufixed( + // const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + // : Base(ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>(op)) { + //} + + /// copy ctor from ap_int_base. 
+ template + INLINE ap_ufixed(const ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_ufixed(const volatile ap_int_base<_AP_W2, _AP_S2>& op) : Base(op) {} + + //template + //INLINE ap_ufixed(const ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //template + //INLINE ap_ufixed(const volatile ap_int<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, true>(op)) {} + + //template + //INLINE ap_ufixed(const ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + //template + //INLINE ap_ufixed(const volatile ap_uint<_AP_W2>& op) + // : Base(ap_int_base<_AP_W2, false>(op)) {} + + template + INLINE ap_ufixed(const ap_bit_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_ufixed(const ap_range_ref<_AP_W2, _AP_S2>& op) : Base(op) {} + + template + INLINE ap_ufixed(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) + : Base(op) {} + + template + INLINE ap_ufixed( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_ufixed( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +#define CTOR(TYPE) \ + INLINE ap_ufixed(TYPE v) : Base(v) {} + + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + CTOR(half) +#endif + CTOR(float) + CTOR(double) +#undef CTOR + + INLINE ap_ufixed(const char* s) : Base(s) {} + + INLINE ap_ufixed(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + INLINE ap_ufixed& operator=( + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=( + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) volatile { + Base::V = op.V; + } + + INLINE ap_ufixed& operator=( + const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + + INLINE void operator=(const volatile ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, + _AP_N>& op) volatile { + Base::V = op.V; + } +}; // struct ap_ufixed + + +#if !defined(__SYNTHESIS__) && (defined(SYSTEMC_H) || defined(SYSTEMC_INCLUDED)) +// XXX sc_trace overload for ap_fixed is already included in +// "ap_sysc/ap_sc_extras.h", so do not define in synthesis. +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op, + const std::string& name) { + tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} + +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, + const ap_ufixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N>& op, + const std::string& name) { + tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} +#endif // System C sim + +// Specialization of std containers, so that std::complex can have its +// image part automatically zero-initialized when only real part is provided. +#include + +#endif // ifndef __AP_FIXED_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_fixed_base.h b/hls4ml/templates/vivado/ap_types/ap_fixed_base.h index 216f9772e5..1d94b938f0 100644 --- a/hls4ml/templates/vivado/ap_types/ap_fixed_base.h +++ b/hls4ml/templates/vivado/ap_types/ap_fixed_base.h @@ -1,2354 +1,2354 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_FIXED_BASE_H__ -#define __AP_FIXED_BASE_H__ - -#ifndef __AP_FIXED_H__ -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." -#endif - -// for ap_int_base and its reference types. -#include -#ifndef __SYNTHESIS__ -#if _AP_ENABLE_HALF_ == 1 -// for half type -#include -#endif -// for std io -#include -#endif - -#ifndef __cplusplus -#error "C++ is required to include this header file" -#else // __cplusplus - -// for warning on unsupported rounding mode in conversion to float/double. -#if !defined(__SYNTHESIS__) && __cplusplus >= 201103L && \ - (defined(__gnu_linux__) || defined(_WIN32)) -#define AP_FIXED_ENABLE_CPP_FENV 1 -#include -#endif - -// ---------------------------------------------------------------------- - -/* Major TODO - long double support: constructor, assign and other operators. - binary operators with ap_fixed_base and const char*. - return ap_fixed/ap_ufixed when result signedness is known. -*/ - -// Helper function in conversion to floating point types. - -#ifdef __SYNTHESIS__ -#define _AP_ctype_op_get_bit(var, index) _AP_ROOT_op_get_bit(var, index) -#define _AP_ctype_op_set_bit(var, index, x) _AP_ROOT_op_set_bit(var, index, x) -#define _AP_ctype_op_get_range(var, low, high) \ - _AP_ROOT_op_get_range(var, low, high) -#define _AP_ctype_op_set_range(var, low, high, x) \ - _AP_ROOT_op_set_range(var, low, high, x) -#else // ifdef __SYNTHESIS__ -template -inline bool _AP_ctype_op_get_bit(_Tp1& var, const _Tp2& index) { - return !!(var & (1ull << (index))); -} -template -inline _Tp1 _AP_ctype_op_set_bit(_Tp1& var, const _Tp2& index, const _Tp3& x) { - var |= (((x) ? 1ull : 0ull) << (index)); - return var; -} -template -inline _Tp1 _AP_ctype_op_get_range(_Tp1& var, const _Tp2& low, - const _Tp3& high) { - _Tp1 r = var; - ap_ulong mask = -1ll; - mask >>= (sizeof(_Tp1) * 8 - ((high) - (low) + 1)); - r >>= (low); - r &= mask; - return r; -} -template -inline _Tp1 _AP_ctype_op_set_range(_Tp1& var, const _Tp2& low, const _Tp3& high, - const _Tp4& x) { - ap_ulong mask = -1ll; - mask >>= (_AP_SIZE_ap_slong - ((high) - (low) + 1)); - var &= ~(mask << (low)); - var |= ((mask & x) << (low)); - return var; -} -#endif // ifdef __SYNTHESIS__ - - -// trait for letting base class to return derived class. -// Notice that derived class template is incomplete, and we cannot use -// the member of the derived class. -template -struct _ap_fixed_factory; -template -struct _ap_fixed_factory<_AP_W2, _AP_I2, true> { - typedef ap_fixed<_AP_W2, _AP_I2> type; -}; -template -struct _ap_fixed_factory<_AP_W2, _AP_I2, false> { - typedef ap_ufixed<_AP_W2, _AP_I2> type; -}; - -/// ap_fixed_base: AutoPilot fixed point. -/** partial specialization of signed. - @tparam _AP_W width. - @tparam _AP_I integral part width. - @tparam _AP_S signed. - @tparam _AP_Q quantization mode. Default is AP_TRN. - @tparam _AP_O saturation mode. Default is AP_WRAP. - @tparam _AP_N saturation wrap value. Default is 0. 
- */ -// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h -template -struct ap_fixed_base : _AP_ROOT_TYPE<_AP_W, _AP_S> { - public: - typedef _AP_ROOT_TYPE<_AP_W, _AP_S> Base; - static const int width = _AP_W; - static const int iwidth = _AP_I; - static const ap_q_mode qmode = _AP_Q; - static const ap_o_mode omode = _AP_O; - - /// Return type trait. - template - struct RType { - enum { - _AP_F = _AP_W - _AP_I, - F2 = _AP_W2 - _AP_I2, - mult_w = _AP_W + _AP_W2, - mult_i = _AP_I + _AP_I2, - mult_s = _AP_S || _AP_S2, - plus_w = AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + - 1 + AP_MAX(_AP_F, F2), - plus_i = - AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1, - plus_s = _AP_S || _AP_S2, - minus_w = - AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1 + - AP_MAX(_AP_F, F2), - minus_i = - AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1, - minus_s = true, -#ifndef __SC_COMPATIBLE__ - div_w = _AP_S2 + _AP_W + AP_MAX(F2, 0), -#else - div_w = _AP_S2 + _AP_W + AP_MAX(F2, 0) + AP_MAX(_AP_I2, 0), -#endif - div_i = _AP_S2 + _AP_I + F2, - div_s = _AP_S || _AP_S2, - logic_w = - AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + - AP_MAX(_AP_F, F2), - logic_i = AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)), - logic_s = _AP_S || _AP_S2 - }; - - typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> lhs; - typedef ap_fixed_base<_AP_W2, _AP_I2, _AP_S2> rhs; - - typedef ap_fixed_base mult_base; - typedef ap_fixed_base plus_base; - typedef ap_fixed_base minus_base; - typedef ap_fixed_base logic_base; - typedef ap_fixed_base div_base; - typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> arg1_base; - - typedef typename _ap_fixed_factory::type mult; - typedef typename _ap_fixed_factory::type plus; - typedef typename _ap_fixed_factory::type minus; - typedef typename _ap_fixed_factory::type logic; - typedef typename _ap_fixed_factory::type div; - typedef typename _ap_fixed_factory<_AP_W, _AP_I, _AP_S>::type arg1; - }; - - private: -#ifndef __SYNTHESIS__ - // This cannot handle hex float format string. - void fromString(const std::string& val, unsigned char radix) { - _AP_ERROR(!(radix == 2 || radix == 8 || radix == 10 || radix == 16), - "ap_fixed_base::fromString(%s, %d)", val.c_str(), radix); - - Base::V = 0; - int startPos = 0; - int endPos = val.length(); - int decPos = val.find("."); - if (decPos == -1) decPos = endPos; - - // handle sign - bool isNegative = false; - if (val[0] == '-') { - isNegative = true; - ++startPos; - } else if (val[0] == '+') - ++startPos; - - // If there are no integer bits, e.g.: - // .0000XXXX, then keep at least one bit. - // If the width is greater than the number of integer bits, e.g.: - // XXXX.XXXX, then we keep the integer bits - // if the number of integer bits is greater than the width, e.g.: - // XXX000 then we keep the integer bits. - // Always keep one bit. - ap_fixed_base - integer_bits = 0; - - // Figure out if we can shift instead of multiply - unsigned shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0); - - //std::cout << "\n\n" << val << "\n"; - //std::cout << startPos << " " << decPos << " " << endPos << "\n"; - - bool sticky_int = false; - - // Traverse the integer digits from the MSD, multiplying by radix as we go. 
- for (int i = startPos; i < decPos; i++) { - // Get a digit - char cdigit = val[i]; - if (cdigit == '\0') continue; - unsigned digit = ap_private_ops::decode_digit(cdigit, radix); - - sticky_int |= integer_bits[AP_MAX(_AP_I, 4) + 4 - 1] | - integer_bits[AP_MAX(_AP_I, 4) + 4 - 2] | - integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] | - integer_bits[AP_MAX(_AP_I, 4) + 4 - 4]; - // Shift or multiply the value by the radix - if (shift) - integer_bits <<= shift; - else - integer_bits *= radix; - - // Add in the digit we just interpreted - integer_bits += digit; - //std::cout << "idigit = " << digit << " " << integer_bits.to_string() - // << " " << sticky_int << "\n"; - } - integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] = - integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] | sticky_int; - - ap_fixed_base fractional_bits = 0; - bool sticky = false; - - // Traverse the fractional digits from the LSD, dividing by radix as we go. - for (int i = endPos - 1; i >= decPos + 1; i--) { - // Get a digit - char cdigit = val[i]; - if (cdigit == '\0') continue; - unsigned digit = ap_private_ops::decode_digit(cdigit, radix); - // Add in the digit we just interpreted - fractional_bits += digit; - - sticky |= fractional_bits[0] | fractional_bits[1] | fractional_bits[2] | - fractional_bits[3]; - // Shift or divide the value by the radix - if (shift) - fractional_bits >>= shift; - else - fractional_bits /= radix; - - //std::cout << "fdigit = " << digit << " " << fractional_bits.to_string() - // << " " << sticky << "\n"; - } - - //std::cout << "Int =" << integer_bits.to_string() << " " << - // fractional_bits.to_string() << "\n"; - - fractional_bits[0] = fractional_bits[0] | sticky; - - if (isNegative) - *this = -(integer_bits + fractional_bits); - else - *this = integer_bits + fractional_bits; - - //std::cout << "end = " << this->to_string(16) << "\n"; - } - - /// report invalid constrction of ap_fixed_base - INLINE void report() { - if (!_AP_S && _AP_O == AP_WRAP_SM) { - fprintf(stderr, "ap_ufxied<...> cannot support AP_WRAP_SM.\n"); - exit(1); - } - if (_AP_W > MAX_MODE(AP_INT_MAX_W) * 1024) { - fprintf(stderr, - "[E] ap_%sfixed<%d, ...>: Bitwidth exceeds the " - "default max value %d. Please use macro " - "AP_INT_MAX_W to set a larger max value.\n", - _AP_S ? "" : "u", _AP_W, MAX_MODE(AP_INT_MAX_W) * 1024); - exit(1); - } - } -#else - INLINE void report() {} -#endif // ifdef __SYNTHESIS__ - - /// @name helper functions. 
- // @{ - INLINE void overflow_adjust(bool underflow, bool overflow, bool lD, - bool sign) { - if (!underflow && !overflow) return; - if (_AP_O == AP_WRAP) { - if (_AP_N == 0) return; - if (_AP_S) { - // signed AP_WRAP - // n_bits == 1 - Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, sign); - if (_AP_N > 1) { - // n_bits > 1 - ap_int_base<_AP_W, false> mask(-1); - if (sign) mask.V = 0; - Base::V = - _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 2, mask.V); - } - } else { - // unsigned AP_WRAP - ap_int_base<_AP_W, false> mask(-1); - Base::V = - _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 1, mask.V); - } - } else if (_AP_O == AP_SAT_ZERO) { - Base::V = 0; - } else if (_AP_O == AP_WRAP_SM && _AP_S) { - bool Ro = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); - if (_AP_N == 0) { - if (lD != Ro) { - Base::V = ~Base::V; - Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, lD); - } - } else { - if (_AP_N == 1 && sign != Ro) { - Base::V = ~Base::V; - } else if (_AP_N > 1) { - bool lNo = _AP_ROOT_op_get_bit(Base::V, _AP_W - _AP_N); - if (lNo == sign) Base::V = ~Base::V; - ap_int_base<_AP_W, false> mask(-1); - if (sign) mask.V = 0; - Base::V = - _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 2, mask.V); - } - Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, sign); - } - } else { - if (_AP_S) { - if (overflow) { - Base::V = 1; - Base::V <<= _AP_W - 1; - Base::V = ~Base::V; - } else if (underflow) { - Base::V = 1; - Base::V <<= _AP_W - 1; - if (_AP_O == AP_SAT_SYM) Base::V |= 1; - } - } else { - if (overflow) - Base::V = ~(ap_int_base<_AP_W, false>(0).V); - else if (underflow) - Base::V = 0; - } - } - } - - INLINE bool quantization_adjust(bool qb, bool r, bool s) { - bool carry = (bool)_AP_ROOT_op_get_bit(Base::V, _AP_W - 1); - if (_AP_Q == AP_TRN) return false; - if (_AP_Q == AP_RND_ZERO) - qb &= s || r; - else if (_AP_Q == AP_RND_MIN_INF) - qb &= r; - else if (_AP_Q == AP_RND_INF) - qb &= !s || r; - else if (_AP_Q == AP_RND_CONV) - qb &= _AP_ROOT_op_get_bit(Base::V, 0) || r; - else if (_AP_Q == AP_TRN_ZERO) - qb = s && (qb || r); - Base::V += qb; - return carry && (!(bool)_AP_ROOT_op_get_bit(Base::V, _AP_W - 1)); - } - // @} - - public: - /// @name constructors. - // @{ - /// default ctor. - INLINE ap_fixed_base() {} - - /// copy ctor. 
- template - INLINE ap_fixed_base( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - operator=(op); - report(); - } - - template - INLINE ap_fixed_base( - const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - operator=(op); - report(); - } - - template - INLINE ap_fixed_base(const ap_int_base<_AP_W2, _AP_S2>& op) { - ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp; - tmp.V = op.V; - operator=(tmp); - report(); - } - - template - INLINE ap_fixed_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) { - ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp; - tmp.V = op.V; - operator=(tmp); - report(); - } - -#ifndef __SYNTHESIS__ -#ifndef NON_C99STRING - INLINE ap_fixed_base(const char* s, signed char rd = 0) { - unsigned char radix = rd; - std::string str = ap_private_ops::parseString(s, radix); // will guess rd, default 10 - _AP_ERROR(radix == 0, "ap_fixed_base(const char* \"%s\", %d), str=%s, radix = %d", - s, rd, str.c_str(), radix); // TODO remove this check - fromString(str, radix); - } -#else - INLINE ap_fixed_base(const char* s, signed char rd = 10) { - ap_int_base<_AP_W, _AP_S> t(s, rd); - Base::V = t.V; - } -#endif // ifndef NON_C99STRING -#else // ifndef __SYNTHESIS__ - // XXX _ssdm_string2bits only takes const string and const radix. - // It seems XFORM will do compile time processing of the string. - INLINE ap_fixed_base(const char* s) { - typeof(Base::V) t; - _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_I, _AP_S, _AP_Q, - _AP_O, _AP_N, _AP_C99); - Base::V = t; - } - INLINE ap_fixed_base(const char* s, signed char rd) { - typeof(Base::V) t; - _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_I, _AP_S, _AP_Q, - _AP_O, _AP_N, _AP_C99); - Base::V = t; - } -#endif // ifndef __SYNTHESIS__ else - - template - INLINE ap_fixed_base(const ap_bit_ref<_AP_W2, _AP_S2>& op) { - *this = ((bool)op); - report(); - } - - template - INLINE ap_fixed_base(const ap_range_ref<_AP_W2, _AP_S2>& op) { - *this = (ap_int_base<_AP_W2, false>(op)); - report(); - } - - template - INLINE ap_fixed_base( - const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) { - *this = (ap_int_base<_AP_W2 + _AP_W3, false>(op)); - report(); - } - - template - INLINE ap_fixed_base( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - *this = (bool(op)); - report(); - } - - template - INLINE ap_fixed_base( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - *this = (ap_int_base<_AP_W2, false>(op)); - report(); - } - - // ctors from c types. - // make a temp ap_fixed_base first, and use ap_fixed_base.operator= -#define CTOR_FROM_INT(C_TYPE, _AP_W2, _AP_S2) \ - INLINE ap_fixed_base(const C_TYPE x) { \ - ap_fixed_base<(_AP_W2), (_AP_W2), (_AP_S2)> tmp; \ - tmp.V = x; \ - *this = tmp; \ - } - - CTOR_FROM_INT(bool, 1, false) - CTOR_FROM_INT(char, 8, CHAR_IS_SIGNED) - CTOR_FROM_INT(signed char, 8, true) - CTOR_FROM_INT(unsigned char, 8, false) - CTOR_FROM_INT(short, _AP_SIZE_short, true) - CTOR_FROM_INT(unsigned short, _AP_SIZE_short, false) - CTOR_FROM_INT(int, _AP_SIZE_int, true) - CTOR_FROM_INT(unsigned int, _AP_SIZE_int, false) - CTOR_FROM_INT(long, _AP_SIZE_long, true) - CTOR_FROM_INT(unsigned long, _AP_SIZE_long, false) - CTOR_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) - CTOR_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) -#undef CTOR_FROM_INT -/* - * TODO: - *Theere used to be several funtions which were AP_WEAK. 
- *Now they're all INLINE expect ap_fixed_base(double d) - *Maybe we can use '#pragma HLS inline' instead of INLINE. - */ - AP_WEAK ap_fixed_base(double d) { - ap_int_base<64, false> ireg; - ireg.V = doubleToRawBits(d); - bool isneg = _AP_ROOT_op_get_bit(ireg.V, 63); - - ap_int_base exp; - ap_int_base exp_tmp; - exp_tmp.V = - _AP_ROOT_op_get_range(ireg.V, DOUBLE_MAN, DOUBLE_MAN + DOUBLE_EXP - 1); - exp = exp_tmp - DOUBLE_BIAS; - ap_int_base man; - man.V = _AP_ROOT_op_get_range(ireg.V, 0, DOUBLE_MAN - 1); - // do not support NaN - _AP_WARNING(exp == APFX_IEEE_DOUBLE_E_MAX + 1 && man.V != 0, - "assign NaN to fixed point value"); - man.V = _AP_ROOT_op_set_bit(man.V, DOUBLE_MAN, 1); - if (isneg) man = -man; - if ((ireg.V & 0x7fffffffffffffffLL) == 0) { - Base::V = 0; - } else { - int _AP_W2 = DOUBLE_MAN + 2, _AP_I2 = exp.V + 2, _AP_F = _AP_W - _AP_I, - F2 = _AP_W2 - _AP_I2; - bool _AP_S2 = true, - QUAN_INC = F2 > _AP_F && - !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); - bool carry = false; - // handle quantization - unsigned sh_amt = (F2 > _AP_F) ? F2 - _AP_F : _AP_F - F2; - if (F2 == _AP_F) - Base::V = man.V; - else if (F2 > _AP_F) { - if (sh_amt < DOUBLE_MAN + 2) - Base::V = man.V >> sh_amt; - else { - Base::V = isneg ? -1 : 0; - } - if ((_AP_Q != AP_TRN) && !((_AP_Q == AP_TRN_ZERO) && !_AP_S2)) { - bool qb = (F2 - _AP_F > _AP_W2) ? isneg : (bool)_AP_ROOT_op_get_bit( - man.V, F2 - _AP_F - 1); - bool r = - (F2 > _AP_F + 1) - ? _AP_ROOT_op_get_range(man.V, 0, (F2 - _AP_F - 2 < _AP_W2) - ? (F2 - _AP_F - 2) - : (_AP_W2 - 1)) != 0 - : false; - carry = quantization_adjust(qb, r, isneg); - } - } else { // no quantization - Base::V = man.V; - if (sh_amt < _AP_W) - Base::V = Base::V << sh_amt; - else - Base::V = 0; - } - // handle overflow/underflow - if ((_AP_O != AP_WRAP || _AP_N != 0) && - ((!_AP_S && _AP_S2) || - _AP_I - _AP_S < - _AP_I2 - _AP_S2 + - (QUAN_INC || - (_AP_S2 && (_AP_O == AP_SAT_SYM))))) { // saturation - bool deleted_zeros = _AP_S2 ? true : !carry, deleted_ones = true; - bool neg_src = isneg; - bool lD = false; - int pos1 = F2 - _AP_F + _AP_W; - int pos2 = F2 - _AP_F + _AP_W + 1; - bool newsignbit = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); - if (pos1 < _AP_W2 && pos1 >= 0) - // lD = _AP_ROOT_op_get_bit(man.V, pos1); - lD = (man.V >> pos1) & 1; - if (pos1 < _AP_W2) { - bool Range1_all_ones = true; - bool Range1_all_zeros = true; - bool Range2_all_ones = true; - ap_int_base Range2; - ap_int_base all_ones(-1); - - if (pos2 >= 0 && pos2 < _AP_W2) { - // Range2.V = _AP_ROOT_op_get_range(man.V, - // pos2, _AP_W2 - 1); - Range2.V = man.V; - Range2.V >>= pos2; - Range2_all_ones = Range2 == (all_ones >> pos2); - } else if (pos2 < 0) - Range2_all_ones = false; - if (pos1 >= 0 && pos2 < _AP_W2) { - Range1_all_ones = Range2_all_ones && lD; - Range1_all_zeros = !Range2.V && !lD; - } else if (pos2 == _AP_W2) { - Range1_all_ones = lD; - Range1_all_zeros = !lD; - } else if (pos1 < 0) { - Range1_all_zeros = !man.V; - Range1_all_ones = false; - } - - deleted_zeros = - deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros); - deleted_ones = - carry ? Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones; - neg_src = isneg && !(carry && Range1_all_ones); - } else - neg_src = isneg && newsignbit; - bool neg_trg = _AP_S && newsignbit; - bool overflow = (neg_trg || !deleted_zeros) && !isneg; - bool underflow = (!neg_trg || !deleted_ones) && neg_src; - if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S) - underflow |= - neg_src && - (_AP_W > 1 ? 
_AP_ROOT_op_get_range(Base::V, 0, _AP_W - 2) == 0 - : true); - overflow_adjust(underflow, overflow, lD, neg_src); - } - } - report(); - } - - // TODO more optimized implementation. - INLINE ap_fixed_base(float d) { *this = ap_fixed_base(double(d)); } - -#if _AP_ENABLE_HALF_ == 1 - // TODO more optimized implementation. - INLINE ap_fixed_base(half d) { *this = ap_fixed_base(double(d)); } -#endif - // @} - - /// @name assign operator - /// assign, using another ap_fixed_base of same template parameters. - /* - INLINE ap_fixed_base& operator=( - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { - Base::V = op.V; - return *this; - } - */ - - template - INLINE ap_fixed_base& operator=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - - const int _AP_F = _AP_W - _AP_I; - const int F2 = _AP_W2 - _AP_I2; - const int QUAN_INC = - F2 > _AP_F && !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); - - if (!op) Base::V = 0; - bool carry = false; - bool signbit = _AP_ROOT_op_get_bit(op.V, _AP_W2 - 1); - bool isneg = signbit && _AP_S2; - if (F2 == _AP_F) - Base::V = op.V; - else if (F2 > _AP_F) { - unsigned int sh_amt = F2 - _AP_F; - // moves bits right, handle quantization. - if (sh_amt < _AP_W2) { - Base::V = op.V >> sh_amt; - } else { - Base::V = isneg ? -1 : 0; - } - if (_AP_Q != AP_TRN && !(_AP_Q == AP_TRN_ZERO && !_AP_S2)) { - bool qbit = _AP_ROOT_op_get_bit(op.V, F2 - _AP_F - 1); - // bit after LSB. - bool qb = (F2 - _AP_F > _AP_W2) ? _AP_S2 && signbit : qbit; - enum { hi = ((F2 - _AP_F - 2) < _AP_W2) ? (F2 - _AP_F - 2) : (_AP_W2 - 1) }; - // bits after qb. - bool r = (F2 > _AP_F + 1) ? (_AP_ROOT_op_get_range(op.V, 0, hi) != 0) : false; - carry = quantization_adjust(qb, r, isneg); - } - } else { - unsigned sh_amt = _AP_F - F2; - // moves bits left, no quantization - if (sh_amt < _AP_W) { - if (_AP_W > _AP_W2) { - // extend and then shift, avoid losing bits. - Base::V = op.V; - Base::V <<= sh_amt; - } else { - // shift and truncate. - Base::V = op.V << sh_amt; - } - } else { - Base::V = 0; - } - } - // handle overflow/underflow - if ((_AP_O != AP_WRAP || _AP_N != 0) && - ((!_AP_S && _AP_S2) || - _AP_I - _AP_S < - _AP_I2 - _AP_S2 + - (QUAN_INC || (_AP_S2 && _AP_O == AP_SAT_SYM)))) { // saturation - bool deleted_zeros = _AP_S2 ? true : !carry; - bool deleted_ones = true; - bool neg_src = isneg; - bool newsignbit = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); - enum { pos1 = F2 - _AP_F + _AP_W, pos2 = F2 - _AP_F + _AP_W + 1 }; - bool lD = (pos1 < _AP_W2 && pos1 >= 0) ? _AP_ROOT_op_get_bit(op.V, pos1) - : false; - if (pos1 < _AP_W2) { - bool Range1_all_ones = true; - bool Range1_all_zeros = true; - bool Range2_all_ones = true; - ap_int_base<_AP_W2, false> all_ones(-1); - - if (pos2 < _AP_W2 && pos2 >= 0) { - ap_int_base<_AP_W2, false> Range2; - Range2.V = _AP_ROOT_op_get_range(op.V, pos2, _AP_W2 - 1); - Range2_all_ones = Range2 == (all_ones >> pos2); - } else if (pos2 < 0) { - Range2_all_ones = false; - } - - if (pos1 >= 0 && pos2 < _AP_W2) { - ap_int_base<_AP_W2, false> Range1; - Range1.V = _AP_ROOT_op_get_range(op.V, pos1, _AP_W2 - 1); - Range1_all_ones = Range1 == (all_ones >> pos1); - Range1_all_zeros = !Range1.V; - } else if (pos2 == _AP_W2) { - Range1_all_ones = lD; - Range1_all_zeros = !lD; - } else if (pos1 < 0) { - Range1_all_zeros = !op.V; - Range1_all_ones = false; - } - - deleted_zeros = - deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros); - deleted_ones = - carry ? 
Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones; - neg_src = isneg && !(carry && Range1_all_ones); - } else - neg_src = isneg && newsignbit; - bool neg_trg = _AP_S && newsignbit; - bool overflow = (neg_trg || !deleted_zeros) && !isneg; - bool underflow = (!neg_trg || !deleted_ones) && neg_src; - if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S) - underflow |= - neg_src && - (_AP_W > 1 ? _AP_ROOT_op_get_range(Base::V, 0, _AP_W - 2) == 0 - : true); - - overflow_adjust(underflow, overflow, lD, neg_src); - } - return *this; - } // operator= - - template - INLINE ap_fixed_base& operator=( - const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - operator=(const_cast&>(op)); - return *this; - } - - /// Set this ap_fixed_base with ULL. - INLINE ap_fixed_base& setBits(ap_ulong bv) { - // TODO when ull is not be long enough... - Base::V = bv; - return *this; - } - - /// Return a ap_fixed_base object whose this->V is assigned by bv. - static INLINE ap_fixed_base bitsToFixed(ap_ulong bv) { - // TODO fix when ull is not be long enough... - ap_fixed_base t; -#ifdef __SYNTHESIS__ - t.V = bv; -#else - t.V.set_bits(bv); -#endif - return t; - } - - // Explicit conversion functions to ap_int_base. - /** Captures all integer bits, in truncate mode. - * @param[in] Cnative follow conversion from double to int. - */ - INLINE ap_int_base to_ap_int_base( - bool Cnative = true) const { - ap_int_base ret; - if (_AP_I == 0) { - ret.V = 0; - } else if (_AP_I > 0 && _AP_I <= _AP_W) { - ret.V = _AP_ROOT_op_get_range(Base::V, _AP_W - _AP_I, _AP_W - 1); - } else if (_AP_I > _AP_W) { - ret.V = _AP_ROOT_op_get_range(Base::V, 0, _AP_W - 1); - ret.V <<= (_AP_I - _AP_W); - } - /* Consider the following case - * float f = -7.5f; - * ap_fixed<8,4> t = f; // -8 0 0 0 . 0.5 - * int i = t.to_int(); - * the result should be -7 instead of -8. - * Therefore, after truncation, the value should be increated by 1. - * For (-1, 0), carry to MSB will happen, but result 0 is still correct. - */ - if (Cnative && _AP_I < _AP_W) { - // Follow C native data type, conversion from double to int - if (_AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1) && (_AP_I < _AP_W) && - (_AP_ROOT_op_get_range( - Base::V, 0, _AP_I < 0 ? _AP_W - 1 : _AP_W - _AP_I - 1) != 0)) - ++ret; - } else { - // Follow OSCI library, conversion from sc_fixed to sc_int - } - return ret; - }; - - public: - template - INLINE operator ap_int_base<_AP_W2, _AP_S2>() const { - return ap_int_base<_AP_W2, _AP_S2>(to_ap_int_base()); - } - - // Explicit conversion function to C built-in integral type. - INLINE char to_char() const { return to_ap_int_base().to_char(); } - - INLINE int to_int() const { return to_ap_int_base().to_int(); } - - INLINE unsigned to_uint() const { return to_ap_int_base().to_uint(); } - - INLINE ap_slong to_int64() const { return to_ap_int_base().to_int64(); } - - INLINE ap_ulong to_uint64() const { return to_ap_int_base().to_uint64(); } - - /// covert function to double. - /** only round-half-to-even mode supported, does not obey FE env. */ - INLINE double to_double() const { -#if defined(AP_FIXED_ENABLE_CPP_FENV) - _AP_WARNING(std::fegetround() != FE_TONEAREST, - "Only FE_TONEAREST is supported"); -#endif - enum { BITS = DOUBLE_MAN + DOUBLE_EXP + 1 }; - if (!Base::V) return 0.0f; - bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. - ap_int_base<_AP_W, false> tmp; - if (s) - tmp.V = -Base::V; // may truncate one bit extra from neg in sim. 
- else - tmp.V = Base::V; - int l = tmp.countLeadingZeros(); ///< number of leading zeros. - int e = _AP_I - l - 1 + DOUBLE_BIAS; ///< exponent - int lsb_index = _AP_W - l - 1 - DOUBLE_MAN; - // more than 0.5? - bool a = (lsb_index >=2) ? - (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; - // round to even - a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; - // ull is at least 64-bit - ap_ulong m; - // may actually left shift, ensure buffer is wide enough. - if (_AP_W > BITS) { - m = (lsb_index >= 1) ? (ap_ulong)(tmp.V >> (lsb_index - 1)) - : (ap_ulong)(tmp.V << (1 - lsb_index)); - } else { - m = (ap_ulong)tmp.V; - m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) - : (m << (1 - lsb_index)); - } - m += a; - m >>= 1; - //std::cout << '\n' << std::hex << m << '\n'; // TODO delete this - // carry to MSB, increase exponent - if (_AP_ctype_op_get_bit(m, DOUBLE_MAN + 1)) { - e += 1; - } - // set sign and exponent - m = _AP_ctype_op_set_bit(m, BITS - 1, s); - //std::cout << m << '\n'; // TODO delete this - m = _AP_ctype_op_set_range(m, DOUBLE_MAN, DOUBLE_MAN + DOUBLE_EXP - 1, e); - //std::cout << std::hex << m << std::dec << std::endl; // TODO delete this - // cast to fp - return rawBitsToDouble(m); - } - - /// convert function to float. - /** only round-half-to-even mode supported, does not obey FE env. */ - INLINE float to_float() const { -#if defined(AP_FIXED_ENABLE_CPP_FENV) - _AP_WARNING(std::fegetround() != FE_TONEAREST, - "Only FE_TONEAREST is supported"); -#endif - enum { BITS = FLOAT_MAN + FLOAT_EXP + 1 }; - if (!Base::V) return 0.0f; - bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. - ap_int_base<_AP_W, false> tmp; - if (s) - tmp.V = -Base::V; // may truncate one bit extra from neg in sim. - else - tmp.V = Base::V; - int l = tmp.countLeadingZeros(); ///< number of leading zeros. - int e = _AP_I - l - 1 + FLOAT_BIAS; ///< exponent - int lsb_index = _AP_W - l - 1 - FLOAT_MAN; - // more than 0.5? - bool a = (lsb_index >=2) ? - (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; - // round to even - a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; - // ul is at least 32-bit - unsigned long m; - // may actually left shift, ensure buffer is wide enough. - if (_AP_W > BITS) { - m = (lsb_index >= 1) ? (unsigned long)(tmp.V >> (lsb_index - 1)) - : (unsigned long)(tmp.V << (1 - lsb_index)); - } else { - m = (unsigned long)tmp.V; - m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) - : (m << (1 - lsb_index)); - } - m += a; - m >>= 1; - // carry to MSB, increase exponent - if (_AP_ctype_op_get_bit(m, FLOAT_MAN + 1)) { - e += 1; - } - // set sign and exponent - m = _AP_ctype_op_set_bit(m, BITS - 1, s); - m = _AP_ctype_op_set_range(m, FLOAT_MAN, FLOAT_MAN + FLOAT_EXP - 1, e); - // cast to fp - return rawBitsToFloat(m); - } - -#if _AP_ENABLE_HALF_ == 1 - /// convert function to half. - /** only round-half-to-even mode supported, does not obey FE env. */ - INLINE half to_half() const { -#if defined(AP_FIXED_ENABLE_CPP_FENV) - _AP_WARNING(std::fegetround() != FE_TONEAREST, - "Only FE_TONEAREST is supported"); -#endif - enum { BITS = HALF_MAN + HALF_EXP + 1 }; - if (!Base::V) return 0.0f; - bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. - ap_int_base<_AP_W, false> tmp; - if (s) - tmp.V = -Base::V; // may truncate one bit extra from neg in sim. - else - tmp.V = Base::V; - int l = tmp.countLeadingZeros(); ///< number of leading zeros. 
- int e = _AP_I - l - 1 + HALF_BIAS; ///< exponent - int lsb_index = _AP_W - l - 1 - HALF_MAN; - // more than 0.5? - bool a = (lsb_index >=2) ? - (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; - // round to even - a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; - // short is at least 16-bit - unsigned short m; - // may actually left shift, ensure buffer is wide enough. - if (_AP_W > BITS) { - m = (lsb_index >= 1) ? (unsigned short)(tmp.V >> (lsb_index - 1)) - : (unsigned short)(tmp.V << (1 - lsb_index)); - } else { - m = (unsigned short)tmp.V; - m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) - : (m << (1 - lsb_index)); - } - m += a; - m >>= 1; - // carry to MSB, increase exponent - if (_AP_ctype_op_get_bit(m, HALF_MAN + 1)) { - e += 1; - } - // set sign and exponent - m = _AP_ctype_op_set_bit(m, BITS - 1, s); - m = _AP_ctype_op_set_range(m, HALF_MAN, HALF_MAN + HALF_EXP - 1, e); - // cast to fp - return rawBitsToHalf(m); - } -#endif - - // FIXME inherited from old code, this may loose precision! - INLINE operator long double() const { return (long double)to_double(); } - - INLINE operator double() const { return to_double(); } - - INLINE operator float() const { return to_float(); } - -#if _AP_ENABLE_HALF_ == 1 - INLINE operator half() const { return to_half(); } -#endif - - INLINE operator bool() const { return (bool)Base::V != 0; } - - INLINE operator char() const { return (char)to_int(); } - - INLINE operator signed char() const { return (signed char)to_int(); } - - INLINE operator unsigned char() const { return (unsigned char)to_uint(); } - - INLINE operator short() const { return (short)to_int(); } - - INLINE operator unsigned short() const { return (unsigned short)to_uint(); } - - INLINE operator int() const { return to_int(); } - - INLINE operator unsigned int() const { return to_uint(); } - -// FIXME don't assume data width... -#ifdef __x86_64__ - INLINE operator long() const { return (long)to_int64(); } - - INLINE operator unsigned long() const { return (unsigned long)to_uint64(); } -#else - INLINE operator long() const { return (long)to_int(); } - - INLINE operator unsigned long() const { return (unsigned long)to_uint(); } -#endif // ifdef __x86_64__ else - - INLINE operator ap_ulong() const { return to_uint64(); } - - INLINE operator ap_slong() const { return to_int64(); } - - INLINE int length() const { return _AP_W; }; - - // bits_to_int64 deleted. -#ifndef __SYNTHESIS__ - // Used in autowrap, when _AP_W < 64. - INLINE ap_ulong bits_to_uint64() const { - return (Base::V).to_uint64(); - } -#endif - - // Count the number of zeros from the most significant bit - // to the first one bit. Note this is only for ap_fixed_base whose - // _AP_W <= 64, otherwise will incur assertion. - INLINE int countLeadingZeros() { -#ifdef __SYNTHESIS__ - // TODO: used llvm.ctlz intrinsic ? - if (_AP_W <= 32) { - ap_int_base<32, false> t(-1ULL); - t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1); - return __builtin_ctz(t.V); - } else if (_AP_W <= 64) { - ap_int_base<64, false> t(-1ULL); - t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1); - return __builtin_ctzll(t.V); - } else { - enum {__N = (_AP_W + 63) / 64}; - int NZeros = 0; - int i = 0; - bool hitNonZero = false; - for (i = 0; i < __N - 1; ++i) { - ap_int_base<64, false> t; - t.range(0, 63) = this->range(_AP_W - i * 64 - 64, _AP_W - i * 64 - 1); - NZeros += hitNonZero ? 
0 : __builtin_clzll(t.V); - hitNonZero |= (t != 0); - } - if (!hitNonZero) { - ap_int_base<64, false> t(-1ULL); - t.range(63 - (_AP_W - 1) % 64, 63) = this->range(0, (_AP_W - 1) % 64); - NZeros += __builtin_clzll(t.V); - } - return NZeros; - } -#else - return Base::V.countLeadingZeros(); -#endif - } - - // Arithmetic : Binary - // ------------------------------------------------------------------------- - template - INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::mult operator*( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) - const { - typename RType<_AP_W2, _AP_I2, _AP_S2>::mult_base r, t; - r.V = Base::V; - t.V = op2.V; - r.V *= op2.V; - return r; - } - - // multiply function deleted. - - template - INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::div operator/( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) - const { - typename RType<_AP_W2, _AP_I2, _AP_S2>::div_base r; -#ifndef __SYNTHESIS__ - enum {F2 = _AP_W2-_AP_I2, - _W1=AP_MAX(_AP_W + AP_MAX(F2, 0) + ((_AP_S2 && !_AP_S) ? 1 : 0), _AP_W2 + ((_AP_S && !_AP_S2) ? 1 : 0))}; - ap_int_base<_W1,_AP_S||_AP_S2> dividend,divisior; - ap_int_base<_W1,_AP_S> tmp1; - ap_int_base<_W1,_AP_S2> tmp2; - tmp1.V = Base::V; - tmp1.V <<= AP_MAX(F2,0); - tmp2.V = op2.V; - dividend = tmp1; - divisior = tmp2; - r.V = ((_AP_S||_AP_S2) ? dividend.V.sdiv(divisior.V): dividend.V.udiv(divisior.V)); -#else - #ifndef __SC_COMPATIBLE__ - ap_fixed_base<_AP_W + AP_MAX(_AP_W2 - _AP_I2, 0),_AP_I, _AP_S> t(*this); - #else - ap_fixed_base<_AP_W + AP_MAX(_AP_W2 - _AP_I2, 0) + AP_MAX(_AP_I2, 0),_AP_I, _AP_S> t(*this); - #endif - r.V = t.V / op2.V; -#endif -/* - enum { - F2 = _AP_W2 - _AP_I2, - shl = AP_MAX(F2, 0) + AP_MAX(_AP_I2, 0), -#ifndef __SC_COMPATIBLE__ - shr = AP_MAX(_AP_I2, 0), -#else - shr = 0, -#endif - W3 = _AP_S2 + _AP_W + shl, - S3 = _AP_S || _AP_S2, - }; - ap_int_base dividend, t; - dividend.V = Base::V; - // multiply both by (1 << F2), and than do integer division. - dividend.V <<= (int) shl; -#ifdef __SYNTHESIS__ - // .V's have right signedness, and will have right extending. - t.V = dividend.V / op2.V; -#else - // XXX op2 may be wider than dividend, and sdiv and udiv takes the same with - // as left hand operand, so data might be truncated by mistake if not - // handled here. - t.V = S3 ? dividend.V.sdiv(op2.V) : dividend.V.udiv(op2.V); -#endif - r.V = t.V >> (int) shr; -*/ - return r; - } - -#define OP_BIN_AF(Sym, Rty) \ - template \ - INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::Rty operator Sym( \ - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& \ - op2) const { \ - typename RType<_AP_W2, _AP_I2, _AP_S2>::Rty##_base ret, lhs(*this), \ - rhs(op2); \ - ret.V = lhs.V Sym rhs.V; \ - return ret; \ - } - - OP_BIN_AF(+, plus) - OP_BIN_AF(-, minus) - OP_BIN_AF(&, logic) - OP_BIN_AF(|, logic) - OP_BIN_AF(^, logic) - -// Arithmetic : assign -// ------------------------------------------------------------------------- -#define OP_ASSIGN_AF(Sym) \ - template \ - INLINE ap_fixed_base& operator Sym##=( \ - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& \ - op2) { \ - *this = operator Sym(op2); \ - return *this; \ - } - - OP_ASSIGN_AF(*) - OP_ASSIGN_AF(/) - OP_ASSIGN_AF(+) - OP_ASSIGN_AF(-) - OP_ASSIGN_AF(&) - OP_ASSIGN_AF(|) - OP_ASSIGN_AF(^) - - // Prefix and postfix increment and decrement. 
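operator*, operator/ and the OP_BIN_AF operators size their result with the RType trait above, so mixed-width arithmetic is lossless until an assignment re-quantizes it. A sketch with hypothetical widths, following the RType formulas:

#include "ap_fixed.h"

void rtype_sketch() {
  ap_fixed<8, 4> x = 2.5;     // 4 fraction bits
  ap_ufixed<10, 3> y = 1.25;  // 7 fraction bits
  // mult: width 8 + 10 = 18, integer 4 + 3 = 7, signed if either side is
  ap_fixed<18, 7> p = x * y;
  // plus: widest integer part (4) plus a carry bit, widest fraction (7)
  ap_fixed<12, 5> s = x + y;
  (void)p; (void)s;
}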
- // ------------------------------------------------------------------------- - - /// Prefix increment - INLINE ap_fixed_base& operator++() { - operator+=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1)); - return *this; - } - - /// Prefix decrement. - INLINE ap_fixed_base& operator--() { - operator-=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1)); - return *this; - } - - /// Postfix increment - INLINE const ap_fixed_base operator++(int) { - ap_fixed_base r(*this); - operator++(); - return r; - } - - /// Postfix decrement - INLINE const ap_fixed_base operator--(int) { - ap_fixed_base r(*this); - operator--(); - return r; - } - - // Unary arithmetic. - // ------------------------------------------------------------------------- - INLINE ap_fixed_base operator+() { return *this; } - - INLINE ap_fixed_base<_AP_W + 1, _AP_I + 1, true> operator-() const { - ap_fixed_base<_AP_W + 1, _AP_I + 1, true> r(*this); - r.V = -r.V; - return r; - } - - INLINE ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> getNeg() { - ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> r(*this); - r.V = -r.V; - return r; - } - - // Not (!) - // ------------------------------------------------------------------------- - INLINE bool operator!() const { return Base::V == 0; } - - // Bitwise complement - // ------------------------------------------------------------------------- - // XXX different from Mentor's ac_fixed. - INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S> operator~() const { - ap_fixed_base<_AP_W, _AP_I, _AP_S> r; - r.V = ~Base::V; - return r; - } - - // Shift - // ------------------------------------------------------------------------- - // left shift is the same as moving point right, i.e. increate I. - template - INLINE ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> lshift() const { - ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> r; - r.V = Base::V; - return r; - } - - template - INLINE ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> rshift() const { - ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> r; - r.V = Base::V; - return r; - } - - // Because the return type is the type of the the first operand, shift assign - // operators do not carry out any quantization or overflow - // While systemc, shift assigns for sc_fixed/sc_ufixed will result in - // quantization or overflow (depending on the mode of the first operand) - INLINE ap_fixed_base operator<<(unsigned int sh) const { - ap_fixed_base r; - r.V = Base::V << sh; -// TODO check shift overflow? -#ifdef __SC_COMPATIBLE__ - if (sh == 0) return r; - if (_AP_O != AP_WRAP || _AP_N != 0) { - bool neg_src = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); - bool allones, allzeros; - ap_int_base<_AP_W, false> ones(-1); - if (sh <= _AP_W) { - ap_int_base<_AP_W, false> range1; - range1.V = _AP_ROOT_op_get_range( - const_cast(this)->Base::V, _AP_W - sh, _AP_W - 1); - allones = range1 == (ones >> (_AP_W - sh)); - allzeros = range1 == 0; - } else { - allones = false; - allzeros = Base::V == 0; - } - bool overflow = !allzeros && !neg_src; - bool underflow = !allones && neg_src; - if ((_AP_O == AP_SAT_SYM) && _AP_S) - underflow |= - neg_src && - (_AP_W > 1 ? _AP_ROOT_op_get_range(r.V, 0, _AP_W - 2) == 0 : true); - bool lD = false; - if (sh < _AP_W) lD = _AP_ROOT_op_get_bit(Base::V, _AP_W - sh - 1); - r.overflow_adjust(underflow, overflow, lD, neg_src); - } -#endif - return r; - } - - INLINE ap_fixed_base operator>>(unsigned int sh) const { - ap_fixed_base r; - r.V = Base::V >> sh; -// TODO check shift overflow? 
-#ifdef __SC_COMPATIBLE__ - if (sh == 0) return r; - if (_AP_Q != AP_TRN) { - bool qb = false; - if (sh <= _AP_W) qb = _AP_ROOT_op_get_bit(Base::V, sh - 1); - bool rb = false; - if (sh > 1 && sh <= _AP_W) - rb = _AP_ROOT_op_get_range(const_cast(this)->Base::V, 0, - sh - 2) != 0; - else if (sh > _AP_W) - rb = Base::V != 0; - r.quantization_adjust(qb, rb, - _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)); - } -#endif - return r; - } - - // left and right shift for int - INLINE ap_fixed_base operator<<(int sh) const { - ap_fixed_base r; - bool isNeg = sh < 0; - unsigned int ush = isNeg ? -sh : sh; - if (isNeg) { - return operator>>(ush); - } else { - return operator<<(ush); - } - } - - INLINE ap_fixed_base operator>>(int sh) const { - bool isNeg = sh < 0; - unsigned int ush = isNeg ? -sh : sh; - if (isNeg) { - return operator<<(ush); - } else { - return operator>>(ush); - } - } - - // left and right shift for ap_int. - template - INLINE ap_fixed_base operator<<(const ap_int_base<_AP_W2, true>& op2) const { - // TODO the code seems not optimal. ap_fixed<8,8> << ap_int<2> needs only a - // small mux, but integer need a big one! - int sh = op2.to_int(); - return operator<<(sh); - } - - template - INLINE ap_fixed_base operator>>(const ap_int_base<_AP_W2, true>& op2) const { - int sh = op2.to_int(); - return operator>>(sh); - } - - // left and right shift for ap_uint. - template - INLINE ap_fixed_base operator<<(const ap_int_base<_AP_W2, false>& op2) const { - unsigned int sh = op2.to_uint(); - return operator<<(sh); - } - - template - INLINE ap_fixed_base operator>>(const ap_int_base<_AP_W2, false>& op2) const { - unsigned int sh = op2.to_uint(); - return operator>>(sh); - } - - // left and right shift for ap_fixed - template - INLINE ap_fixed_base operator<<( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& - op2) { - return operator<<(op2.to_ap_int_base()); - } - - template - INLINE ap_fixed_base operator>>( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& - op2) { - return operator>>(op2.to_ap_int_base()); - } - - // Shift assign. - // ------------------------------------------------------------------------- - - // left shift assign. - INLINE ap_fixed_base& operator<<=(const int sh) { - *this = operator<<(sh); - return *this; - } - - INLINE ap_fixed_base& operator<<=(const unsigned int sh) { - *this = operator<<(sh); - return *this; - } - - template - INLINE ap_fixed_base& operator<<=(const ap_int_base<_AP_W2, _AP_S2>& sh) { - *this = operator<<(sh.to_int()); - return *this; - } - - template - INLINE ap_fixed_base& operator<<=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& - sh) { - *this = operator<<(sh.to_int()); - return *this; - } - - // right shift assign. - INLINE ap_fixed_base& operator>>=(const int sh) { - *this = operator>>(sh); - return *this; - } - - INLINE ap_fixed_base& operator>>=(const unsigned int sh) { - *this = operator>>(sh); - return *this; - } - - template - INLINE ap_fixed_base& operator>>=(const ap_int_base<_AP_W2, _AP_S2>& sh) { - *this = operator>>(sh.to_int()); - return *this; - } - - template - INLINE ap_fixed_base& operator>>=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& - sh) { - *this = operator>>(sh.to_int()); - return *this; - } - -// Comparisons. 
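As the comments above note, operator<< and the shift assigns keep the type of the first operand, so left shifts can silently drop MSBs unless __SC_COMPATIBLE__ enables the overflow_adjust path, whereas lshift<N>()/rshift<N>() only relabel the binary point. A small sketch with hypothetical values:

#include "ap_fixed.h"

void shift_sketch() {
  ap_fixed<8, 4> x = 1.75;
  ap_fixed<8, 4> a = x << 2;        // same type; overflow is not adjusted
  ap_fixed<8, 6> b = x.lshift<2>(); // same bits, point moved: exactly 7.0
  (void)a; (void)b;
}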
-// ------------------------------------------------------------------------- -#define OP_CMP_AF(Sym) \ - template \ - INLINE bool operator Sym(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, \ - _AP_O2, _AP_N2>& op2) const { \ - enum { _AP_F = _AP_W - _AP_I, F2 = _AP_W2 - _AP_I2 }; \ - if (_AP_F == F2) \ - return Base::V Sym op2.V; \ - else if (_AP_F > F2) \ - return Base::V Sym ap_fixed_base(op2).V; \ - else \ - return ap_fixed_base(*this).V Sym op2.V; \ - return false; \ - } - - OP_CMP_AF(>) - OP_CMP_AF(<) - OP_CMP_AF(>=) - OP_CMP_AF(<=) - OP_CMP_AF(==) - OP_CMP_AF(!=) -// FIXME: Move compare with double out of struct ap_fixed_base defination -// and combine it with compare operator(double, ap_fixed_base) -#define DOUBLE_CMP_AF(Sym) \ - INLINE bool operator Sym(double d) const { return to_double() Sym d; } - - DOUBLE_CMP_AF(>) - DOUBLE_CMP_AF(<) - DOUBLE_CMP_AF(>=) - DOUBLE_CMP_AF(<=) - DOUBLE_CMP_AF(==) - DOUBLE_CMP_AF(!=) - - // Bit and Slice Select - INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[]( - unsigned index) { - _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); - return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index); - } - - template - INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[]( - const ap_int_base<_AP_W2, _AP_S2>& index) { - _AP_WARNING(index < 0, "Attempting to read bit with negative index"); - _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); - return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, - index.to_int()); - } - - INLINE bool operator[](unsigned index) const { - _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); - return _AP_ROOT_op_get_bit(const_cast(this)->V, index); - } - - INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit( - unsigned index) { - _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); - return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index); - } - - template - INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit( - const ap_int_base<_AP_W2, _AP_S2>& index) { - _AP_WARNING(index < 0, "Attempting to read bit with negative index"); - _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); - return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, - index.to_int()); - } - - INLINE bool bit(unsigned index) const { - _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); - return _AP_ROOT_op_get_bit(const_cast(this)->V, index); - } - - template - INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit( - const ap_int_base<_AP_W2, true>& index) { - _AP_WARNING(index < _AP_I - _AP_W, - "Attempting to read bit with negative index"); - _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); - return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( - this, index.to_int() + _AP_W - _AP_I); - } - - INLINE bool get_bit(int index) const { - _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); - _AP_WARNING(index < _AP_I - _AP_W, "Attempting to read bit beyond MSB"); - return _AP_ROOT_op_get_bit(const_cast(this)->V, - index + _AP_W - _AP_I); - } -#if 0 - INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit( - int index) { - _AP_WARNING(index < _AP_I - _AP_W, - "Attempting to read bit with negative index"); - _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); - return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( - this, index + _AP_W - _AP_I); - } -#endif - - template - 
INLINE bool get_bit(const ap_int_base<_AP_W2, true>& index) const { - _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); - _AP_WARNING(index < _AP_I - _AP_W, "Attempting to read bit beyond MSB"); - return _AP_ROOT_op_get_bit(const_cast(this)->V, - index.to_int() + _AP_W - _AP_I); - } - - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range(int Hi, - int Lo) { - _AP_WARNING((Hi >= _AP_W) || (Lo >= _AP_W), "Out of bounds in range()"); - return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, Hi, Lo); - } - - // This is a must to strip constness to produce reference type. - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( - int Hi, int Lo) const { - _AP_WARNING((Hi >= _AP_W) || (Lo >= _AP_W), "Out of bounds in range()"); - return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( - const_cast(this), Hi, Lo); - } - - template - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - template - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() { - return this->range(_AP_W - 1, 0); - } - - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() const { - return this->range(_AP_W - 1, 0); - } - - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( - int Hi, int Lo) { - return this->range(Hi, Lo); - } - - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( - int Hi, int Lo) const { - return this->range(Hi, Lo); - } - - template - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - template - INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - INLINE bool is_zero() const { return Base::V == 0; } - - INLINE bool is_neg() const { - if (_AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)) return true; - return false; - } - - INLINE int wl() const { return _AP_W; } - - INLINE int iwl() const { return _AP_I; } - - INLINE ap_q_mode q_mode() const { return _AP_Q; } - - INLINE ap_o_mode o_mode() const { return _AP_O; } - - INLINE int n_bits() const { return _AP_N; } - - // print a string representation of this number in the given radix. - // Radix support is 2, 8, 10, or 16. - // The result will include a prefix indicating the radix, except for decimal, - // where no prefix is needed. The default is to output a signed representation - // of signed numbers, or an unsigned representation of unsigned numbers. For - // non-decimal formats, this can be changed by the 'sign' argument. 
-#ifndef __SYNTHESIS__ - std::string to_string(unsigned char radix = 2, bool sign = _AP_S) const { - // XXX in autosim/autowrap.tcl "(${name}).to_string(2).c_str()" is used to - // initialize sc_lv, which seems incapable of handling format "-0b". - if (radix == 2) sign = false; - - std::string str; - str.clear(); - char step = 0; - bool isNeg = sign && (Base::V < 0); - - // Extend to take care of the -MAX case. - ap_fixed_base<_AP_W + 1, _AP_I + 1> tmp(*this); - if (isNeg) { - tmp = -tmp; - str += '-'; - } - std::string prefix; - switch (radix) { - case 2: - prefix = "0b"; - step = 1; - break; - case 8: - prefix = "0o"; - step = 3; - break; - case 16: - prefix = "0x"; - step = 4; - break; - default: - break; - } - - if (_AP_I > 0) { - // Note we drop the quantization and rounding flags here. The - // integer part is always in range, and the fractional part we - // want to drop. Also, the number is always positive, because - // of the absolute value above. - ap_int_base int_part; - // [1] [ I ] d [ W - I ] - // | | | - // | W-I 0 - // W - int_part.V = _AP_ROOT_op_get_range( - tmp.V, _AP_W - _AP_I, _AP_W); - str += int_part.to_string(radix, false); - } else { - str += prefix; - str += '0'; - } - - ap_fixed_base frac_part = tmp; - - if (radix == 10) { - if (frac_part != 0) { - str += "."; - while (frac_part != 0) { - char digit = (frac_part * radix).to_char(); - str += static_cast(digit + '0'); - frac_part *= radix; - } - } - } else { - if (frac_part != 0) { - str += "."; - for (signed i = _AP_W - _AP_I - 1; i >= 0; i -= step) { - char digit = frac_part.range(i, AP_MAX(0, i - step + 1)).to_char(); - // If we have a partial bit pattern at the end, then we need - // to put it in the high-order bits of 'digit'. - int offset = AP_MIN(0, i - step + 1); - digit <<= -offset; - str += digit < 10 ? static_cast(digit + '0') - : static_cast(digit - 10 + 'a'); - } - if (radix == 16) - str += "p0"; // C99 Hex constants are required to have an exponent. - } - } - return str; - } -#else - // XXX HLS will delete this in synthesis - INLINE char* to_string(unsigned char radix = 2, bool sign = _AP_S) const { - return 0; - } -#endif -}; // struct ap_fixed_base. 
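The OP_CMP_AF comparison macros in the struct above align the two operands' binary points before comparing, so numerically equal values stored in different formats compare equal, and to_string(10) renders the exact decimal expansion of the stored value. A small host-side sketch (illustrative only; a[2] reading the 0.25-place bit assumes the <8,4> layout shown):

#include <ap_fixed.h>
#include <iostream>

int main() {
  ap_fixed<8, 4> a = 1.25;   // 4 fractional bits
  ap_fixed<12, 4> b = 1.25;  // 8 fractional bits, different raw layout
  std::cout << (a == b) << '\n';        // 1: binary points are aligned first
  std::cout << a.to_string(10) << '\n'; // "1.25", exact decimal expansion
  bool quarter = a[2];                  // bit 2 is the 0.25 place in this format
  std::cout << quarter << '\n';         // 1
  return 0;
}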
- -template -INLINE void b_not( - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { - ret.V = ~op.V; -} - -template -INLINE void b_and( - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - ret.V = op1.V & op2.V; -} - -template -INLINE void b_or( - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - ret.V = op1.V | op2.V; -} - -template -INLINE void b_xor( - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - ret.V = op1.V ^ op2.V; -} - -template -INLINE void neg( - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - ap_fixed_base<_AP_W2 + !_AP_S2, _AP_I2 + !_AP_S2, true, _AP_Q2, _AP_O2, - _AP_N2> - t; - t.V = -op.V; - ret = t; -} - -template -INLINE void lshift( - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op, - int i) { - enum { - F2 = _AP_W2 - _AP_I2, - _AP_I3 = AP_MAX(_AP_I, _AP_I2), - _AP_W3 = _AP_I3 + F2, - }; - // wide buffer - ap_fixed_base<_AP_W3, _AP_I3, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t; - t.V = op.V; - t.V <<= i; // FIXME overflow? - // handle quantization and overflow - ret = t; -} - -template -INLINE void rshift( - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op, - int i) { - enum { - F = _AP_W - _AP_I, - F2 = _AP_W2 - _AP_I2, - F3 = AP_MAX(F, F2), - _AP_W3 = _AP_I2 + F3, - sh = F - F2, - }; - // wide buffer - ap_fixed_base<_AP_W3, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t; - t.V = op.V; - if (sh >= 0) - t.V <<= (int) sh; - t.V >>= i; - // handle quantization and overflow - ret = t; -} - -//// FIXME -//// These partial specialization ctors allow code like -//// char c = 'a'; -//// ap_fixed_base<8, 8, true> x(c); -//// but what bout ap_fixed_base<9, 9, true> y(c) ? 
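The free helpers above (b_not, b_and, b_or, b_xor, neg, lshift, rshift) write through an output reference: the bitwise ones operate on the raw words of identically formatted operands, while lshift/rshift compute in a widened buffer and then assign, so the destination's quantization and overflow modes are applied on the way out. A host-side sketch (illustrative only):

#include <ap_fixed.h>
#include <cstdio>

int main() {
  ap_fixed<8, 4> a = 1.5, b = 2.25, r, s;
  b_or(r, a, b);   // raw-word OR: 0001.1000 | 0010.0100 == 0011.1100 == 3.75
  lshift(s, a, 1); // computed in a wide buffer, then assigned back: 3.0
  printf("%g %g\n", r.to_double(), s.to_double());
  return 0;
}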
-// - -#ifndef __SYNTHESIS__ -INLINE std::string scientificFormat(std::string& input) { - if (input.length() == 0) return input; - - size_t decPosition = input.find('.'); - if (decPosition == std::string::npos) decPosition = input.length(); - - size_t firstNonZeroPos = 0; - for (; input[firstNonZeroPos] > '9' || input[firstNonZeroPos] < '1'; - firstNonZeroPos++) - ; - - int exp; - if (firstNonZeroPos > decPosition) - exp = decPosition - firstNonZeroPos; - else - exp = decPosition - firstNonZeroPos - 1; - std::string expString = ""; - if (exp == 0) - ; - else if (exp < 0) { - expString += "e-"; - exp = -exp; - } else - expString += "e+"; - - if (exp < 10 && exp > 0) { - expString += '0'; - expString += (char)('0' + exp); - } else if (exp != 0) { - std::string tmp; - - std::ostringstream oss; - oss << exp; - - tmp = oss.str(); - expString += tmp; - } - - int lastNonZeroPos = (int)(input.length() - 1); - for (; lastNonZeroPos >= 0; --lastNonZeroPos) - if (input[lastNonZeroPos] <= '9' && input[lastNonZeroPos] > '0') break; - - std::string ans = ""; - ans += input[firstNonZeroPos]; - if (firstNonZeroPos != (size_t)lastNonZeroPos) { - ans += '.'; - for (int i = firstNonZeroPos + 1; i <= lastNonZeroPos; i++) - if (input[i] != '.') ans += input[i]; - } - - ans += expString; - return ans; -} - -INLINE std::string reduceToPrecision(std::string& input, int precision) { - bool isZero = true; - size_t inputLen = input.length(); - for (size_t i = 0; i < inputLen && isZero; i++) - if (input[i] != '.' && input[i] != '0') isZero = false; - if (isZero) return "0"; - - // Find the first valid number, skip '-' - int FirstNonZeroPos = 0; - int LastNonZeroPos = (int)inputLen - 1; - int truncBitPosition = 0; - size_t decPosition = input.find('.'); - for (; input[FirstNonZeroPos] < '1' || input[FirstNonZeroPos] > '9'; - FirstNonZeroPos++) - ; - - for (; input[LastNonZeroPos] < '1' || input[LastNonZeroPos] > '9'; - LastNonZeroPos--) - ; - - if (decPosition == std::string::npos) decPosition = inputLen; - // Count the valid number, to decide whether we need to truncate - if ((int)decPosition > LastNonZeroPos) { - if (LastNonZeroPos - FirstNonZeroPos + 1 <= precision) return input; - truncBitPosition = FirstNonZeroPos + precision; - } else if ((int)decPosition < FirstNonZeroPos) { // This is pure decimal - if (LastNonZeroPos - FirstNonZeroPos + 1 <= precision) { - if (FirstNonZeroPos - decPosition - 1 < 4) { - return input; - } else { - if (input[0] == '-') { - std::string tmp = input.substr(1, inputLen - 1); - return std::string("-") + scientificFormat(tmp); - } else - return scientificFormat(input); - } - } - truncBitPosition = FirstNonZeroPos + precision; - } else { - if (LastNonZeroPos - FirstNonZeroPos <= precision) return input; - truncBitPosition = FirstNonZeroPos + precision + 1; - } - - // duplicate the input string, we want to add "0" before the valid numbers - // This is easy for quantization, since we may change 9999 to 10000 - std::string ans = ""; - std::string dupInput = "0"; - if (input[0] == '-') { - ans += '-'; - dupInput += input.substr(1, inputLen - 1); - } else { - dupInput += input.substr(0, inputLen); - ++truncBitPosition; - } - - // Add 'carry' after truncation, if necessary - bool carry = dupInput[truncBitPosition] > '4'; - for (int i = truncBitPosition - 1; i >= 0 && carry; i--) { - if (dupInput[i] == '.') continue; - if (dupInput[i] == '9') - dupInput[i] = '0'; - else { - ++dupInput[i]; - carry = false; - } - } - - // bits outside precision range should be set to 0 - if (dupInput[0] == '1') 
- FirstNonZeroPos = 0; - else { - FirstNonZeroPos = 0; - while (dupInput[FirstNonZeroPos] < '1' || dupInput[FirstNonZeroPos] > '9') - ++FirstNonZeroPos; - } - - unsigned it = FirstNonZeroPos; - int NValidNumber = 0; - while (it < dupInput.length()) { - if (dupInput[it] == '.') { - ++it; - continue; - } - ++NValidNumber; - if (NValidNumber > precision) dupInput[it] = '0'; - ++it; - } - - // Here we wanted to adjust the truncate position and the value - decPosition = dupInput.find('.'); - if (decPosition == std::string::npos) // When this is integer - truncBitPosition = (int)dupInput.length(); - else - for (truncBitPosition = (int)(dupInput.length() - 1); truncBitPosition >= 0; - --truncBitPosition) { - if (dupInput[truncBitPosition] == '.') break; - if (dupInput[truncBitPosition] != '0') { - truncBitPosition++; - break; - } - } - - if (dupInput[0] == '1') - dupInput = dupInput.substr(0, truncBitPosition); - else - dupInput = dupInput.substr(1, truncBitPosition - 1); - - decPosition = dupInput.find('.'); - if (decPosition != std::string::npos) { - size_t it = 0; - for (it = decPosition + 1; dupInput[it] == '0'; it++) - ; - if (it - decPosition - 1 < 4) { - ans += dupInput; - return ans; - } else { - ans += scientificFormat(dupInput); - return ans; - } - } else if ((int)(dupInput.length()) <= precision) { - ans += dupInput; - return ans; - } - - ans += scientificFormat(dupInput); - return ans; -} - -template -INLINE void print( - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { - if (_AP_I > 0) { - ap_int_base<_AP_I, _AP_S> p1; - p1.V = x.V >> (_AP_W - _AP_I); - print(p1.V); // print overlaod for .V should exit - } else { - printf("0"); - } - printf("."); - if (_AP_I < _AP_W) { - ap_int_base<_AP_W - _AP_I, false> p2; - p2.V = _AP_ROOT_op_get_range(x.V, 0, _AP_W - _AP_I); - print(p2.V, false); // print overlaod for .V should exit - } -} -#endif // ifndef __SYNTHESIS__ - -// XXX the following two functions have to exist in synthesis, -// as some old HLS Video Library code uses the ostream overload, -// although HLS will later delete I/O function call. 
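These simulation-only helpers feed the ostream inserter defined next: a value is first rendered with to_string(10), reduceToPrecision() then trims it to the stream's precision (significant digits, switching to scientific notation for very small or very long values), and the result is padded to the stream's width with the fill character. For example (illustrative only; 3.140625 is exactly representable in ap_fixed<16,8>):

#include <ap_fixed.h>
#include <iomanip>
#include <iostream>

int main() {
  ap_fixed<16, 8> x = 3.140625; // needs 6 fractional bits, 8 available
  std::cout << std::setprecision(4) << x << '\n';               // 3.141
  std::cout << std::setw(10) << std::setfill('*') << x << '\n'; // *****3.141
  return 0;
}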
- -/// Output streaming -//----------------------------------------------------------------------------- -// XXX apcc cannot handle global std::ios_base::Init() brought in by -#ifndef AP_AUTOCC -#ifndef __SYNTHESIS__ -template -INLINE std::ostream& operator<<( - std::ostream& out, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { - // TODO support std::ios_base::fmtflags - unsigned width = out.width(); - unsigned precision = out.precision(); - char fill = out.fill(); - std::string str = x.to_string(10, _AP_S); - str = reduceToPrecision(str, precision); - if (width > str.length()) { - for (unsigned i = 0; i < width - str.length(); ++i) - out << fill; - } - out << str; - return out; -} -#endif // ifndef __SYNTHESIS__ - -/// Input streaming -// ----------------------------------------------------------------------------- -#ifndef __SYNTHESIS__ -template -INLINE std::istream& operator>>( - std::istream& in, - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { - double d; - in >> d; - x = ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(d); - return in; -} -#endif -#endif // ifndef AP_AUTOCC - -/// Operators mixing Integers with ap_fixed_base -// ----------------------------------------------------------------------------- -#define AF_BIN_OP_WITH_INT_SF(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ - template \ - INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ - _AP_W2, _AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP( \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE i_op) { \ - return op.operator BIN_OP(ap_int_base<_AP_W2, _AP_S2>(i_op)); \ - } - -#define AF_BIN_OP_WITH_INT(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ - template \ - INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ - _AP_W2, _AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP( \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE i_op) { \ - return op.operator BIN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ - } \ - template \ - INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ - _AP_W2, _AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP( \ - C_TYPE i_op, \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator BIN_OP(op); \ - } - -#define AF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE bool operator REL_OP( \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE i_op) { \ - return op.operator REL_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ - } \ - template \ - INLINE bool operator REL_OP( \ - C_TYPE i_op, \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator REL_OP(op); \ - } - -#define AF_ASSIGN_OP_WITH_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ - operator ASSIGN_OP( \ - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE i_op) { \ - return op.operator ASSIGN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ - } - -#define AF_ASSIGN_OP_WITH_INT_SF(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ - operator ASSIGN_OP( \ - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE i_op) { \ - return op.operator ASSIGN_OP(ap_int_base<_AP_W2, _AP_S2>(i_op)); \ - } - -#define 
ALL_AF_OP_WITH_INT(C_TYPE, BITS, SIGN) \ - AF_BIN_OP_WITH_INT(+, C_TYPE, (BITS), (SIGN), plus) \ - AF_BIN_OP_WITH_INT(-, C_TYPE, (BITS), (SIGN), minus) \ - AF_BIN_OP_WITH_INT(*, C_TYPE, (BITS), (SIGN), mult) \ - AF_BIN_OP_WITH_INT(/, C_TYPE, (BITS), (SIGN), div) \ - AF_BIN_OP_WITH_INT(&, C_TYPE, (BITS), (SIGN), logic) \ - AF_BIN_OP_WITH_INT(|, C_TYPE, (BITS), (SIGN), logic) \ - AF_BIN_OP_WITH_INT(^, C_TYPE, (BITS), (SIGN), logic) \ - AF_BIN_OP_WITH_INT_SF(>>, C_TYPE, (BITS), (SIGN), lhs) \ - AF_BIN_OP_WITH_INT_SF(<<, C_TYPE, (BITS), (SIGN), lhs) \ - \ - AF_ASSIGN_OP_WITH_INT(+=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT(-=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT(*=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT(/=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT(&=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT(|=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT(^=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT_SF(>>=, C_TYPE, (BITS), (SIGN)) \ - AF_ASSIGN_OP_WITH_INT_SF(<<=, C_TYPE, (BITS), (SIGN)) \ - \ - AF_REL_OP_WITH_INT(>, C_TYPE, (BITS), (SIGN)) \ - AF_REL_OP_WITH_INT(<, C_TYPE, (BITS), (SIGN)) \ - AF_REL_OP_WITH_INT(>=, C_TYPE, (BITS), (SIGN)) \ - AF_REL_OP_WITH_INT(<=, C_TYPE, (BITS), (SIGN)) \ - AF_REL_OP_WITH_INT(==, C_TYPE, (BITS), (SIGN)) \ - AF_REL_OP_WITH_INT(!=, C_TYPE, (BITS), (SIGN)) - -ALL_AF_OP_WITH_INT(bool, 1, false) -ALL_AF_OP_WITH_INT(char, 8, CHAR_IS_SIGNED) -ALL_AF_OP_WITH_INT(signed char, 8, true) -ALL_AF_OP_WITH_INT(unsigned char, 8, false) -ALL_AF_OP_WITH_INT(short, _AP_SIZE_short, true) -ALL_AF_OP_WITH_INT(unsigned short, _AP_SIZE_short, false) -ALL_AF_OP_WITH_INT(int, _AP_SIZE_int, true) -ALL_AF_OP_WITH_INT(unsigned int, _AP_SIZE_int, false) -ALL_AF_OP_WITH_INT(long, _AP_SIZE_long, true) -ALL_AF_OP_WITH_INT(unsigned long, _AP_SIZE_long, false) -ALL_AF_OP_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) -ALL_AF_OP_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef ALL_AF_OP_WITH_INT -#undef AF_BIN_OP_WITH_INT -#undef AF_BIN_OP_WITH_INT_SF -#undef AF_ASSIGN_OP_WITH_INT -#undef AF_ASSIGN_OP_WITH_INT_SF -#undef AF_REL_OP_WITH_INT - -/* - * ********************************************************************** - * TODO - * There is no operator defined with float/double/long double, so that - * code like - * ap_fixed<8,4> a = 1.5f; - * a += 0.5f; - * will fail in compilation. - * Operator with warning about conversion might be wanted. 
- * ********************************************************************** - */ - -#define AF_BIN_OP_WITH_AP_INT(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< \ - _AP_W, _AP_I, _AP_S>::RTYPE \ - operator BIN_OP( \ - const ap_int_base<_AP_W2, _AP_S2>& i_op, \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator BIN_OP(op); \ - } \ - \ - template \ - INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ - _AP_W2, _AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP( \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ - return op.operator BIN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ - } - -#define AF_REL_OP_WITH_AP_INT(REL_OP) \ - template \ - INLINE bool operator REL_OP( \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ - return op.operator REL_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ - } \ - \ - template \ - INLINE bool operator REL_OP( \ - const ap_int_base<_AP_W2, _AP_S2>& i_op, \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator REL_OP(op); \ - } - -#define AF_ASSIGN_OP_WITH_AP_INT(ASSIGN_OP) \ - template \ - INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ - operator ASSIGN_OP( \ - ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ - return op.operator ASSIGN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ - } \ - \ - template \ - INLINE ap_int_base<_AP_W2, _AP_S2>& operator ASSIGN_OP( \ - ap_int_base<_AP_W2, _AP_S2>& i_op, \ - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return i_op.operator ASSIGN_OP(op.to_ap_int_base()); \ - } - -AF_BIN_OP_WITH_AP_INT(+, plus) -AF_BIN_OP_WITH_AP_INT(-, minus) -AF_BIN_OP_WITH_AP_INT(*, mult) -AF_BIN_OP_WITH_AP_INT(/, div) -AF_BIN_OP_WITH_AP_INT(&, logic) -AF_BIN_OP_WITH_AP_INT(|, logic) -AF_BIN_OP_WITH_AP_INT(^, logic) - -#undef AF_BIN_OP_WITH_AP_INT - -AF_ASSIGN_OP_WITH_AP_INT(+=) -AF_ASSIGN_OP_WITH_AP_INT(-=) -AF_ASSIGN_OP_WITH_AP_INT(*=) -AF_ASSIGN_OP_WITH_AP_INT(/=) -AF_ASSIGN_OP_WITH_AP_INT(&=) -AF_ASSIGN_OP_WITH_AP_INT(|=) -AF_ASSIGN_OP_WITH_AP_INT(^=) - -#undef AF_ASSIGN_OP_WITH_AP_INT - -AF_REL_OP_WITH_AP_INT(==) -AF_REL_OP_WITH_AP_INT(!=) -AF_REL_OP_WITH_AP_INT(>) -AF_REL_OP_WITH_AP_INT(>=) -AF_REL_OP_WITH_AP_INT(<) -AF_REL_OP_WITH_AP_INT(<=) - -#undef AF_REL_OP_WITH_AP_INT - -// Relational Operators with double -template -INLINE bool operator==( - double op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - return op2.operator==(op1); -} - -template -INLINE bool operator!=( - double op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - return op2.operator!=(op1); -} - -template -INLINE bool operator>( - double op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - return op2.operator<(op1); -} - -template -INLINE bool operator>=( - double op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - return op2.operator<=(op1); -} - -template -INLINE bool operator<( - double op1, - const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { - return op2.operator>(op1); -} - -template -INLINE bool operator<=( - double op1, - const ap_fixed_base<_AP_W, _AP_I, 
 _AP_S, _AP_Q, _AP_O, _AP_N>& op2) {
-  return op2.operator>=(op1);
-}
-
-#endif // ifndef __cplusplus else
-
-#endif // ifndef __AP_FIXED_BASE_H__ else
-
-// -*- cpp -*-
+/*
+ * Copyright 2011-2019 Xilinx, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __AP_FIXED_BASE_H__
+#define __AP_FIXED_BASE_H__
+
+#ifndef __AP_FIXED_H__
+#error "Only ap_fixed.h and ap_int.h can be included directly in user code."
+#endif
+
+// for ap_int_base and its reference types.
+#include <etc/ap_int_base.h>
+#ifndef __SYNTHESIS__
+#if _AP_ENABLE_HALF_ == 1
+// for half type
+#include <hls_half.h>
+#endif
+// for std io
+#include <iostream>
+#endif
+
+#ifndef __cplusplus
+#error "C++ is required to include this header file"
+#else // __cplusplus
+
+// for warning on unsupported rounding mode in conversion to float/double.
+#if !defined(__SYNTHESIS__) && __cplusplus >= 201103L && \
+    (defined(__gnu_linux__) || defined(_WIN32))
+#define AP_FIXED_ENABLE_CPP_FENV 1
+#include <cfenv>
+#endif
+
+// ----------------------------------------------------------------------
+
+/* Major TODO
+    long double support: constructor, assign and other operators.
+    binary operators with ap_fixed_base and const char*.
+    return ap_fixed/ap_ufixed when result signedness is known.
+*/
+
+// Helper function in conversion to floating point types.
+
+#ifdef __SYNTHESIS__
+#define _AP_ctype_op_get_bit(var, index) _AP_ROOT_op_get_bit(var, index)
+#define _AP_ctype_op_set_bit(var, index, x) _AP_ROOT_op_set_bit(var, index, x)
+#define _AP_ctype_op_get_range(var, low, high) \
+  _AP_ROOT_op_get_range(var, low, high)
+#define _AP_ctype_op_set_range(var, low, high, x) \
+  _AP_ROOT_op_set_range(var, low, high, x)
+#else // ifdef __SYNTHESIS__
+template <typename _Tp1, typename _Tp2>
+inline bool _AP_ctype_op_get_bit(_Tp1& var, const _Tp2& index) {
+  return !!(var & (1ull << (index)));
+}
+template <typename _Tp1, typename _Tp2, typename _Tp3>
+inline _Tp1 _AP_ctype_op_set_bit(_Tp1& var, const _Tp2& index, const _Tp3& x) {
+  var |= (((x) ? 1ull : 0ull) << (index));
+  return var;
+}
+template <typename _Tp1, typename _Tp2, typename _Tp3>
+inline _Tp1 _AP_ctype_op_get_range(_Tp1& var, const _Tp2& low,
+                                   const _Tp3& high) {
+  _Tp1 r = var;
+  ap_ulong mask = -1ll;
+  mask >>= (sizeof(_Tp1) * 8 - ((high) - (low) + 1));
+  r >>= (low);
+  r &= mask;
+  return r;
+}
+template <typename _Tp1, typename _Tp2, typename _Tp3, typename _Tp4>
+inline _Tp1 _AP_ctype_op_set_range(_Tp1& var, const _Tp2& low, const _Tp3& high,
+                                   const _Tp4& x) {
+  ap_ulong mask = -1ll;
+  mask >>= (_AP_SIZE_ap_slong - ((high) - (low) + 1));
+  var &= ~(mask << (low));
+  var |= ((mask & x) << (low));
+  return var;
+}
+#endif // ifdef __SYNTHESIS__
+
+
+// trait for letting base class to return derived class.
+// Notice that derived class template is incomplete, and we cannot use
+// the member of the derived class.
+template <int _AP_W2, int _AP_I2, bool _AP_S2>
+struct _ap_fixed_factory;
+template <int _AP_W2, int _AP_I2>
+struct _ap_fixed_factory<_AP_W2, _AP_I2, true> {
+  typedef ap_fixed<_AP_W2, _AP_I2> type;
+};
+template <int _AP_W2, int _AP_I2>
+struct _ap_fixed_factory<_AP_W2, _AP_I2, false> {
+  typedef ap_ufixed<_AP_W2, _AP_I2> type;
+};
+
+/// ap_fixed_base: AutoPilot fixed point.
+/** partial specialization of signed.
+  @tparam _AP_W width.
+  @tparam _AP_I integral part width.
+  @tparam _AP_S signed.
+  @tparam _AP_Q quantization mode. Default is AP_TRN.
+  @tparam _AP_O saturation mode. Default is AP_WRAP.
+  @tparam _AP_N saturation wrap value. Default is 0.
+ */
+// default for _AP_Q, _AP_O and _AP_N set in ap_decl.h
+template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
+          int _AP_N>
+struct ap_fixed_base : _AP_ROOT_TYPE<_AP_W, _AP_S> {
+ public:
+  typedef _AP_ROOT_TYPE<_AP_W, _AP_S> Base;
+  static const int width = _AP_W;
+  static const int iwidth = _AP_I;
+  static const ap_q_mode qmode = _AP_Q;
+  static const ap_o_mode omode = _AP_O;
+
+  /// Return type trait.
+  template <int _AP_W2, int _AP_I2, bool _AP_S2>
+  struct RType {
+    enum {
+      _AP_F = _AP_W - _AP_I,
+      F2 = _AP_W2 - _AP_I2,
+      mult_w = _AP_W + _AP_W2,
+      mult_i = _AP_I + _AP_I2,
+      mult_s = _AP_S || _AP_S2,
+      plus_w = AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) +
+               1 + AP_MAX(_AP_F, F2),
+      plus_i =
+          AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1,
+      plus_s = _AP_S || _AP_S2,
+      minus_w =
+          AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1 +
+          AP_MAX(_AP_F, F2),
+      minus_i =
+          AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) + 1,
+      minus_s = true,
+#ifndef __SC_COMPATIBLE__
+      div_w = _AP_S2 + _AP_W + AP_MAX(F2, 0),
+#else
+      div_w = _AP_S2 + _AP_W + AP_MAX(F2, 0) + AP_MAX(_AP_I2, 0),
+#endif
+      div_i = _AP_S2 + _AP_I + F2,
+      div_s = _AP_S || _AP_S2,
+      logic_w =
+          AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)) +
+          AP_MAX(_AP_F, F2),
+      logic_i = AP_MAX(_AP_I + (_AP_S2 && !_AP_S), _AP_I2 + (_AP_S && !_AP_S2)),
+      logic_s = _AP_S || _AP_S2
+    };
+
+    typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> lhs;
+    typedef ap_fixed_base<_AP_W2, _AP_I2, _AP_S2> rhs;
+
+    typedef ap_fixed_base<mult_w, mult_i, mult_s> mult_base;
+    typedef ap_fixed_base<plus_w, plus_i, plus_s> plus_base;
+    typedef ap_fixed_base<minus_w, minus_i, minus_s> minus_base;
+    typedef ap_fixed_base<logic_w, logic_i, logic_s> logic_base;
+    typedef ap_fixed_base<div_w, div_i, div_s> div_base;
+    typedef ap_fixed_base<_AP_W, _AP_I, _AP_S> arg1_base;
+
+    typedef typename _ap_fixed_factory<mult_w, mult_i, mult_s>::type mult;
+    typedef typename _ap_fixed_factory<plus_w, plus_i, plus_s>::type plus;
+    typedef typename _ap_fixed_factory<minus_w, minus_i, minus_s>::type minus;
+    typedef typename _ap_fixed_factory<logic_w, logic_i, logic_s>::type logic;
+    typedef typename _ap_fixed_factory<div_w, div_i, div_s>::type div;
+    typedef typename _ap_fixed_factory<_AP_W, _AP_I, _AP_S>::type arg1;
+  };
+
+ private:
+#ifndef __SYNTHESIS__
+  // This cannot handle hex float format string.
+  void fromString(const std::string& val, unsigned char radix) {
+    _AP_ERROR(!(radix == 2 || radix == 8 || radix == 10 || radix == 16),
+              "ap_fixed_base::fromString(%s, %d)", val.c_str(), radix);
+
+    Base::V = 0;
+    int startPos = 0;
+    int endPos = val.length();
+    int decPos = val.find(".");
+    if (decPos == -1) decPos = endPos;
+
+    // handle sign
+    bool isNegative = false;
+    if (val[0] == '-') {
+      isNegative = true;
+      ++startPos;
+    } else if (val[0] == '+')
+      ++startPos;
+
+    // If there are no integer bits, e.g.:
+    //   .0000XXXX, then keep at least one bit.
+    // If the width is greater than the number of integer bits, e.g.:
+    //   XXXX.XXXX, then we keep the integer bits
+    // if the number of integer bits is greater than the width, e.g.:
+    //   XXX000 then we keep the integer bits.
+    // Always keep one bit.
+    ap_fixed_base<AP_MAX(_AP_I, 4) + 4, AP_MAX(_AP_I, 4) + 4, false>
+        integer_bits = 0;
+
+    // Figure out if we can shift instead of multiply
+    unsigned shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0);
+
+    //std::cout << "\n\n" << val << "\n";
+    //std::cout << startPos << " " << decPos << " " << endPos << "\n";
+
+    bool sticky_int = false;
+
+    // Traverse the integer digits from the MSD, multiplying by radix as we go.
+    for (int i = startPos; i < decPos; i++) {
+      // Get a digit
+      char cdigit = val[i];
+      if (cdigit == '\0') continue;
+      unsigned digit = ap_private_ops::decode_digit(cdigit, radix);
+
+      sticky_int |= integer_bits[AP_MAX(_AP_I, 4) + 4 - 1] |
+                    integer_bits[AP_MAX(_AP_I, 4) + 4 - 2] |
+                    integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] |
+                    integer_bits[AP_MAX(_AP_I, 4) + 4 - 4];
+      // Shift or multiply the value by the radix
+      if (shift)
+        integer_bits <<= shift;
+      else
+        integer_bits *= radix;
+
+      // Add in the digit we just interpreted
+      integer_bits += digit;
+      //std::cout << "idigit = " << digit << " " << integer_bits.to_string()
+      //          << " " << sticky_int << "\n";
+    }
+    integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] =
+        integer_bits[AP_MAX(_AP_I, 4) + 4 - 3] | sticky_int;
+
+    ap_fixed_base fractional_bits = 0;
+    bool sticky = false;
+
+    // Traverse the fractional digits from the LSD, dividing by radix as we go.
+    for (int i = endPos - 1; i >= decPos + 1; i--) {
+      // Get a digit
+      char cdigit = val[i];
+      if (cdigit == '\0') continue;
+      unsigned digit = ap_private_ops::decode_digit(cdigit, radix);
+      // Add in the digit we just interpreted
+      fractional_bits += digit;
+
+      sticky |= fractional_bits[0] | fractional_bits[1] | fractional_bits[2] |
+                fractional_bits[3];
+      // Shift or divide the value by the radix
+      if (shift)
+        fractional_bits >>= shift;
+      else
+        fractional_bits /= radix;
+
+      //std::cout << "fdigit = " << digit << " " << fractional_bits.to_string()
+      //          << " " << sticky << "\n";
+    }
+
+    //std::cout << "Int =" << integer_bits.to_string() << " " <<
+    //   fractional_bits.to_string() << "\n";
+
+    fractional_bits[0] = fractional_bits[0] | sticky;
+
+    if (isNegative)
+      *this = -(integer_bits + fractional_bits);
+    else
+      *this = integer_bits + fractional_bits;
+
+    //std::cout << "end = " << this->to_string(16) << "\n";
+  }
+
+  /// report invalid construction of ap_fixed_base
+  INLINE void report() {
+    if (!_AP_S && _AP_O == AP_WRAP_SM) {
+      fprintf(stderr, "ap_ufixed<...> cannot support AP_WRAP_SM.\n");
+      exit(1);
+    }
+    if (_AP_W > MAX_MODE(AP_INT_MAX_W) * 1024) {
+      fprintf(stderr,
+              "[E] ap_%sfixed<%d, ...>: Bitwidth exceeds the "
+              "default max value %d. Please use macro "
+              "AP_INT_MAX_W to set a larger max value.\n",
+              _AP_S ? "" : "u", _AP_W, MAX_MODE(AP_INT_MAX_W) * 1024);
+      exit(1);
+    }
+  }
+#else
+  INLINE void report() {}
+#endif // ifdef __SYNTHESIS__
+
+  /// @name helper functions.
+ // @{ + INLINE void overflow_adjust(bool underflow, bool overflow, bool lD, + bool sign) { + if (!underflow && !overflow) return; + if (_AP_O == AP_WRAP) { + if (_AP_N == 0) return; + if (_AP_S) { + // signed AP_WRAP + // n_bits == 1 + Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, sign); + if (_AP_N > 1) { + // n_bits > 1 + ap_int_base<_AP_W, false> mask(-1); + if (sign) mask.V = 0; + Base::V = + _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 2, mask.V); + } + } else { + // unsigned AP_WRAP + ap_int_base<_AP_W, false> mask(-1); + Base::V = + _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 1, mask.V); + } + } else if (_AP_O == AP_SAT_ZERO) { + Base::V = 0; + } else if (_AP_O == AP_WRAP_SM && _AP_S) { + bool Ro = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + if (_AP_N == 0) { + if (lD != Ro) { + Base::V = ~Base::V; + Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, lD); + } + } else { + if (_AP_N == 1 && sign != Ro) { + Base::V = ~Base::V; + } else if (_AP_N > 1) { + bool lNo = _AP_ROOT_op_get_bit(Base::V, _AP_W - _AP_N); + if (lNo == sign) Base::V = ~Base::V; + ap_int_base<_AP_W, false> mask(-1); + if (sign) mask.V = 0; + Base::V = + _AP_ROOT_op_set_range(Base::V, _AP_W - _AP_N, _AP_W - 2, mask.V); + } + Base::V = _AP_ROOT_op_set_bit(Base::V, _AP_W - 1, sign); + } + } else { + if (_AP_S) { + if (overflow) { + Base::V = 1; + Base::V <<= _AP_W - 1; + Base::V = ~Base::V; + } else if (underflow) { + Base::V = 1; + Base::V <<= _AP_W - 1; + if (_AP_O == AP_SAT_SYM) Base::V |= 1; + } + } else { + if (overflow) + Base::V = ~(ap_int_base<_AP_W, false>(0).V); + else if (underflow) + Base::V = 0; + } + } + } + + INLINE bool quantization_adjust(bool qb, bool r, bool s) { + bool carry = (bool)_AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + if (_AP_Q == AP_TRN) return false; + if (_AP_Q == AP_RND_ZERO) + qb &= s || r; + else if (_AP_Q == AP_RND_MIN_INF) + qb &= r; + else if (_AP_Q == AP_RND_INF) + qb &= !s || r; + else if (_AP_Q == AP_RND_CONV) + qb &= _AP_ROOT_op_get_bit(Base::V, 0) || r; + else if (_AP_Q == AP_TRN_ZERO) + qb = s && (qb || r); + Base::V += qb; + return carry && (!(bool)_AP_ROOT_op_get_bit(Base::V, _AP_W - 1)); + } + // @} + + public: + /// @name constructors. + // @{ + /// default ctor. + INLINE ap_fixed_base() {} + + /// copy ctor. 
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  INLINE ap_fixed_base(
+      const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) {
+    operator=(op);
+    report();
+  }
+
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  INLINE ap_fixed_base(
+      const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) {
+    operator=(op);
+    report();
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_fixed_base(const ap_int_base<_AP_W2, _AP_S2>& op) {
+    ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp;
+    tmp.V = op.V;
+    operator=(tmp);
+    report();
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_fixed_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) {
+    ap_fixed_base<_AP_W2, _AP_W2, _AP_S2> tmp;
+    tmp.V = op.V;
+    operator=(tmp);
+    report();
+  }
+
+#ifndef __SYNTHESIS__
+#ifndef NON_C99STRING
+  INLINE ap_fixed_base(const char* s, signed char rd = 0) {
+    unsigned char radix = rd;
+    std::string str = ap_private_ops::parseString(s, radix); // will guess rd, default 10
+    _AP_ERROR(radix == 0, "ap_fixed_base(const char* \"%s\", %d), str=%s, radix = %d",
+              s, rd, str.c_str(), radix); // TODO remove this check
+    fromString(str, radix);
+  }
+#else
+  INLINE ap_fixed_base(const char* s, signed char rd = 10) {
+    ap_int_base<_AP_W, _AP_S> t(s, rd);
+    Base::V = t.V;
+  }
+#endif // ifndef NON_C99STRING
+#else // ifndef __SYNTHESIS__
+  // XXX _ssdm_string2bits only takes const string and const radix.
+  // It seems XFORM will do compile time processing of the string.
+  INLINE ap_fixed_base(const char* s) {
+    typeof(Base::V) t;
+    _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_I, _AP_S, _AP_Q,
+                      _AP_O, _AP_N, _AP_C99);
+    Base::V = t;
+  }
+  INLINE ap_fixed_base(const char* s, signed char rd) {
+    typeof(Base::V) t;
+    _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_I, _AP_S, _AP_Q,
+                      _AP_O, _AP_N, _AP_C99);
+    Base::V = t;
+  }
+#endif // ifndef __SYNTHESIS__ else
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_fixed_base(const ap_bit_ref<_AP_W2, _AP_S2>& op) {
+    *this = ((bool)op);
+    report();
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_fixed_base(const ap_range_ref<_AP_W2, _AP_S2>& op) {
+    *this = (ap_int_base<_AP_W2, false>(op));
+    report();
+  }
+
+  template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+  INLINE ap_fixed_base(
+      const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op) {
+    *this = (ap_int_base<_AP_W2 + _AP_W3, false>(op));
+    report();
+  }
+
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  INLINE ap_fixed_base(
+      const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) {
+    *this = (bool(op));
+    report();
+  }
+
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  INLINE ap_fixed_base(
+      const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) {
+    *this = (ap_int_base<_AP_W2, false>(op));
+    report();
+  }
+
+  // ctors from c types.
+  // make a temp ap_fixed_base first, and use ap_fixed_base.operator=
+#define CTOR_FROM_INT(C_TYPE, _AP_W2, _AP_S2) \
+  INLINE ap_fixed_base(const C_TYPE x) { \
+    ap_fixed_base<(_AP_W2), (_AP_W2), (_AP_S2)> tmp; \
+    tmp.V = x; \
+    *this = tmp; \
+  }
+
+  CTOR_FROM_INT(bool, 1, false)
+  CTOR_FROM_INT(char, 8, CHAR_IS_SIGNED)
+  CTOR_FROM_INT(signed char, 8, true)
+  CTOR_FROM_INT(unsigned char, 8, false)
+  CTOR_FROM_INT(short, _AP_SIZE_short, true)
+  CTOR_FROM_INT(unsigned short, _AP_SIZE_short, false)
+  CTOR_FROM_INT(int, _AP_SIZE_int, true)
+  CTOR_FROM_INT(unsigned int, _AP_SIZE_int, false)
+  CTOR_FROM_INT(long, _AP_SIZE_long, true)
+  CTOR_FROM_INT(unsigned long, _AP_SIZE_long, false)
+  CTOR_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true)
+  CTOR_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false)
+#undef CTOR_FROM_INT
+/*
+ * TODO:
+ * There used to be several functions which were AP_WEAK.
+ * Now they're all INLINE except ap_fixed_base(double d).
+ * Maybe we can use '#pragma HLS inline' instead of INLINE.
+ */
+  AP_WEAK ap_fixed_base(double d) {
+    ap_int_base<64, false> ireg;
+    ireg.V = doubleToRawBits(d);
+    bool isneg = _AP_ROOT_op_get_bit(ireg.V, 63);
+
+    ap_int_base<DOUBLE_EXP + 2, true> exp;
+    ap_int_base<DOUBLE_EXP + 2, true> exp_tmp;
+    exp_tmp.V =
+        _AP_ROOT_op_get_range(ireg.V, DOUBLE_MAN, DOUBLE_MAN + DOUBLE_EXP - 1);
+    exp = exp_tmp - DOUBLE_BIAS;
+    ap_int_base<DOUBLE_MAN + 2, true> man;
+    man.V = _AP_ROOT_op_get_range(ireg.V, 0, DOUBLE_MAN - 1);
+    // do not support NaN
+    _AP_WARNING(exp == APFX_IEEE_DOUBLE_E_MAX + 1 && man.V != 0,
+                "assign NaN to fixed point value");
+    man.V = _AP_ROOT_op_set_bit(man.V, DOUBLE_MAN, 1);
+    if (isneg) man = -man;
+    if ((ireg.V & 0x7fffffffffffffffLL) == 0) {
+      Base::V = 0;
+    } else {
+      int _AP_W2 = DOUBLE_MAN + 2, _AP_I2 = exp.V + 2, _AP_F = _AP_W - _AP_I,
+          F2 = _AP_W2 - _AP_I2;
+      bool _AP_S2 = true,
+           QUAN_INC = F2 > _AP_F &&
+                      !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2));
+      bool carry = false;
+      // handle quantization
+      unsigned sh_amt = (F2 > _AP_F) ? F2 - _AP_F : _AP_F - F2;
+      if (F2 == _AP_F)
+        Base::V = man.V;
+      else if (F2 > _AP_F) {
+        if (sh_amt < DOUBLE_MAN + 2)
+          Base::V = man.V >> sh_amt;
+        else {
+          Base::V = isneg ? -1 : 0;
+        }
+        if ((_AP_Q != AP_TRN) && !((_AP_Q == AP_TRN_ZERO) && !_AP_S2)) {
+          bool qb = (F2 - _AP_F > _AP_W2) ? isneg : (bool)_AP_ROOT_op_get_bit(
+                                                        man.V, F2 - _AP_F - 1);
+          bool r =
+              (F2 > _AP_F + 1)
+                  ? _AP_ROOT_op_get_range(man.V, 0, (F2 - _AP_F - 2 < _AP_W2)
+                                                        ? (F2 - _AP_F - 2)
+                                                        : (_AP_W2 - 1)) != 0
+                  : false;
+          carry = quantization_adjust(qb, r, isneg);
+        }
+      } else { // no quantization
+        Base::V = man.V;
+        if (sh_amt < _AP_W)
+          Base::V = Base::V << sh_amt;
+        else
+          Base::V = 0;
+      }
+      // handle overflow/underflow
+      if ((_AP_O != AP_WRAP || _AP_N != 0) &&
+          ((!_AP_S && _AP_S2) ||
+           _AP_I - _AP_S <
+               _AP_I2 - _AP_S2 +
+                   (QUAN_INC ||
+                    (_AP_S2 && (_AP_O == AP_SAT_SYM))))) { // saturation
+        bool deleted_zeros = _AP_S2 ? true : !carry, deleted_ones = true;
+        bool neg_src = isneg;
+        bool lD = false;
+        int pos1 = F2 - _AP_F + _AP_W;
+        int pos2 = F2 - _AP_F + _AP_W + 1;
+        bool newsignbit = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1);
+        if (pos1 < _AP_W2 && pos1 >= 0)
+          // lD = _AP_ROOT_op_get_bit(man.V, pos1);
+          lD = (man.V >> pos1) & 1;
+        if (pos1 < _AP_W2) {
+          bool Range1_all_ones = true;
+          bool Range1_all_zeros = true;
+          bool Range2_all_ones = true;
+          ap_int_base<DOUBLE_MAN + 2, false> Range2;
+          ap_int_base<DOUBLE_MAN + 2, false> all_ones(-1);
+
+          if (pos2 >= 0 && pos2 < _AP_W2) {
+            // Range2.V = _AP_ROOT_op_get_range(man.V,
+            //    pos2, _AP_W2 - 1);
+            Range2.V = man.V;
+            Range2.V >>= pos2;
+            Range2_all_ones = Range2 == (all_ones >> pos2);
+          } else if (pos2 < 0)
+            Range2_all_ones = false;
+          if (pos1 >= 0 && pos2 < _AP_W2) {
+            Range1_all_ones = Range2_all_ones && lD;
+            Range1_all_zeros = !Range2.V && !lD;
+          } else if (pos2 == _AP_W2) {
+            Range1_all_ones = lD;
+            Range1_all_zeros = !lD;
+          } else if (pos1 < 0) {
+            Range1_all_zeros = !man.V;
+            Range1_all_ones = false;
+          }
+
+          deleted_zeros =
+              deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros);
+          deleted_ones =
+              carry ? Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones;
+          neg_src = isneg && !(carry && Range1_all_ones);
+        } else
+          neg_src = isneg && newsignbit;
+        bool neg_trg = _AP_S && newsignbit;
+        bool overflow = (neg_trg || !deleted_zeros) && !isneg;
+        bool underflow = (!neg_trg || !deleted_ones) && neg_src;
+        if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S)
+          underflow |=
+              neg_src &&
+              (_AP_W > 1 ?
_AP_ROOT_op_get_range(Base::V, 0, _AP_W - 2) == 0 + : true); + overflow_adjust(underflow, overflow, lD, neg_src); + } + } + report(); + } + + // TODO more optimized implementation. + INLINE ap_fixed_base(float d) { *this = ap_fixed_base(double(d)); } + +#if _AP_ENABLE_HALF_ == 1 + // TODO more optimized implementation. + INLINE ap_fixed_base(half d) { *this = ap_fixed_base(double(d)); } +#endif + // @} + + /// @name assign operator + /// assign, using another ap_fixed_base of same template parameters. + /* + INLINE ap_fixed_base& operator=( + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { + Base::V = op.V; + return *this; + } + */ + + template + INLINE ap_fixed_base& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + + const int _AP_F = _AP_W - _AP_I; + const int F2 = _AP_W2 - _AP_I2; + const int QUAN_INC = + F2 > _AP_F && !(_AP_Q == AP_TRN || (_AP_Q == AP_TRN_ZERO && !_AP_S2)); + + if (!op) Base::V = 0; + bool carry = false; + bool signbit = _AP_ROOT_op_get_bit(op.V, _AP_W2 - 1); + bool isneg = signbit && _AP_S2; + if (F2 == _AP_F) + Base::V = op.V; + else if (F2 > _AP_F) { + unsigned int sh_amt = F2 - _AP_F; + // moves bits right, handle quantization. + if (sh_amt < _AP_W2) { + Base::V = op.V >> sh_amt; + } else { + Base::V = isneg ? -1 : 0; + } + if (_AP_Q != AP_TRN && !(_AP_Q == AP_TRN_ZERO && !_AP_S2)) { + bool qbit = _AP_ROOT_op_get_bit(op.V, F2 - _AP_F - 1); + // bit after LSB. + bool qb = (F2 - _AP_F > _AP_W2) ? _AP_S2 && signbit : qbit; + enum { hi = ((F2 - _AP_F - 2) < _AP_W2) ? (F2 - _AP_F - 2) : (_AP_W2 - 1) }; + // bits after qb. + bool r = (F2 > _AP_F + 1) ? (_AP_ROOT_op_get_range(op.V, 0, hi) != 0) : false; + carry = quantization_adjust(qb, r, isneg); + } + } else { + unsigned sh_amt = _AP_F - F2; + // moves bits left, no quantization + if (sh_amt < _AP_W) { + if (_AP_W > _AP_W2) { + // extend and then shift, avoid losing bits. + Base::V = op.V; + Base::V <<= sh_amt; + } else { + // shift and truncate. + Base::V = op.V << sh_amt; + } + } else { + Base::V = 0; + } + } + // handle overflow/underflow + if ((_AP_O != AP_WRAP || _AP_N != 0) && + ((!_AP_S && _AP_S2) || + _AP_I - _AP_S < + _AP_I2 - _AP_S2 + + (QUAN_INC || (_AP_S2 && _AP_O == AP_SAT_SYM)))) { // saturation + bool deleted_zeros = _AP_S2 ? true : !carry; + bool deleted_ones = true; + bool neg_src = isneg; + bool newsignbit = _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); + enum { pos1 = F2 - _AP_F + _AP_W, pos2 = F2 - _AP_F + _AP_W + 1 }; + bool lD = (pos1 < _AP_W2 && pos1 >= 0) ? _AP_ROOT_op_get_bit(op.V, pos1) + : false; + if (pos1 < _AP_W2) { + bool Range1_all_ones = true; + bool Range1_all_zeros = true; + bool Range2_all_ones = true; + ap_int_base<_AP_W2, false> all_ones(-1); + + if (pos2 < _AP_W2 && pos2 >= 0) { + ap_int_base<_AP_W2, false> Range2; + Range2.V = _AP_ROOT_op_get_range(op.V, pos2, _AP_W2 - 1); + Range2_all_ones = Range2 == (all_ones >> pos2); + } else if (pos2 < 0) { + Range2_all_ones = false; + } + + if (pos1 >= 0 && pos2 < _AP_W2) { + ap_int_base<_AP_W2, false> Range1; + Range1.V = _AP_ROOT_op_get_range(op.V, pos1, _AP_W2 - 1); + Range1_all_ones = Range1 == (all_ones >> pos1); + Range1_all_zeros = !Range1.V; + } else if (pos2 == _AP_W2) { + Range1_all_ones = lD; + Range1_all_zeros = !lD; + } else if (pos1 < 0) { + Range1_all_zeros = !op.V; + Range1_all_ones = false; + } + + deleted_zeros = + deleted_zeros && (carry ? Range1_all_ones : Range1_all_zeros); + deleted_ones = + carry ? 
Range2_all_ones && (pos1 < 0 || !lD) : Range1_all_ones;
+      neg_src = isneg && !(carry && Range1_all_ones);
+    } else
+      neg_src = isneg && newsignbit;
+    bool neg_trg = _AP_S && newsignbit;
+    bool overflow = (neg_trg || !deleted_zeros) && !isneg;
+    bool underflow = (!neg_trg || !deleted_ones) && neg_src;
+    if ((_AP_O == AP_SAT_SYM) && _AP_S2 && _AP_S)
+      underflow |=
+          neg_src &&
+          (_AP_W > 1 ? _AP_ROOT_op_get_range(Base::V, 0, _AP_W - 2) == 0
+                     : true);
+
+      overflow_adjust(underflow, overflow, lD, neg_src);
+    }
+    return *this;
+  } // operator=
+
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  INLINE ap_fixed_base& operator=(
+      const volatile ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) {
+    operator=(const_cast<const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2,
+                                             _AP_O2, _AP_N2>&>(op));
+    return *this;
+  }
+
+  /// Set this ap_fixed_base with ULL.
+  INLINE ap_fixed_base& setBits(ap_ulong bv) {
+    // TODO when ull is not long enough...
+    Base::V = bv;
+    return *this;
+  }
+
+  /// Return a ap_fixed_base object whose this->V is assigned by bv.
+  static INLINE ap_fixed_base bitsToFixed(ap_ulong bv) {
+    // TODO fix when ull is not long enough...
+    ap_fixed_base t;
+#ifdef __SYNTHESIS__
+    t.V = bv;
+#else
+    t.V.set_bits(bv);
+#endif
+    return t;
+  }
+
+  // Explicit conversion functions to ap_int_base.
+  /** Captures all integer bits, in truncate mode.
+   *  @param[in] Cnative follow conversion from double to int.
+   */
+  INLINE ap_int_base<AP_MAX(_AP_I, 1), _AP_S> to_ap_int_base(
+      bool Cnative = true) const {
+    ap_int_base<AP_MAX(_AP_I, 1), _AP_S> ret;
+    if (_AP_I == 0) {
+      ret.V = 0;
+    } else if (_AP_I > 0 && _AP_I <= _AP_W) {
+      ret.V = _AP_ROOT_op_get_range(Base::V, _AP_W - _AP_I, _AP_W - 1);
+    } else if (_AP_I > _AP_W) {
+      ret.V = _AP_ROOT_op_get_range(Base::V, 0, _AP_W - 1);
+      ret.V <<= (_AP_I - _AP_W);
+    }
+    /* Consider the following case
+     *   float f = -7.5f;
+     *   ap_fixed<8,4> t = f; // -8 0 0 0 . 0.5
+     *   int i = t.to_int();
+     * the result should be -7 instead of -8.
+     * Therefore, after truncation, the value should be increased by 1.
+     * For (-1, 0), carry to MSB will happen, but result 0 is still correct.
+     */
+    if (Cnative && _AP_I < _AP_W) {
+      // Follow C native data type, conversion from double to int
+      if (_AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1) && (_AP_I < _AP_W) &&
+          (_AP_ROOT_op_get_range(
+               Base::V, 0, _AP_I < 0 ? _AP_W - 1 : _AP_W - _AP_I - 1) != 0))
+        ++ret;
+    } else {
+      // Follow OSCI library, conversion from sc_fixed to sc_int
+    }
+    return ret;
+  };
+
+ public:
+  template <int _AP_W2, bool _AP_S2>
+  INLINE operator ap_int_base<_AP_W2, _AP_S2>() const {
+    return ap_int_base<_AP_W2, _AP_S2>(to_ap_int_base());
+  }
+
+  // Explicit conversion function to C built-in integral type.
+  INLINE char to_char() const { return to_ap_int_base().to_char(); }
+
+  INLINE int to_int() const { return to_ap_int_base().to_int(); }
+
+  INLINE unsigned to_uint() const { return to_ap_int_base().to_uint(); }
+
+  INLINE ap_slong to_int64() const { return to_ap_int_base().to_int64(); }
+
+  INLINE ap_ulong to_uint64() const { return to_ap_int_base().to_uint64(); }
+
+  /// convert function to double.
+  /** only round-half-to-even mode supported, does not obey FE env. */
+  INLINE double to_double() const {
+#if defined(AP_FIXED_ENABLE_CPP_FENV)
+    _AP_WARNING(std::fegetround() != FE_TONEAREST,
+                "Only FE_TONEAREST is supported");
+#endif
+    enum { BITS = DOUBLE_MAN + DOUBLE_EXP + 1 };
+    if (!Base::V) return 0.0f;
+    bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign.
+    ap_int_base<_AP_W, false> tmp;
+    if (s)
+      tmp.V = -Base::V; // may truncate one bit extra from neg in sim.
+ else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); ///< number of leading zeros. + int e = _AP_I - l - 1 + DOUBLE_BIAS; ///< exponent + int lsb_index = _AP_W - l - 1 - DOUBLE_MAN; + // more than 0.5? + bool a = (lsb_index >=2) ? + (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; + // round to even + a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; + // ull is at least 64-bit + ap_ulong m; + // may actually left shift, ensure buffer is wide enough. + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (ap_ulong)(tmp.V >> (lsb_index - 1)) + : (ap_ulong)(tmp.V << (1 - lsb_index)); + } else { + m = (ap_ulong)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + //std::cout << '\n' << std::hex << m << '\n'; // TODO delete this + // carry to MSB, increase exponent + if (_AP_ctype_op_get_bit(m, DOUBLE_MAN + 1)) { + e += 1; + } + // set sign and exponent + m = _AP_ctype_op_set_bit(m, BITS - 1, s); + //std::cout << m << '\n'; // TODO delete this + m = _AP_ctype_op_set_range(m, DOUBLE_MAN, DOUBLE_MAN + DOUBLE_EXP - 1, e); + //std::cout << std::hex << m << std::dec << std::endl; // TODO delete this + // cast to fp + return rawBitsToDouble(m); + } + + /// convert function to float. + /** only round-half-to-even mode supported, does not obey FE env. */ + INLINE float to_float() const { +#if defined(AP_FIXED_ENABLE_CPP_FENV) + _AP_WARNING(std::fegetround() != FE_TONEAREST, + "Only FE_TONEAREST is supported"); +#endif + enum { BITS = FLOAT_MAN + FLOAT_EXP + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; // may truncate one bit extra from neg in sim. + else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); ///< number of leading zeros. + int e = _AP_I - l - 1 + FLOAT_BIAS; ///< exponent + int lsb_index = _AP_W - l - 1 - FLOAT_MAN; + // more than 0.5? + bool a = (lsb_index >=2) ? + (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0; + // round to even + a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0; + // ul is at least 32-bit + unsigned long m; + // may actually left shift, ensure buffer is wide enough. + if (_AP_W > BITS) { + m = (lsb_index >= 1) ? (unsigned long)(tmp.V >> (lsb_index - 1)) + : (unsigned long)(tmp.V << (1 - lsb_index)); + } else { + m = (unsigned long)tmp.V; + m = (lsb_index >= 1) ? (m >> (lsb_index - 1)) + : (m << (1 - lsb_index)); + } + m += a; + m >>= 1; + // carry to MSB, increase exponent + if (_AP_ctype_op_get_bit(m, FLOAT_MAN + 1)) { + e += 1; + } + // set sign and exponent + m = _AP_ctype_op_set_bit(m, BITS - 1, s); + m = _AP_ctype_op_set_range(m, FLOAT_MAN, FLOAT_MAN + FLOAT_EXP - 1, e); + // cast to fp + return rawBitsToFloat(m); + } + +#if _AP_ENABLE_HALF_ == 1 + /// convert function to half. + /** only round-half-to-even mode supported, does not obey FE env. */ + INLINE half to_half() const { +#if defined(AP_FIXED_ENABLE_CPP_FENV) + _AP_WARNING(std::fegetround() != FE_TONEAREST, + "Only FE_TONEAREST is supported"); +#endif + enum { BITS = HALF_MAN + HALF_EXP + 1 }; + if (!Base::V) return 0.0f; + bool s = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1); ///< sign. + ap_int_base<_AP_W, false> tmp; + if (s) + tmp.V = -Base::V; // may truncate one bit extra from neg in sim. + else + tmp.V = Base::V; + int l = tmp.countLeadingZeros(); ///< number of leading zeros. 
+    int e = _AP_I - l - 1 + HALF_BIAS; ///< exponent
+    int lsb_index = _AP_W - l - 1 - HALF_MAN;
+    // more than 0.5?
+    bool a = (lsb_index >=2) ?
+        (_AP_ROOT_op_get_range(tmp.V, 0, lsb_index - 2) != 0) : 0;
+    // round to even
+    a |= (lsb_index >=0) ? _AP_ROOT_op_get_bit(tmp.V, lsb_index) : 0;
+    // short is at least 16-bit
+    unsigned short m;
+    // may actually left shift, ensure buffer is wide enough.
+    if (_AP_W > BITS) {
+      m = (lsb_index >= 1) ? (unsigned short)(tmp.V >> (lsb_index - 1))
+                           : (unsigned short)(tmp.V << (1 - lsb_index));
+    } else {
+      m = (unsigned short)tmp.V;
+      m = (lsb_index >= 1) ? (m >> (lsb_index - 1))
+                           : (m << (1 - lsb_index));
+    }
+    m += a;
+    m >>= 1;
+    // carry to MSB, increase exponent
+    if (_AP_ctype_op_get_bit(m, HALF_MAN + 1)) {
+      e += 1;
+    }
+    // set sign and exponent
+    m = _AP_ctype_op_set_bit(m, BITS - 1, s);
+    m = _AP_ctype_op_set_range(m, HALF_MAN, HALF_MAN + HALF_EXP - 1, e);
+    // cast to fp
+    return rawBitsToHalf(m);
+  }
+#endif
+
+  // FIXME inherited from old code, this may lose precision!
+  INLINE operator long double() const { return (long double)to_double(); }
+
+  INLINE operator double() const { return to_double(); }
+
+  INLINE operator float() const { return to_float(); }
+
+#if _AP_ENABLE_HALF_ == 1
+  INLINE operator half() const { return to_half(); }
+#endif
+
+  INLINE operator bool() const { return (bool)Base::V != 0; }
+
+  INLINE operator char() const { return (char)to_int(); }
+
+  INLINE operator signed char() const { return (signed char)to_int(); }
+
+  INLINE operator unsigned char() const { return (unsigned char)to_uint(); }
+
+  INLINE operator short() const { return (short)to_int(); }
+
+  INLINE operator unsigned short() const { return (unsigned short)to_uint(); }
+
+  INLINE operator int() const { return to_int(); }
+
+  INLINE operator unsigned int() const { return to_uint(); }
+
+// FIXME don't assume data width...
+#ifdef __x86_64__
+  INLINE operator long() const { return (long)to_int64(); }
+
+  INLINE operator unsigned long() const { return (unsigned long)to_uint64(); }
+#else
+  INLINE operator long() const { return (long)to_int(); }
+
+  INLINE operator unsigned long() const { return (unsigned long)to_uint(); }
+#endif // ifdef __x86_64__ else
+
+  INLINE operator ap_ulong() const { return to_uint64(); }
+
+  INLINE operator ap_slong() const { return to_int64(); }
+
+  INLINE int length() const { return _AP_W; };
+
+  // bits_to_int64 deleted.
+#ifndef __SYNTHESIS__
+  // Used in autowrap, when _AP_W < 64.
+  INLINE ap_ulong bits_to_uint64() const {
+    return (Base::V).to_uint64();
+  }
+#endif
+
+  // Count the number of zeros from the most significant bit
+  // to the first one bit. Note this is only for ap_fixed_base whose
+  // _AP_W <= 64, otherwise will incur assertion.
+  INLINE int countLeadingZeros() {
+#ifdef __SYNTHESIS__
+    // TODO: use the llvm.ctlz intrinsic?
+    if (_AP_W <= 32) {
+      ap_int_base<32, false> t(-1ULL);
+      t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1);
+      return __builtin_ctz(t.V);
+    } else if (_AP_W <= 64) {
+      ap_int_base<64, false> t(-1ULL);
+      t.range(_AP_W - 1, 0) = this->range(0, _AP_W - 1);
+      return __builtin_ctzll(t.V);
+    } else {
+      enum {__N = (_AP_W + 63) / 64};
+      int NZeros = 0;
+      int i = 0;
+      bool hitNonZero = false;
+      for (i = 0; i < __N - 1; ++i) {
+        ap_int_base<64, false> t;
+        t.range(0, 63) = this->range(_AP_W - i * 64 - 64, _AP_W - i * 64 - 1);
+        NZeros += hitNonZero ?
+      hitNonZero |= (t != 0);
+    }
+    if (!hitNonZero) {
+      ap_int_base<64, false> t(-1ULL);
+      t.range(63 - (_AP_W - 1) % 64, 63) = this->range(0, (_AP_W - 1) % 64);
+      NZeros += __builtin_clzll(t.V);
+    }
+    return NZeros;
+  }
+#else
+  return Base::V.countLeadingZeros();
+#endif
+ }
+
+ // Arithmetic : Binary
+ // -------------------------------------------------------------------------
+ template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+           ap_o_mode _AP_O2, int _AP_N2>
+ INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::mult operator*(
+     const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2)
+     const {
+   typename RType<_AP_W2, _AP_I2, _AP_S2>::mult_base r, t;
+   r.V = Base::V;
+   t.V = op2.V;
+   r.V *= op2.V;
+   return r;
+ }
+
+ // multiply function deleted.
+
+ template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+           ap_o_mode _AP_O2, int _AP_N2>
+ INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::div operator/(
+     const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2)
+     const {
+   typename RType<_AP_W2, _AP_I2, _AP_S2>::div_base r;
+#ifndef __SYNTHESIS__
+   enum {F2 = _AP_W2-_AP_I2,
+         _W1=AP_MAX(_AP_W + AP_MAX(F2, 0) + ((_AP_S2 && !_AP_S) ? 1 : 0), _AP_W2 + ((_AP_S && !_AP_S2) ? 1 : 0))};
+   ap_int_base<_W1,_AP_S||_AP_S2> dividend,divisor;
+   ap_int_base<_W1,_AP_S> tmp1;
+   ap_int_base<_W1,_AP_S2> tmp2;
+   tmp1.V = Base::V;
+   tmp1.V <<= AP_MAX(F2,0);
+   tmp2.V = op2.V;
+   dividend = tmp1;
+   divisor = tmp2;
+   r.V = ((_AP_S||_AP_S2) ? dividend.V.sdiv(divisor.V): dividend.V.udiv(divisor.V));
+#else
+ #ifndef __SC_COMPATIBLE__
+   ap_fixed_base<_AP_W + AP_MAX(_AP_W2 - _AP_I2, 0),_AP_I, _AP_S> t(*this);
+ #else
+   ap_fixed_base<_AP_W + AP_MAX(_AP_W2 - _AP_I2, 0) + AP_MAX(_AP_I2, 0),_AP_I, _AP_S> t(*this);
+ #endif
+   r.V = t.V / op2.V;
+#endif
+/*
+  enum {
+    F2 = _AP_W2 - _AP_I2,
+    shl = AP_MAX(F2, 0) + AP_MAX(_AP_I2, 0),
+#ifndef __SC_COMPATIBLE__
+    shr = AP_MAX(_AP_I2, 0),
+#else
+    shr = 0,
+#endif
+    W3 = _AP_S2 + _AP_W + shl,
+    S3 = _AP_S || _AP_S2,
+  };
+  ap_int_base<W3, S3> dividend, t;
+  dividend.V = Base::V;
+  // multiply both by (1 << F2), and then do integer division.
+  dividend.V <<= (int) shl;
+#ifdef __SYNTHESIS__
+  // .V's have right signedness, and will have right extending.
+  t.V = dividend.V / op2.V;
+#else
+  // XXX op2 may be wider than dividend, and sdiv and udiv take the same
+  // width as the left-hand operand, so data might be truncated by mistake
+  // if not handled here.
+  t.V = S3 ? dividend.V.sdiv(op2.V) : dividend.V.udiv(op2.V);
+#endif
+  r.V = t.V >> (int) shr;
+*/
+  return r;
+ }
+
+#define OP_BIN_AF(Sym, Rty)                                                  \
+ template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,            \
+           ap_o_mode _AP_O2, int _AP_N2>                                     \
+ INLINE typename RType<_AP_W2, _AP_I2, _AP_S2>::Rty operator Sym(            \
+     const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&    \
+         op2) const {                                                        \
+   typename RType<_AP_W2, _AP_I2, _AP_S2>::Rty##_base ret, lhs(*this),       \
+       rhs(op2);                                                             \
+   ret.V = lhs.V Sym rhs.V;                                                  \
+   return ret;                                                               \
+ }
+
+ OP_BIN_AF(+, plus)
+ OP_BIN_AF(-, minus)
+ OP_BIN_AF(&, logic)
+ OP_BIN_AF(|, logic)
+ OP_BIN_AF(^, logic)
+
+// Arithmetic : assign
+// -------------------------------------------------------------------------
+#define OP_ASSIGN_AF(Sym)                                                    \
+ template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,            \
+           ap_o_mode _AP_O2, int _AP_N2>                                     \
+ INLINE ap_fixed_base& operator Sym##=(                                      \
+     const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&    \
+         op2) {                                                              \
+   *this = operator Sym(op2);                                                \
+   return *this;                                                             \
+ }
+
+ OP_ASSIGN_AF(*)
+ OP_ASSIGN_AF(/)
+ OP_ASSIGN_AF(+)
+ OP_ASSIGN_AF(-)
+ OP_ASSIGN_AF(&)
+ OP_ASSIGN_AF(|)
+ OP_ASSIGN_AF(^)
+
+ // Prefix and postfix increment and decrement.
+ // -------------------------------------------------------------------------
+
+ /// Prefix increment.
+ INLINE ap_fixed_base& operator++() {
+   operator+=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1));
+   return *this;
+ }
+
+ /// Prefix decrement.
+ INLINE ap_fixed_base& operator--() {
+   operator-=(ap_fixed_base<_AP_W - _AP_I + 1, 1, false>(1));
+   return *this;
+ }
+
+ /// Postfix increment.
+ INLINE const ap_fixed_base operator++(int) {
+   ap_fixed_base r(*this);
+   operator++();
+   return r;
+ }
+
+ /// Postfix decrement.
+ INLINE const ap_fixed_base operator--(int) {
+   ap_fixed_base r(*this);
+   operator--();
+   return r;
+ }
+
+ // Unary arithmetic.
+ // -------------------------------------------------------------------------
+ INLINE ap_fixed_base operator+() { return *this; }
+
+ INLINE ap_fixed_base<_AP_W + 1, _AP_I + 1, true> operator-() const {
+   ap_fixed_base<_AP_W + 1, _AP_I + 1, true> r(*this);
+   r.V = -r.V;
+   return r;
+ }
+
+ INLINE ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> getNeg() {
+   ap_fixed_base<_AP_W, _AP_I, true, _AP_Q, _AP_O, _AP_N> r(*this);
+   r.V = -r.V;
+   return r;
+ }
+
+ // Not (!)
+ // -------------------------------------------------------------------------
+ INLINE bool operator!() const { return Base::V == 0; }
+
+ // Bitwise complement
+ // -------------------------------------------------------------------------
+ // XXX different from Mentor's ac_fixed.
+ INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S> operator~() const {
+   ap_fixed_base<_AP_W, _AP_I, _AP_S> r;
+   r.V = ~Base::V;
+   return r;
+ }
+
+ // Shift
+ // -------------------------------------------------------------------------
+ // left shift is the same as moving the point right, i.e. increase I.
+ template <int _AP_SHIFT>
+ INLINE ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> lshift() const {
+   ap_fixed_base<_AP_W, _AP_I + _AP_SHIFT, _AP_S> r;
+   r.V = Base::V;
+   return r;
+ }
+
+ template <int _AP_SHIFT>
+ INLINE ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> rshift() const {
+   ap_fixed_base<_AP_W, _AP_I - _AP_SHIFT, _AP_S> r;
+   r.V = Base::V;
+   return r;
+ }
+
+ // Because the return type is the type of the first operand, shift assign
+ // operators do not carry out any quantization or overflow handling.
+ // In SystemC, by contrast, shift assigns for sc_fixed/sc_ufixed will result
+ // in quantization or overflow (depending on the mode of the first operand).
+ INLINE ap_fixed_base operator<<(unsigned int sh) const {
+   ap_fixed_base r;
+   r.V = Base::V << sh;
+// TODO check shift overflow?
+#ifdef __SC_COMPATIBLE__
+   if (sh == 0) return r;
+   if (_AP_O != AP_WRAP || _AP_N != 0) {
+     bool neg_src = _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1);
+     bool allones, allzeros;
+     ap_int_base<_AP_W, false> ones(-1);
+     if (sh <= _AP_W) {
+       ap_int_base<_AP_W, false> range1;
+       range1.V = _AP_ROOT_op_get_range(
+           const_cast<ap_fixed_base*>(this)->Base::V, _AP_W - sh, _AP_W - 1);
+       allones = range1 == (ones >> (_AP_W - sh));
+       allzeros = range1 == 0;
+     } else {
+       allones = false;
+       allzeros = Base::V == 0;
+     }
+     bool overflow = !allzeros && !neg_src;
+     bool underflow = !allones && neg_src;
+     if ((_AP_O == AP_SAT_SYM) && _AP_S)
+       underflow |=
+           neg_src &&
+           (_AP_W > 1 ? _AP_ROOT_op_get_range(r.V, 0, _AP_W - 2) == 0 : true);
+     bool lD = false;
+     if (sh < _AP_W) lD = _AP_ROOT_op_get_bit(Base::V, _AP_W - sh - 1);
+     r.overflow_adjust(underflow, overflow, lD, neg_src);
+   }
+#endif
+   return r;
+ }
+
+ INLINE ap_fixed_base operator>>(unsigned int sh) const {
+   ap_fixed_base r;
+   r.V = Base::V >> sh;
+// TODO check shift overflow?
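+   // Note: without __SC_COMPATIBLE__ this is a plain arithmetic shift of the
+   // underlying bits (sign-extending for signed values), so the shifted-out
+   // fraction is truncated toward negative infinity regardless of _AP_Q; the
+   // block below restores SystemC-style quantization of the dropped bits.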
+#ifdef __SC_COMPATIBLE__ + if (sh == 0) return r; + if (_AP_Q != AP_TRN) { + bool qb = false; + if (sh <= _AP_W) qb = _AP_ROOT_op_get_bit(Base::V, sh - 1); + bool rb = false; + if (sh > 1 && sh <= _AP_W) + rb = _AP_ROOT_op_get_range(const_cast(this)->Base::V, 0, + sh - 2) != 0; + else if (sh > _AP_W) + rb = Base::V != 0; + r.quantization_adjust(qb, rb, + _AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)); + } +#endif + return r; + } + + // left and right shift for int + INLINE ap_fixed_base operator<<(int sh) const { + ap_fixed_base r; + bool isNeg = sh < 0; + unsigned int ush = isNeg ? -sh : sh; + if (isNeg) { + return operator>>(ush); + } else { + return operator<<(ush); + } + } + + INLINE ap_fixed_base operator>>(int sh) const { + bool isNeg = sh < 0; + unsigned int ush = isNeg ? -sh : sh; + if (isNeg) { + return operator<<(ush); + } else { + return operator>>(ush); + } + } + + // left and right shift for ap_int. + template + INLINE ap_fixed_base operator<<(const ap_int_base<_AP_W2, true>& op2) const { + // TODO the code seems not optimal. ap_fixed<8,8> << ap_int<2> needs only a + // small mux, but integer need a big one! + int sh = op2.to_int(); + return operator<<(sh); + } + + template + INLINE ap_fixed_base operator>>(const ap_int_base<_AP_W2, true>& op2) const { + int sh = op2.to_int(); + return operator>>(sh); + } + + // left and right shift for ap_uint. + template + INLINE ap_fixed_base operator<<(const ap_int_base<_AP_W2, false>& op2) const { + unsigned int sh = op2.to_uint(); + return operator<<(sh); + } + + template + INLINE ap_fixed_base operator>>(const ap_int_base<_AP_W2, false>& op2) const { + unsigned int sh = op2.to_uint(); + return operator>>(sh); + } + + // left and right shift for ap_fixed + template + INLINE ap_fixed_base operator<<( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + op2) { + return operator<<(op2.to_ap_int_base()); + } + + template + INLINE ap_fixed_base operator>>( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + op2) { + return operator>>(op2.to_ap_int_base()); + } + + // Shift assign. + // ------------------------------------------------------------------------- + + // left shift assign. + INLINE ap_fixed_base& operator<<=(const int sh) { + *this = operator<<(sh); + return *this; + } + + INLINE ap_fixed_base& operator<<=(const unsigned int sh) { + *this = operator<<(sh); + return *this; + } + + template + INLINE ap_fixed_base& operator<<=(const ap_int_base<_AP_W2, _AP_S2>& sh) { + *this = operator<<(sh.to_int()); + return *this; + } + + template + INLINE ap_fixed_base& operator<<=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + sh) { + *this = operator<<(sh.to_int()); + return *this; + } + + // right shift assign. + INLINE ap_fixed_base& operator>>=(const int sh) { + *this = operator>>(sh); + return *this; + } + + INLINE ap_fixed_base& operator>>=(const unsigned int sh) { + *this = operator>>(sh); + return *this; + } + + template + INLINE ap_fixed_base& operator>>=(const ap_int_base<_AP_W2, _AP_S2>& sh) { + *this = operator>>(sh.to_int()); + return *this; + } + + template + INLINE ap_fixed_base& operator>>=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + sh) { + *this = operator>>(sh.to_int()); + return *this; + } + +// Comparisons. 
+// ------------------------------------------------------------------------- +#define OP_CMP_AF(Sym) \ + template \ + INLINE bool operator Sym(const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, \ + _AP_O2, _AP_N2>& op2) const { \ + enum { _AP_F = _AP_W - _AP_I, F2 = _AP_W2 - _AP_I2 }; \ + if (_AP_F == F2) \ + return Base::V Sym op2.V; \ + else if (_AP_F > F2) \ + return Base::V Sym ap_fixed_base(op2).V; \ + else \ + return ap_fixed_base(*this).V Sym op2.V; \ + return false; \ + } + + OP_CMP_AF(>) + OP_CMP_AF(<) + OP_CMP_AF(>=) + OP_CMP_AF(<=) + OP_CMP_AF(==) + OP_CMP_AF(!=) +// FIXME: Move compare with double out of struct ap_fixed_base defination +// and combine it with compare operator(double, ap_fixed_base) +#define DOUBLE_CMP_AF(Sym) \ + INLINE bool operator Sym(double d) const { return to_double() Sym d; } + + DOUBLE_CMP_AF(>) + DOUBLE_CMP_AF(<) + DOUBLE_CMP_AF(>=) + DOUBLE_CMP_AF(<=) + DOUBLE_CMP_AF(==) + DOUBLE_CMP_AF(!=) + + // Bit and Slice Select + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[]( + unsigned index) { + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index); + } + + template + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator[]( + const ap_int_base<_AP_W2, _AP_S2>& index) { + _AP_WARNING(index < 0, "Attempting to read bit with negative index"); + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, + index.to_int()); + } + + INLINE bool operator[](unsigned index) const { + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, index); + } + + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit( + unsigned index) { + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, index); + } + + template + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> bit( + const ap_int_base<_AP_W2, _AP_S2>& index) { + _AP_WARNING(index < 0, "Attempting to read bit with negative index"); + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, + index.to_int()); + } + + INLINE bool bit(unsigned index) const { + _AP_WARNING(index >= _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, index); + } + + template + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit( + const ap_int_base<_AP_W2, true>& index) { + _AP_WARNING(index < _AP_I - _AP_W, + "Attempting to read bit with negative index"); + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( + this, index.to_int() + _AP_W - _AP_I); + } + + INLINE bool get_bit(int index) const { + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + _AP_WARNING(index < _AP_I - _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, + index + _AP_W - _AP_I); + } +#if 0 + INLINE af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> get_bit( + int index) { + _AP_WARNING(index < _AP_I - _AP_W, + "Attempting to read bit with negative index"); + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + return af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( + this, index + _AP_W - _AP_I); + } +#endif + + template + 
INLINE bool get_bit(const ap_int_base<_AP_W2, true>& index) const { + _AP_WARNING(index >= _AP_I, "Attempting to read bit beyond MSB"); + _AP_WARNING(index < _AP_I - _AP_W, "Attempting to read bit beyond MSB"); + return _AP_ROOT_op_get_bit(const_cast(this)->V, + index.to_int() + _AP_W - _AP_I); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range(int Hi, + int Lo) { + _AP_WARNING((Hi >= _AP_W) || (Lo >= _AP_W), "Out of bounds in range()"); + return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(this, Hi, Lo); + } + + // This is a must to strip constness to produce reference type. + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + int Hi, int Lo) const { + _AP_WARNING((Hi >= _AP_W) || (Lo >= _AP_W), "Out of bounds in range()"); + return af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>( + const_cast(this), Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() { + return this->range(_AP_W - 1, 0); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> range() const { + return this->range(_AP_W - 1, 0); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + int Hi, int Lo) { + return this->range(Hi, Lo); + } + + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + int Hi, int Lo) const { + return this->range(Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE bool is_zero() const { return Base::V == 0; } + + INLINE bool is_neg() const { + if (_AP_S && _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)) return true; + return false; + } + + INLINE int wl() const { return _AP_W; } + + INLINE int iwl() const { return _AP_I; } + + INLINE ap_q_mode q_mode() const { return _AP_Q; } + + INLINE ap_o_mode o_mode() const { return _AP_O; } + + INLINE int n_bits() const { return _AP_N; } + + // print a string representation of this number in the given radix. + // Radix support is 2, 8, 10, or 16. + // The result will include a prefix indicating the radix, except for decimal, + // where no prefix is needed. The default is to output a signed representation + // of signed numbers, or an unsigned representation of unsigned numbers. For + // non-decimal formats, this can be changed by the 'sign' argument. 
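+ // For example (illustrative): for an ap_fixed<8, 4> holding 2.75,
+ // to_string(10) yields "2.75", while to_string(2) yields the same bits
+ // in radix-prefixed binary form.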
+#ifndef __SYNTHESIS__ + std::string to_string(unsigned char radix = 2, bool sign = _AP_S) const { + // XXX in autosim/autowrap.tcl "(${name}).to_string(2).c_str()" is used to + // initialize sc_lv, which seems incapable of handling format "-0b". + if (radix == 2) sign = false; + + std::string str; + str.clear(); + char step = 0; + bool isNeg = sign && (Base::V < 0); + + // Extend to take care of the -MAX case. + ap_fixed_base<_AP_W + 1, _AP_I + 1> tmp(*this); + if (isNeg) { + tmp = -tmp; + str += '-'; + } + std::string prefix; + switch (radix) { + case 2: + prefix = "0b"; + step = 1; + break; + case 8: + prefix = "0o"; + step = 3; + break; + case 16: + prefix = "0x"; + step = 4; + break; + default: + break; + } + + if (_AP_I > 0) { + // Note we drop the quantization and rounding flags here. The + // integer part is always in range, and the fractional part we + // want to drop. Also, the number is always positive, because + // of the absolute value above. + ap_int_base int_part; + // [1] [ I ] d [ W - I ] + // | | | + // | W-I 0 + // W + int_part.V = _AP_ROOT_op_get_range( + tmp.V, _AP_W - _AP_I, _AP_W); + str += int_part.to_string(radix, false); + } else { + str += prefix; + str += '0'; + } + + ap_fixed_base frac_part = tmp; + + if (radix == 10) { + if (frac_part != 0) { + str += "."; + while (frac_part != 0) { + char digit = (frac_part * radix).to_char(); + str += static_cast(digit + '0'); + frac_part *= radix; + } + } + } else { + if (frac_part != 0) { + str += "."; + for (signed i = _AP_W - _AP_I - 1; i >= 0; i -= step) { + char digit = frac_part.range(i, AP_MAX(0, i - step + 1)).to_char(); + // If we have a partial bit pattern at the end, then we need + // to put it in the high-order bits of 'digit'. + int offset = AP_MIN(0, i - step + 1); + digit <<= -offset; + str += digit < 10 ? static_cast(digit + '0') + : static_cast(digit - 10 + 'a'); + } + if (radix == 16) + str += "p0"; // C99 Hex constants are required to have an exponent. + } + } + return str; + } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string(unsigned char radix = 2, bool sign = _AP_S) const { + return 0; + } +#endif +}; // struct ap_fixed_base. 
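+
+// A minimal usage sketch (illustrative only; the user-facing ap_fixed<W, I>
+// of ap_fixed.h derives from this base with the same template arguments):
+//
+//   ap_fixed<8, 4> x = 2.75;   // 4 integer bits, 4 fractional bits
+//   float f = x.to_float();    // 2.75f, round-half-to-even conversion
+//   double d = (double)x;      // operator double() -> to_double()
+//   x <<= 1;                   // plain shift of the raw bits, x is now 5.5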
+ +template +INLINE void b_not( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { + ret.V = ~op.V; +} + +template +INLINE void b_and( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + ret.V = op1.V & op2.V; +} + +template +INLINE void b_or( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + ret.V = op1.V | op2.V; +} + +template +INLINE void b_xor( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + ret.V = op1.V ^ op2.V; +} + +template +INLINE void neg( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + ap_fixed_base<_AP_W2 + !_AP_S2, _AP_I2 + !_AP_S2, true, _AP_Q2, _AP_O2, + _AP_N2> + t; + t.V = -op.V; + ret = t; +} + +template +INLINE void lshift( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op, + int i) { + enum { + F2 = _AP_W2 - _AP_I2, + _AP_I3 = AP_MAX(_AP_I, _AP_I2), + _AP_W3 = _AP_I3 + F2, + }; + // wide buffer + ap_fixed_base<_AP_W3, _AP_I3, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t; + t.V = op.V; + t.V <<= i; // FIXME overflow? + // handle quantization and overflow + ret = t; +} + +template +INLINE void rshift( + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ret, + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op, + int i) { + enum { + F = _AP_W - _AP_I, + F2 = _AP_W2 - _AP_I2, + F3 = AP_MAX(F, F2), + _AP_W3 = _AP_I2 + F3, + sh = F - F2, + }; + // wide buffer + ap_fixed_base<_AP_W3, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> t; + t.V = op.V; + if (sh >= 0) + t.V <<= (int) sh; + t.V >>= i; + // handle quantization and overflow + ret = t; +} + +//// FIXME +//// These partial specialization ctors allow code like +//// char c = 'a'; +//// ap_fixed_base<8, 8, true> x(c); +//// but what bout ap_fixed_base<9, 9, true> y(c) ? 
+// + +#ifndef __SYNTHESIS__ +INLINE std::string scientificFormat(std::string& input) { + if (input.length() == 0) return input; + + size_t decPosition = input.find('.'); + if (decPosition == std::string::npos) decPosition = input.length(); + + size_t firstNonZeroPos = 0; + for (; input[firstNonZeroPos] > '9' || input[firstNonZeroPos] < '1'; + firstNonZeroPos++) + ; + + int exp; + if (firstNonZeroPos > decPosition) + exp = decPosition - firstNonZeroPos; + else + exp = decPosition - firstNonZeroPos - 1; + std::string expString = ""; + if (exp == 0) + ; + else if (exp < 0) { + expString += "e-"; + exp = -exp; + } else + expString += "e+"; + + if (exp < 10 && exp > 0) { + expString += '0'; + expString += (char)('0' + exp); + } else if (exp != 0) { + std::string tmp; + + std::ostringstream oss; + oss << exp; + + tmp = oss.str(); + expString += tmp; + } + + int lastNonZeroPos = (int)(input.length() - 1); + for (; lastNonZeroPos >= 0; --lastNonZeroPos) + if (input[lastNonZeroPos] <= '9' && input[lastNonZeroPos] > '0') break; + + std::string ans = ""; + ans += input[firstNonZeroPos]; + if (firstNonZeroPos != (size_t)lastNonZeroPos) { + ans += '.'; + for (int i = firstNonZeroPos + 1; i <= lastNonZeroPos; i++) + if (input[i] != '.') ans += input[i]; + } + + ans += expString; + return ans; +} + +INLINE std::string reduceToPrecision(std::string& input, int precision) { + bool isZero = true; + size_t inputLen = input.length(); + for (size_t i = 0; i < inputLen && isZero; i++) + if (input[i] != '.' && input[i] != '0') isZero = false; + if (isZero) return "0"; + + // Find the first valid number, skip '-' + int FirstNonZeroPos = 0; + int LastNonZeroPos = (int)inputLen - 1; + int truncBitPosition = 0; + size_t decPosition = input.find('.'); + for (; input[FirstNonZeroPos] < '1' || input[FirstNonZeroPos] > '9'; + FirstNonZeroPos++) + ; + + for (; input[LastNonZeroPos] < '1' || input[LastNonZeroPos] > '9'; + LastNonZeroPos--) + ; + + if (decPosition == std::string::npos) decPosition = inputLen; + // Count the valid number, to decide whether we need to truncate + if ((int)decPosition > LastNonZeroPos) { + if (LastNonZeroPos - FirstNonZeroPos + 1 <= precision) return input; + truncBitPosition = FirstNonZeroPos + precision; + } else if ((int)decPosition < FirstNonZeroPos) { // This is pure decimal + if (LastNonZeroPos - FirstNonZeroPos + 1 <= precision) { + if (FirstNonZeroPos - decPosition - 1 < 4) { + return input; + } else { + if (input[0] == '-') { + std::string tmp = input.substr(1, inputLen - 1); + return std::string("-") + scientificFormat(tmp); + } else + return scientificFormat(input); + } + } + truncBitPosition = FirstNonZeroPos + precision; + } else { + if (LastNonZeroPos - FirstNonZeroPos <= precision) return input; + truncBitPosition = FirstNonZeroPos + precision + 1; + } + + // duplicate the input string, we want to add "0" before the valid numbers + // This is easy for quantization, since we may change 9999 to 10000 + std::string ans = ""; + std::string dupInput = "0"; + if (input[0] == '-') { + ans += '-'; + dupInput += input.substr(1, inputLen - 1); + } else { + dupInput += input.substr(0, inputLen); + ++truncBitPosition; + } + + // Add 'carry' after truncation, if necessary + bool carry = dupInput[truncBitPosition] > '4'; + for (int i = truncBitPosition - 1; i >= 0 && carry; i--) { + if (dupInput[i] == '.') continue; + if (dupInput[i] == '9') + dupInput[i] = '0'; + else { + ++dupInput[i]; + carry = false; + } + } + + // bits outside precision range should be set to 0 + if (dupInput[0] == '1') 
+    FirstNonZeroPos = 0;
+  else {
+    FirstNonZeroPos = 0;
+    while (dupInput[FirstNonZeroPos] < '1' || dupInput[FirstNonZeroPos] > '9')
+      ++FirstNonZeroPos;
+  }
+
+  unsigned it = FirstNonZeroPos;
+  int NValidNumber = 0;
+  while (it < dupInput.length()) {
+    if (dupInput[it] == '.') {
+      ++it;
+      continue;
+    }
+    ++NValidNumber;
+    if (NValidNumber > precision) dupInput[it] = '0';
+    ++it;
+  }
+
+  // Now adjust the truncate position and the value.
+  decPosition = dupInput.find('.');
+  if (decPosition == std::string::npos) // when this is an integer
+    truncBitPosition = (int)dupInput.length();
+  else
+    for (truncBitPosition = (int)(dupInput.length() - 1); truncBitPosition >= 0;
+         --truncBitPosition) {
+      if (dupInput[truncBitPosition] == '.') break;
+      if (dupInput[truncBitPosition] != '0') {
+        truncBitPosition++;
+        break;
+      }
+    }
+
+  if (dupInput[0] == '1')
+    dupInput = dupInput.substr(0, truncBitPosition);
+  else
+    dupInput = dupInput.substr(1, truncBitPosition - 1);
+
+  decPosition = dupInput.find('.');
+  if (decPosition != std::string::npos) {
+    size_t it = 0;
+    for (it = decPosition + 1; dupInput[it] == '0'; it++)
+      ;
+    if (it - decPosition - 1 < 4) {
+      ans += dupInput;
+      return ans;
+    } else {
+      ans += scientificFormat(dupInput);
+      return ans;
+    }
+  } else if ((int)(dupInput.length()) <= precision) {
+    ans += dupInput;
+    return ans;
+  }
+
+  ans += scientificFormat(dupInput);
+  return ans;
+}
+
+template <int _AP_W, int _AP_I, bool _AP_S, ap_q_mode _AP_Q, ap_o_mode _AP_O,
+          int _AP_N>
+INLINE void print(
+    const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) {
+  if (_AP_I > 0) {
+    ap_int_base<_AP_I, _AP_S> p1;
+    p1.V = x.V >> (_AP_W - _AP_I);
+    print(p1.V); // a print overload for .V should exist
+  } else {
+    printf("0");
+  }
+  printf(".");
+  if (_AP_I < _AP_W) {
+    ap_int_base<_AP_W - _AP_I, false> p2;
+    p2.V = _AP_ROOT_op_get_range(x.V, 0, _AP_W - _AP_I);
+    print(p2.V, false); // a print overload for .V should exist
+  }
+}
+#endif // ifndef __SYNTHESIS__
+
+// XXX the following two functions have to exist in synthesis,
+// as some old HLS Video Library code uses the ostream overload,
+// although HLS will later delete I/O function calls.
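+// (These operators are what make `std::cout << x` work in C simulation
+// testbenches: the ostream overload prints via to_string(10) and honors the
+// stream's width, precision and fill settings.)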
+ +/// Output streaming +//----------------------------------------------------------------------------- +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<( + std::ostream& out, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + // TODO support std::ios_base::fmtflags + unsigned width = out.width(); + unsigned precision = out.precision(); + char fill = out.fill(); + std::string str = x.to_string(10, _AP_S); + str = reduceToPrecision(str, precision); + if (width > str.length()) { + for (unsigned i = 0; i < width - str.length(); ++i) + out << fill; + } + out << str; + return out; +} +#endif // ifndef __SYNTHESIS__ + +/// Input streaming +// ----------------------------------------------------------------------------- +#ifndef __SYNTHESIS__ +template +INLINE std::istream& operator>>( + std::istream& in, + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + double d; + in >> d; + x = ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>(d); + return in; +} +#endif +#endif // ifndef AP_AUTOCC + +/// Operators mixing Integers with ap_fixed_base +// ----------------------------------------------------------------------------- +#define AF_BIN_OP_WITH_INT_SF(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator BIN_OP(ap_int_base<_AP_W2, _AP_S2>(i_op)); \ + } + +#define AF_BIN_OP_WITH_INT(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator BIN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + C_TYPE i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator BIN_OP(op); \ + } + +#define AF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator REL_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + template \ + INLINE bool operator REL_OP( \ + C_TYPE i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator REL_OP(op); \ + } + +#define AF_ASSIGN_OP_WITH_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ + operator ASSIGN_OP( \ + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator ASSIGN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } + +#define AF_ASSIGN_OP_WITH_INT_SF(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ + operator ASSIGN_OP( \ + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE i_op) { \ + return op.operator ASSIGN_OP(ap_int_base<_AP_W2, _AP_S2>(i_op)); \ + } + +#define 
ALL_AF_OP_WITH_INT(C_TYPE, BITS, SIGN) \ + AF_BIN_OP_WITH_INT(+, C_TYPE, (BITS), (SIGN), plus) \ + AF_BIN_OP_WITH_INT(-, C_TYPE, (BITS), (SIGN), minus) \ + AF_BIN_OP_WITH_INT(*, C_TYPE, (BITS), (SIGN), mult) \ + AF_BIN_OP_WITH_INT(/, C_TYPE, (BITS), (SIGN), div) \ + AF_BIN_OP_WITH_INT(&, C_TYPE, (BITS), (SIGN), logic) \ + AF_BIN_OP_WITH_INT(|, C_TYPE, (BITS), (SIGN), logic) \ + AF_BIN_OP_WITH_INT(^, C_TYPE, (BITS), (SIGN), logic) \ + AF_BIN_OP_WITH_INT_SF(>>, C_TYPE, (BITS), (SIGN), lhs) \ + AF_BIN_OP_WITH_INT_SF(<<, C_TYPE, (BITS), (SIGN), lhs) \ + \ + AF_ASSIGN_OP_WITH_INT(+=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(-=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(*=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(/=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(&=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(|=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT(^=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT_SF(>>=, C_TYPE, (BITS), (SIGN)) \ + AF_ASSIGN_OP_WITH_INT_SF(<<=, C_TYPE, (BITS), (SIGN)) \ + \ + AF_REL_OP_WITH_INT(>, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(<, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(>=, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(<=, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(==, C_TYPE, (BITS), (SIGN)) \ + AF_REL_OP_WITH_INT(!=, C_TYPE, (BITS), (SIGN)) + +ALL_AF_OP_WITH_INT(bool, 1, false) +ALL_AF_OP_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_AF_OP_WITH_INT(signed char, 8, true) +ALL_AF_OP_WITH_INT(unsigned char, 8, false) +ALL_AF_OP_WITH_INT(short, _AP_SIZE_short, true) +ALL_AF_OP_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_AF_OP_WITH_INT(int, _AP_SIZE_int, true) +ALL_AF_OP_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_AF_OP_WITH_INT(long, _AP_SIZE_long, true) +ALL_AF_OP_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_AF_OP_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +ALL_AF_OP_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef ALL_AF_OP_WITH_INT +#undef AF_BIN_OP_WITH_INT +#undef AF_BIN_OP_WITH_INT_SF +#undef AF_ASSIGN_OP_WITH_INT +#undef AF_ASSIGN_OP_WITH_INT_SF +#undef AF_REL_OP_WITH_INT + +/* + * ********************************************************************** + * TODO + * There is no operator defined with float/double/long double, so that + * code like + * ap_fixed<8,4> a = 1.5f; + * a += 0.5f; + * will fail in compilation. + * Operator with warning about conversion might be wanted. 
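+ * A workaround that does compile is to convert explicitly first, e.g.
+ * (illustrative):
+ *   a += ap_fixed<8,4>(0.5); // construct first, then use operator+= above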
+ * ********************************************************************** + */ + +#define AF_BIN_OP_WITH_AP_INT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>::template RType< \ + _AP_W, _AP_I, _AP_S>::RTYPE \ + operator BIN_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator BIN_OP(op); \ + } \ + \ + template \ + INLINE typename ap_fixed_base<_AP_W, _AP_I, _AP_S>::template RType< \ + _AP_W2, _AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ + return op.operator BIN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } + +#define AF_REL_OP_WITH_AP_INT(REL_OP) \ + template \ + INLINE bool operator REL_OP( \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ + return op.operator REL_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op).operator REL_OP(op); \ + } + +#define AF_ASSIGN_OP_WITH_AP_INT(ASSIGN_OP) \ + template \ + INLINE ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& \ + operator ASSIGN_OP( \ + ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& i_op) { \ + return op.operator ASSIGN_OP(ap_fixed_base<_AP_W2, _AP_W2, _AP_S2>(i_op)); \ + } \ + \ + template \ + INLINE ap_int_base<_AP_W2, _AP_S2>& operator ASSIGN_OP( \ + ap_int_base<_AP_W2, _AP_S2>& i_op, \ + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return i_op.operator ASSIGN_OP(op.to_ap_int_base()); \ + } + +AF_BIN_OP_WITH_AP_INT(+, plus) +AF_BIN_OP_WITH_AP_INT(-, minus) +AF_BIN_OP_WITH_AP_INT(*, mult) +AF_BIN_OP_WITH_AP_INT(/, div) +AF_BIN_OP_WITH_AP_INT(&, logic) +AF_BIN_OP_WITH_AP_INT(|, logic) +AF_BIN_OP_WITH_AP_INT(^, logic) + +#undef AF_BIN_OP_WITH_AP_INT + +AF_ASSIGN_OP_WITH_AP_INT(+=) +AF_ASSIGN_OP_WITH_AP_INT(-=) +AF_ASSIGN_OP_WITH_AP_INT(*=) +AF_ASSIGN_OP_WITH_AP_INT(/=) +AF_ASSIGN_OP_WITH_AP_INT(&=) +AF_ASSIGN_OP_WITH_AP_INT(|=) +AF_ASSIGN_OP_WITH_AP_INT(^=) + +#undef AF_ASSIGN_OP_WITH_AP_INT + +AF_REL_OP_WITH_AP_INT(==) +AF_REL_OP_WITH_AP_INT(!=) +AF_REL_OP_WITH_AP_INT(>) +AF_REL_OP_WITH_AP_INT(>=) +AF_REL_OP_WITH_AP_INT(<) +AF_REL_OP_WITH_AP_INT(<=) + +#undef AF_REL_OP_WITH_AP_INT + +// Relational Operators with double +template +INLINE bool operator==( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator==(op1); +} + +template +INLINE bool operator!=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator!=(op1); +} + +template +INLINE bool operator>( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator<(op1); +} + +template +INLINE bool operator>=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator<=(op1); +} + +template +INLINE bool operator<( + double op1, + const ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator>(op1); +} + +template +INLINE bool operator<=( + double op1, + const ap_fixed_base<_AP_W, _AP_I, 
_AP_S, _AP_Q, _AP_O, _AP_N>& op2) { + return op2.operator>=(op1); +} + +#endif // ifndef __cplusplus else + +#endif // ifndef __AP_FIXED_BASE_H__ else + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_fixed_ref.h b/hls4ml/templates/vivado/ap_types/ap_fixed_ref.h index a1c2816c79..aefda0a676 100644 --- a/hls4ml/templates/vivado/ap_types/ap_fixed_ref.h +++ b/hls4ml/templates/vivado/ap_types/ap_fixed_ref.h @@ -1,718 +1,718 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_FIXED_REF_H__ -#define __AP_FIXED_REF_H__ - -#ifndef __AP_FIXED_H__ -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." -#endif - -#ifndef __cplusplus -#error "C++ is required to include this header file" - -#else -#ifndef __SYNTHESIS__ -#include -#endif -/// Proxy class, which allows bit selection to be used as both rvalue (for -/// reading) and lvalue (for writing) -template -struct af_bit_ref { -#ifdef _MSC_VER -#pragma warning(disable : 4521 4522) -#endif - typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; - ref_type& d_bv; - int d_index; - - public: - INLINE af_bit_ref( - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) - : d_bv(ref.d_bv), d_index(ref.d_index) { -#ifndef __SYNTHESIS__ - _AP_WARNING(d_index < 0, "Index of bit vector (%d) cannot be negative.", - d_index); - _AP_WARNING(d_index >= _AP_W, "Index of bit vector (%d) out of range (%d).", - d_index, _AP_W); -#endif - } - - INLINE af_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} - - INLINE af_bit_ref(const ref_type* bv, int index = 0) - : d_bv(*const_cast(bv)), d_index(index) {} - - /// convert operators. - INLINE operator bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } - - /// @name assign operators - // @{ - INLINE af_bit_ref& operator=(bool val) { - d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); - return *this; - } - - // Be explicit to prevent it from being deleted, as field d_bv - // is of reference type. 
- INLINE af_bit_ref& operator=(const af_bit_ref& val) { - return operator=(bool(val)); - } - - template - INLINE af_bit_ref& operator=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=(bool(val)); - } - - template - INLINE af_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { - return operator=(bool(val)); - } - - template - INLINE af_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { - return operator=(val != 0); - } - - template - INLINE af_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { - return operator=(ap_int_base<_AP_W2, false>(val)); - } - - template - INLINE af_bit_ref& operator=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=(ap_int_base<_AP_W2, false>(val)); - } - - template - INLINE af_bit_ref& operator=( - const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { - return operator=(ap_int_base<_AP_W2 + _AP_W3, false>(val)); - } - // @} - - /// @name concatenate operators - // @{ - template - INLINE ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(ap_int_base<_AP_W2, _AP_S2> &op) { - return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( - *this, op); - } - - template - INLINE ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( - const ap_bit_ref<_AP_W2, _AP_S2> &op) { - return ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >(*this, - op); - } - - template - INLINE ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > - operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { - return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( - *this, op); - } - - template - INLINE ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > - operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { - return ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, - op); - } - - template - INLINE ap_concat_ref< - 1, af_bit_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { - return ap_concat_ref< - 1, af_bit_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, - op); - } - - template - INLINE ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, - _AP_Q2, _AP_O2, _AP_N2> > - operator,( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { - return ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, - _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast&>( - op)); - } - // @} - - /// @name comparison - // @{ - template - INLINE bool operator==( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - return get() == op.get(); - } - - template - INLINE bool operator!=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - return get() != op.get(); - } - // @} - - INLINE bool operator~() const { - bool bit = _AP_ROOT_op_get_bit(d_bv.V, d_index); - return bit ? false : true; - } - - INLINE bool get() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } - - INLINE int length() const { return 1; } - -#ifndef __SYNTHESIS__ - std::string to_string() const { return get() ? 
"1" : "0"; } -#else - // XXX HLS will delete this in synthesis - INLINE char* to_string() const { return 0; } -#endif -}; // struct af_bit_ref - -// XXX apcc cannot handle global std::ios_base::Init() brought in by -#ifndef AP_AUTOCC -#ifndef __SYNTHESIS__ -template -INLINE std::ostream& operator<<( - std::ostream& os, - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { - os << x.to_string(); - return os; -} -#endif // ifndef __SYNTHESIS__ -#endif // ifndef AP_AUTOCC - -/// Range (slice) reference. -template -struct af_range_ref { -#ifdef _MSC_VER -#pragma warning(disable : 4521 4522) -#endif - typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; - ref_type& d_bv; - int l_index; - int h_index; - - public: - /// copy ctor - INLINE af_range_ref( - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) - : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} - - /// ctor from ap_fixed_base, higher and lower bound. - /** if h is less than l, the bits selected will be returned in reverse order. - */ - INLINE af_range_ref(ref_type* bv, int h, int l) - : d_bv(*bv), l_index(l), h_index(h) { -#ifndef __SYNTHESIS__ - _AP_WARNING(h < 0 || l < 0, - "Higher bound(%d) and lower(%d) bound cannot be negative.", h, - l); - _AP_WARNING(h >= _AP_W || l >= _AP_W, - "Higher bound(%d) or lower(%d) bound out of range.", h, l); - _AP_WARNING(h < l, "The bits selected will be returned in reverse order."); -#endif - } - - INLINE af_range_ref(const ref_type* bv, int h, int l) - : d_bv(*const_cast(bv)), l_index(l), h_index(h) { -#ifndef __SYNTHESIS__ - _AP_WARNING(h < 0 || l < 0, - "Higher bound(%d) and lower(%d) bound cannot be negative.", h, - l); - _AP_WARNING(h >= _AP_W || l >= _AP_W, - "Higher bound(%d) or lower(%d) bound out of range.", h, l); - _AP_WARNING(h < l, "The bits selected will be returned in reverse order."); -#endif - } - - /// @name assign operators - // @{ - -#define ASSIGN_CTYPE_TO_AF_RANGE(DATA_TYPE) \ - INLINE af_range_ref& operator=(const DATA_TYPE val) { \ - ap_int_base<_AP_W, false> loc(val); \ - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, loc.V); \ - return *this; \ - } - - ASSIGN_CTYPE_TO_AF_RANGE(bool) - ASSIGN_CTYPE_TO_AF_RANGE(char) - ASSIGN_CTYPE_TO_AF_RANGE(signed char) - ASSIGN_CTYPE_TO_AF_RANGE(unsigned char) - ASSIGN_CTYPE_TO_AF_RANGE(short) - ASSIGN_CTYPE_TO_AF_RANGE(unsigned short) - ASSIGN_CTYPE_TO_AF_RANGE(int) - ASSIGN_CTYPE_TO_AF_RANGE(unsigned int) - ASSIGN_CTYPE_TO_AF_RANGE(long) - ASSIGN_CTYPE_TO_AF_RANGE(unsigned long) - ASSIGN_CTYPE_TO_AF_RANGE(ap_slong) - ASSIGN_CTYPE_TO_AF_RANGE(ap_ulong) -#if _AP_ENABLE_HALF_ == 1 - ASSIGN_CTYPE_TO_AF_RANGE(half) -#endif - ASSIGN_CTYPE_TO_AF_RANGE(float) - ASSIGN_CTYPE_TO_AF_RANGE(double) -#undef ASSIGN_CTYPE_TO_AF_RANGE - - /// assgin using a string. XXX crucial for cosim. - INLINE af_range_ref& operator=(const char* val) { - const ap_int_base<_AP_W, false> tmp(val); // XXX figure out radix - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); - return *this; - } - - /// assign from ap_int_base. - // NOTE Base of other assgin operators. - template - INLINE af_range_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); - return *this; - } - - /// assign from range reference to ap_int_base. 
- template - INLINE af_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { - const ap_int_base<_AP_W2, false> tmp(val); - return operator=(tmp); - } - - /// assign from bit reference to ap_int_base.. - template - INLINE af_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { - const ap_int_base<1, false> tmp((bool)val); - return operator=(tmp); - } - - /// assgin from ap_fixed_base. - template - INLINE af_range_ref& operator=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& - val) { - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); - return *this; - } - - /// copy assgin. - // XXX This has to be explicit, otherwise it will be deleted, as d_bv is - // of reference type. - INLINE af_range_ref& operator=(const af_range_ref& val) { - ap_int_base<_AP_W, false> tmp(val); - return operator=(tmp); - } - - /// assign from range reference to ap_fixed_base. - template - INLINE af_range_ref& operator=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - ap_int_base<_AP_W2, false> tmp(val); - return operator=(tmp); - } - - /// assign from bit reference to ap_fixed_base. - template - INLINE af_range_ref& operator=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - ap_int_base<1, false> tmp((bool)val); - return operator=(tmp); - } - - /// assign from compound reference. - template - INLINE af_range_ref& operator=( - const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { - const ap_int_base<_AP_W2 + _AP_W3, false> tmp(val); - return operator=(tmp); - } - // @} - - /// @name comparison operators with ap_range_ref. - // @{ - template - INLINE bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> rop(op2); - return lop == rop; - } - - template - INLINE bool operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return !(operator==(op2)); - } - - template - INLINE bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> rop(op2); - return lop < rop; - } - - template - INLINE bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> rop(op2); - return lop > rop; - } - - template - INLINE bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return !(operator>(op2)); - } - - template - INLINE bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return !(operator<(op2)); - } - // @} - - /// @name comparison operators with af_range_ref. 
- // @{ - template - INLINE bool operator==( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> rop(op2); - return lop == rop; - } - - template - INLINE bool operator!=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { - return !(operator==(op2)); - } - - template - INLINE bool operator<( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> rop(op2); - return lop < rop; - } - - template - INLINE bool operator>( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> rop(op2); - return lop > rop; - } - - template - INLINE bool operator<=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { - return !(operator>(op2)); - } - - template - INLINE bool operator>=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { - return !(operator<(op2)); - } - // @} - - /// @name concatenate operators. - /// @{ - /// concatenate with ap_int_base. - template - INLINE - ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(ap_int_base<_AP_W2, _AP_S2> &op) { - return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >(*this, op); - } - - /// concatenate with ap_bit_ref. - template - INLINE ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > - operator,(const ap_bit_ref<_AP_W2, _AP_S2> &op) { - return ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( - *this, const_cast&>(op)); - } - - /// concatenate with ap_bit_ref. - template - INLINE ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > - operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { - return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, - ap_range_ref<_AP_W2, _AP_S2> >( - *this, const_cast&>(op)); - } - - /// concatenate with ap_concat_ref. - template - INLINE ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > - operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { - return ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( - *this, const_cast&>(op)); - } - - /// concatenate with another af_range_ref. - template - INLINE - ap_concat_ref<_AP_W, af_range_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> - &op) { - return ap_concat_ref< - _AP_W, af_range_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast&>( - op)); - } - - /// concatenate with another af_bit_ref. 
- template - INLINE - ap_concat_ref<_AP_W, af_range_ref, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { - return ap_concat_ref< - _AP_W, af_range_ref, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast&>( - op)); - } - // @} - - INLINE operator ap_ulong() const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret.to_uint64(); - } - - INLINE operator ap_int_base<_AP_W, false>() const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret; - } - - INLINE ap_int_base<_AP_W, false> to_ap_int_base() const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret; - } - - // used in ap_fixed_base::to_string() - INLINE char to_char() const { - return (char)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE int to_int() const { - return (int)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE unsigned to_uint() const { - return (unsigned)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE long to_long() const { - return (long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE unsigned long to_ulong() const { - return (unsigned long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE ap_slong to_int64() const { - return (ap_slong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE ap_ulong to_uint64() const { - return (ap_ulong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE ap_int_base<_AP_W, false> get() const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret; - } - - template - INLINE void set(const ap_int_base<_AP_W2, false>& val) { - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); - } - - INLINE int length() const { - return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1; - } - -#ifndef __SYNTHESIS__ - std::string to_string(signed char rd = 2) const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret.to_string(rd); - } -#else - // XXX HLS will delete this in synthesis - INLINE char* to_string(signed char rd = 2) const { - return 0; - } -#endif -}; // struct af_range_ref - -// XXX apcc cannot handle global std::ios_base::Init() brought in by -#ifndef AP_AUTOCC -#ifndef __SYNTHESIS__ -template -INLINE std::ostream& operator<<( - std::ostream& os, - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { - os << x.to_string(); - return os; -} -#endif -#endif // ifndef AP_AUTOCC - -#define AF_REF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE bool operator REL_OP( \ - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE op2) { \ - return ap_int_base<_AP_W, false>(op) \ - REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ - } \ - \ - template \ - INLINE bool operator REL_OP( \ - C_TYPE op2, \ - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return ap_int_base<_AP_W2, _AP_S2>(op2) \ - REL_OP ap_int_base<_AP_W, false>(op); \ - } \ - \ - template \ - INLINE bool operator REL_OP( \ - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - C_TYPE op2) { \ - return bool(op) REL_OP op2; \ - } \ - \ - template \ - INLINE bool operator REL_OP( \ - C_TYPE op2, \ - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return op2 REL_OP bool(op); \ - } - -#define AF_REF_REL_OPS_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ - AF_REF_REL_OP_WITH_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ - AF_REF_REL_OP_WITH_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ - AF_REF_REL_OP_WITH_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ - AF_REF_REL_OP_WITH_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ - AF_REF_REL_OP_WITH_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ - AF_REF_REL_OP_WITH_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) - -AF_REF_REL_OPS_WITH_INT(bool, 1, false) -AF_REF_REL_OPS_WITH_INT(char, 8, CHAR_IS_SIGNED) -AF_REF_REL_OPS_WITH_INT(signed char, 8, true) -AF_REF_REL_OPS_WITH_INT(unsigned char, 8, false) -AF_REF_REL_OPS_WITH_INT(short, _AP_SIZE_short, true) -AF_REF_REL_OPS_WITH_INT(unsigned short, _AP_SIZE_short, false) -AF_REF_REL_OPS_WITH_INT(int, _AP_SIZE_int, true) -AF_REF_REL_OPS_WITH_INT(unsigned int, _AP_SIZE_int, false) -AF_REF_REL_OPS_WITH_INT(long, _AP_SIZE_long, true) -AF_REF_REL_OPS_WITH_INT(unsigned long, _AP_SIZE_long, false) -AF_REF_REL_OPS_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) -AF_REF_REL_OPS_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef AF_REF_REL_OP_INT -#undef AF_REF_REL_OPS_WITH_INT - -#define AF_REF_REL_OP_WITH_AP_INT(REL_OP) \ - template \ - INLINE bool operator REL_OP( \ - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - const ap_int_base<_AP_W2, _AP_S>& op2) { \ - return ap_int_base<_AP_W, false>(op) REL_OP op2; \ - } \ - template \ - INLINE bool operator REL_OP( \ - const ap_int_base<_AP_W2, _AP_S2>& op2, \ - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ - return op2 REL_OP ap_int_base<_AP_W, false>(op); \ - } \ - template \ - INLINE bool operator REL_OP( \ - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - return ap_int_base<1, false>(op) REL_OP op2; \ - } \ - template \ - INLINE bool operator REL_OP( \ - const ap_int_base<_AP_W2, _AP_S2>& op2, \ - const af_bit_ref<_AP_W, _AP_I, _AP_S, 
_AP_Q, _AP_O, _AP_N>& op) { \ - return op2 REL_OP ap_int_base<1, false>(op); \ - } - -AF_REF_REL_OP_WITH_AP_INT(>) -AF_REF_REL_OP_WITH_AP_INT(<) -AF_REF_REL_OP_WITH_AP_INT(>=) -AF_REF_REL_OP_WITH_AP_INT(<=) -AF_REF_REL_OP_WITH_AP_INT(==) -AF_REF_REL_OP_WITH_AP_INT(!=) - -#endif // ifndef __cplusplus - -#endif // ifndef __AP_FIXED_REF_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_REF_H__ +#define __AP_FIXED_REF_H__ + +#ifndef __AP_FIXED_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" + +#else +#ifndef __SYNTHESIS__ +#include +#endif +/// Proxy class, which allows bit selection to be used as both rvalue (for +/// reading) and lvalue (for writing) +template +struct af_bit_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; + ref_type& d_bv; + int d_index; + + public: + INLINE af_bit_ref( + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) { +#ifndef __SYNTHESIS__ + _AP_WARNING(d_index < 0, "Index of bit vector (%d) cannot be negative.", + d_index); + _AP_WARNING(d_index >= _AP_W, "Index of bit vector (%d) out of range (%d).", + d_index, _AP_W); +#endif + } + + INLINE af_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} + + INLINE af_bit_ref(const ref_type* bv, int index = 0) + : d_bv(*const_cast(bv)), d_index(index) {} + + /// convert operators. + INLINE operator bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + /// @name assign operators + // @{ + INLINE af_bit_ref& operator=(bool val) { + d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); + return *this; + } + + // Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. 
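+  //
+  // Editorial sketch (an assumption, not part of the original header): user
+  // code reaches this proxy through ap_fixed's operator[], which makes a
+  // single bit readable and writable in place, e.g.
+  //
+  //   ap_fixed<8, 4> x = 1.5;
+  //   bool b = x[2];   // read: af_bit_ref converts to bool
+  //   x[2] = !b;       // write: af_bit_ref assigns through to x
+  //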
+ INLINE af_bit_ref& operator=(const af_bit_ref& val) { + return operator=(bool(val)); + } + + template + INLINE af_bit_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(bool(val)); + } + + template + INLINE af_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=(bool(val)); + } + + template + INLINE af_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + return operator=(val != 0); + } + + template + INLINE af_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=(ap_int_base<_AP_W2, false>(val)); + } + + template + INLINE af_bit_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(ap_int_base<_AP_W2, false>(val)); + } + + template + INLINE af_bit_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=(ap_int_base<_AP_W2 + _AP_W3, false>(val)); + } + // @} + + /// @name concatenate operators + // @{ + template + INLINE ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( + const ap_bit_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >(*this, + op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( + *this, op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { + return ap_concat_ref<1, af_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, + op); + } + + template + INLINE ap_concat_ref< + 1, af_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref< + 1, af_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, + op); + } + + template + INLINE ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref<1, af_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + // @} + + /// @name comparison + // @{ + template + INLINE bool operator==( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + return get() == op.get(); + } + + template + INLINE bool operator!=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + return get() != op.get(); + } + // @} + + INLINE bool operator~() const { + bool bit = _AP_ROOT_op_get_bit(d_bv.V, d_index); + return bit ? false : true; + } + + INLINE bool get() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + INLINE int length() const { return 1; } + +#ifndef __SYNTHESIS__ + std::string to_string() const { return get() ? 
"1" : "0"; } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string() const { return 0; } +#endif +}; // struct af_bit_ref + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<( + std::ostream& os, + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + os << x.to_string(); + return os; +} +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_AUTOCC + +/// Range (slice) reference. +template +struct af_range_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + typedef ap_fixed_base<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> ref_type; + ref_type& d_bv; + int l_index; + int h_index; + + public: + /// copy ctor + INLINE af_range_ref( + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + /// ctor from ap_fixed_base, higher and lower bound. + /** if h is less than l, the bits selected will be returned in reverse order. + */ + INLINE af_range_ref(ref_type* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) { +#ifndef __SYNTHESIS__ + _AP_WARNING(h < 0 || l < 0, + "Higher bound(%d) and lower(%d) bound cannot be negative.", h, + l); + _AP_WARNING(h >= _AP_W || l >= _AP_W, + "Higher bound(%d) or lower(%d) bound out of range.", h, l); + _AP_WARNING(h < l, "The bits selected will be returned in reverse order."); +#endif + } + + INLINE af_range_ref(const ref_type* bv, int h, int l) + : d_bv(*const_cast(bv)), l_index(l), h_index(h) { +#ifndef __SYNTHESIS__ + _AP_WARNING(h < 0 || l < 0, + "Higher bound(%d) and lower(%d) bound cannot be negative.", h, + l); + _AP_WARNING(h >= _AP_W || l >= _AP_W, + "Higher bound(%d) or lower(%d) bound out of range.", h, l); + _AP_WARNING(h < l, "The bits selected will be returned in reverse order."); +#endif + } + + /// @name assign operators + // @{ + +#define ASSIGN_CTYPE_TO_AF_RANGE(DATA_TYPE) \ + INLINE af_range_ref& operator=(const DATA_TYPE val) { \ + ap_int_base<_AP_W, false> loc(val); \ + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, loc.V); \ + return *this; \ + } + + ASSIGN_CTYPE_TO_AF_RANGE(bool) + ASSIGN_CTYPE_TO_AF_RANGE(char) + ASSIGN_CTYPE_TO_AF_RANGE(signed char) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned char) + ASSIGN_CTYPE_TO_AF_RANGE(short) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned short) + ASSIGN_CTYPE_TO_AF_RANGE(int) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned int) + ASSIGN_CTYPE_TO_AF_RANGE(long) + ASSIGN_CTYPE_TO_AF_RANGE(unsigned long) + ASSIGN_CTYPE_TO_AF_RANGE(ap_slong) + ASSIGN_CTYPE_TO_AF_RANGE(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_CTYPE_TO_AF_RANGE(half) +#endif + ASSIGN_CTYPE_TO_AF_RANGE(float) + ASSIGN_CTYPE_TO_AF_RANGE(double) +#undef ASSIGN_CTYPE_TO_AF_RANGE + + /// assgin using a string. XXX crucial for cosim. + INLINE af_range_ref& operator=(const char* val) { + const ap_int_base<_AP_W, false> tmp(val); // XXX figure out radix + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + return *this; + } + + /// assign from ap_int_base. + // NOTE Base of other assgin operators. + template + INLINE af_range_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + return *this; + } + + /// assign from range reference to ap_int_base. 
+ template + INLINE af_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + const ap_int_base<_AP_W2, false> tmp(val); + return operator=(tmp); + } + + /// assign from bit reference to ap_int_base.. + template + INLINE af_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + const ap_int_base<1, false> tmp((bool)val); + return operator=(tmp); + } + + /// assgin from ap_fixed_base. + template + INLINE af_range_ref& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + return *this; + } + + /// copy assgin. + // XXX This has to be explicit, otherwise it will be deleted, as d_bv is + // of reference type. + INLINE af_range_ref& operator=(const af_range_ref& val) { + ap_int_base<_AP_W, false> tmp(val); + return operator=(tmp); + } + + /// assign from range reference to ap_fixed_base. + template + INLINE af_range_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + ap_int_base<_AP_W2, false> tmp(val); + return operator=(tmp); + } + + /// assign from bit reference to ap_fixed_base. + template + INLINE af_range_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + ap_int_base<1, false> tmp((bool)val); + return operator=(tmp); + } + + /// assign from compound reference. + template + INLINE af_range_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + const ap_int_base<_AP_W2 + _AP_W3, false> tmp(val); + return operator=(tmp); + } + // @} + + /// @name comparison operators with ap_range_ref. + // @{ + template + INLINE bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop == rop; + } + + template + INLINE bool operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator==(op2)); + } + + template + INLINE bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop < rop; + } + + template + INLINE bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop > rop; + } + + template + INLINE bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator>(op2)); + } + + template + INLINE bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<(op2)); + } + // @} + + /// @name comparison operators with af_range_ref. 
+ // @{ + template + INLINE bool operator==( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop == rop; + } + + template + INLINE bool operator!=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator==(op2)); + } + + template + INLINE bool operator<( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop < rop; + } + + template + INLINE bool operator>( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> rop(op2); + return lop > rop; + } + + template + INLINE bool operator<=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator>(op2)); + } + + template + INLINE bool operator>=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op2) { + return !(operator<(op2)); + } + // @} + + /// @name concatenate operators. + /// @{ + /// concatenate with ap_int_base. + template + INLINE + ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, op); + } + + /// concatenate with ap_bit_ref. + template + INLINE ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + /// concatenate with ap_bit_ref. + template + INLINE ap_concat_ref<_AP_W, af_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + /// concatenate with ap_concat_ref. + template + INLINE ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &op) { + return ap_concat_ref<_AP_W, af_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(op)); + } + + /// concatenate with another af_range_ref. + template + INLINE + ap_concat_ref<_AP_W, af_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &op) { + return ap_concat_ref< + _AP_W, af_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + + /// concatenate with another af_bit_ref. 
+ template + INLINE + ap_concat_ref<_AP_W, af_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &op) { + return ap_concat_ref< + _AP_W, af_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + op)); + } + // @} + + INLINE operator ap_ulong() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret.to_uint64(); + } + + INLINE operator ap_int_base<_AP_W, false>() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + INLINE ap_int_base<_AP_W, false> to_ap_int_base() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + // used in ap_fixed_base::to_string() + INLINE char to_char() const { + return (char)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE int to_int() const { + return (int)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned to_uint() const { + return (unsigned)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE long to_long() const { + return (long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned long to_ulong() const { + return (unsigned long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_slong to_int64() const { + return (ap_slong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_ulong to_uint64() const { + return (ap_ulong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_int_base<_AP_W, false> get() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + template + INLINE void set(const ap_int_base<_AP_W2, false>& val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + } + + INLINE int length() const { + return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1; + } + +#ifndef __SYNTHESIS__ + std::string to_string(signed char rd = 2) const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret.to_string(rd); + } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string(signed char rd = 2) const { + return 0; + } +#endif +}; // struct af_range_ref + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<( + std::ostream& os, + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& x) { + os << x.to_string(); + return os; +} +#endif +#endif // ifndef AP_AUTOCC + +#define AF_REF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP( \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE op2) { \ + return ap_int_base<_AP_W, false>(op) \ + REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + C_TYPE op2, \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(op2) \ + REL_OP ap_int_base<_AP_W, false>(op); \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + C_TYPE op2) { \ + return bool(op) REL_OP op2; \ + } \ + \ + template \ + INLINE bool operator REL_OP( \ + C_TYPE op2, \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return op2 REL_OP bool(op); \ + } + +#define AF_REF_REL_OPS_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + AF_REF_REL_OP_WITH_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ + AF_REF_REL_OP_WITH_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) + +AF_REF_REL_OPS_WITH_INT(bool, 1, false) +AF_REF_REL_OPS_WITH_INT(char, 8, CHAR_IS_SIGNED) +AF_REF_REL_OPS_WITH_INT(signed char, 8, true) +AF_REF_REL_OPS_WITH_INT(unsigned char, 8, false) +AF_REF_REL_OPS_WITH_INT(short, _AP_SIZE_short, true) +AF_REF_REL_OPS_WITH_INT(unsigned short, _AP_SIZE_short, false) +AF_REF_REL_OPS_WITH_INT(int, _AP_SIZE_int, true) +AF_REF_REL_OPS_WITH_INT(unsigned int, _AP_SIZE_int, false) +AF_REF_REL_OPS_WITH_INT(long, _AP_SIZE_long, true) +AF_REF_REL_OPS_WITH_INT(unsigned long, _AP_SIZE_long, false) +AF_REF_REL_OPS_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +AF_REF_REL_OPS_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef AF_REF_REL_OP_INT +#undef AF_REF_REL_OPS_WITH_INT + +#define AF_REF_REL_OP_WITH_AP_INT(REL_OP) \ + template \ + INLINE bool operator REL_OP( \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S>& op2) { \ + return ap_int_base<_AP_W, false>(op) REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& op2, \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op) { \ + return op2 REL_OP ap_int_base<_AP_W, false>(op); \ + } \ + template \ + INLINE bool operator REL_OP( \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<1, false>(op) REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W2, _AP_S2>& op2, \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, 
_AP_Q, _AP_O, _AP_N>& op) { \ + return op2 REL_OP ap_int_base<1, false>(op); \ + } + +AF_REF_REL_OP_WITH_AP_INT(>) +AF_REF_REL_OP_WITH_AP_INT(<) +AF_REF_REL_OP_WITH_AP_INT(>=) +AF_REF_REL_OP_WITH_AP_INT(<=) +AF_REF_REL_OP_WITH_AP_INT(==) +AF_REF_REL_OP_WITH_AP_INT(!=) + +#endif // ifndef __cplusplus + +#endif // ifndef __AP_FIXED_REF_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_fixed_special.h b/hls4ml/templates/vivado/ap_types/ap_fixed_special.h index 5c09f247bd..0f7a9f7eb3 100644 --- a/hls4ml/templates/vivado/ap_types/ap_fixed_special.h +++ b/hls4ml/templates/vivado/ap_types/ap_fixed_special.h @@ -1,230 +1,230 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_FIXED_SPECIAL_H__ -#define __AP_FIXED_SPECIAL_H__ - -#ifndef __AP_FIXED_H__ -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." -#endif - -#ifndef __SYNTHESIS__ -#include -#include -#endif -// FIXME AP_AUTOCC cannot handle many standard headers, so declare instead of -// include. -// #include -namespace std { -template class complex; -} - -/* - TODO: Modernize the code using C++11/C++14 - 1. constexpr http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0415r0.html - 2. move constructor -*/ - -namespace std { -/* - Specialize std::complex to zero initialization ap_fixed. - - To reduce the area cost, ap_fixed is not zero initialized, just like basic - types float or double. However, libstdc++ provides specialization for float, - double and long double, initializing image part to 0 when not specified. - - This has become a difficulty in switching legacy code from these C types to - ap_fixed. To ease the tranform of legacy code, we have to implement - specialization of std::complex<> for our type. - - As ap_fixed is a template, it is impossible to specialize only the methods - that causes default initialization of value type in std::complex<>. An - explicit full specialization of the template class has to be done, covering - all the member functions and operators of std::complex<> as specified - in standard 26.2.4 and 26.2.5. -*/ -template -class complex > { - public: - typedef ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> _Tp; - typedef _Tp value_type; - - // 26.2.4/1 - // Constructor without argument - // Default initialize, so that in dataflow, the variable is only written once. - complex() : _M_real(_Tp()), _M_imag(_Tp()) {} - // Constructor with ap_fixed. 
- // Zero initialize image part when not specified, so that `C(1) == C(1,0)` - complex(const _Tp &__r, const _Tp &__i = _Tp(0)) - : _M_real(__r), _M_imag(__i) {} - - // Constructor with another complex number - template - complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {} - -#if __cplusplus >= 201103L - const _Tp& real() const { return _M_real; } - const _Tp& imag() const { return _M_imag; } -#else - _Tp& real() { return _M_real; } - const _Tp& real() const { return _M_real; } - _Tp& imag() { return _M_imag; } - const _Tp& imag() const { return _M_imag; } -#endif - - void real(_Tp __val) { _M_real = __val; } - - void imag(_Tp __val) { _M_imag = __val; } - - // Assign this complex number with ap_fixed. - // Zero initialize image poarrt, so that `C c; c = 1; c == C(1,0);` - complex<_Tp> &operator=(const _Tp __t) { - _M_real = __t; - _M_imag = _Tp(0); - return *this; - } - - // 26.2.5/1 - // Add ap_fixed to this complex number. - complex<_Tp> &operator+=(const _Tp &__t) { - _M_real += __t; - return *this; - } - - // 26.2.5/3 - // Subtract ap_fixed from this complex number. - complex<_Tp> &operator-=(const _Tp &__t) { - _M_real -= __t; - return *this; - } - - // 26.2.5/5 - // Multiply this complex number by ap_fixed. - complex<_Tp> &operator*=(const _Tp &__t) { - _M_real *= __t; - _M_imag *= __t; - return *this; - } - - // 26.2.5/7 - // Divide this complex number by ap_fixed. - complex<_Tp> &operator/=(const _Tp &__t) { - _M_real /= __t; - _M_imag /= __t; - return *this; - } - - // Assign complex number to this complex number. - template - complex<_Tp> &operator=(const complex<_Up> &__z) { - _M_real = __z.real(); - _M_imag = __z.imag(); - return *this; - } - - // 26.2.5/9 - // Add complex number to this. - template - complex<_Tp> &operator+=(const complex<_Up> &__z) { - _M_real += __z.real(); - _M_imag += __z.imag(); - return *this; - } - - // 26.2.5/11 - // Subtract complex number from this. - template - complex<_Tp> &operator-=(const complex<_Up> &__z) { - _M_real -= __z.real(); - _M_imag -= __z.imag(); - return *this; - } - - // 26.2.5/13 - // Multiply this by complex number. - template - complex<_Tp> &operator*=(const complex<_Up> &__z) { - const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag(); - _M_imag = _M_real * __z.imag() + _M_imag * __z.real(); - _M_real = __r; - return *this; - } - - // 26.2.5/15 - // Divide this by complex number. - template - complex<_Tp> &operator/=(const complex<_Up> &__z) { - complex<_Tp> cj (__z.real(), -__z.imag()); - complex<_Tp> a = (*this) * cj; - complex<_Tp> b = cj * __z; - _M_real = a.real() / b.real(); - _M_imag = a.imag() / b.real(); - return *this; - } - - private: - _Tp _M_real; - _Tp _M_imag; - -}; // class complex > - -/* - Non-member operations - These operations are not required by standard in 26.2.6, but libstdc++ - defines them for - float, double or long double's specialization. -*/ -// Compare complex number with ap_fixed. -template -inline bool operator==( - const complex > &__x, - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { - return __x.real() == __y && - __x.imag() == 0; -} - -// Compare ap_fixed with complex number. -template -inline bool operator==( - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, - const complex > &__y) { - return __x == __y.real() && - 0 == __y.imag(); -} - -// Compare complex number with ap_fixed. 
-template -inline bool operator!=( - const complex > &__x, - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { - return __x.real() != __y || - __x.imag() != 0; -} - -// Compare ap_fixed with complex number. -template -inline bool operator!=( - const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, - const complex > &__y) { - return __x != __y.real() || - 0 != __y.imag(); -} - -} // namespace std - -#endif // ifndef __AP_FIXED_SPECIAL_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_FIXED_SPECIAL_H__ +#define __AP_FIXED_SPECIAL_H__ + +#ifndef __AP_FIXED_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __SYNTHESIS__ +#include +#include +#endif +// FIXME AP_AUTOCC cannot handle many standard headers, so declare instead of +// include. +// #include +namespace std { +template class complex; +} + +/* + TODO: Modernize the code using C++11/C++14 + 1. constexpr http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0415r0.html + 2. move constructor +*/ + +namespace std { +/* + Specialize std::complex to zero initialization ap_fixed. + + To reduce the area cost, ap_fixed is not zero initialized, just like basic + types float or double. However, libstdc++ provides specialization for float, + double and long double, initializing image part to 0 when not specified. + + This has become a difficulty in switching legacy code from these C types to + ap_fixed. To ease the tranform of legacy code, we have to implement + specialization of std::complex<> for our type. + + As ap_fixed is a template, it is impossible to specialize only the methods + that causes default initialization of value type in std::complex<>. An + explicit full specialization of the template class has to be done, covering + all the member functions and operators of std::complex<> as specified + in standard 26.2.4 and 26.2.5. +*/ +template +class complex > { + public: + typedef ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> _Tp; + typedef _Tp value_type; + + // 26.2.4/1 + // Constructor without argument + // Default initialize, so that in dataflow, the variable is only written once. + complex() : _M_real(_Tp()), _M_imag(_Tp()) {} + // Constructor with ap_fixed. + // Zero initialize image part when not specified, so that `C(1) == C(1,0)` + complex(const _Tp &__r, const _Tp &__i = _Tp(0)) + : _M_real(__r), _M_imag(__i) {} + + // Constructor with another complex number + template + complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {} + +#if __cplusplus >= 201103L + const _Tp& real() const { return _M_real; } + const _Tp& imag() const { return _M_imag; } +#else + _Tp& real() { return _M_real; } + const _Tp& real() const { return _M_real; } + _Tp& imag() { return _M_imag; } + const _Tp& imag() const { return _M_imag; } +#endif + + void real(_Tp __val) { _M_real = __val; } + + void imag(_Tp __val) { _M_imag = __val; } + + // Assign this complex number with ap_fixed. 
+ // Zero initialize image poarrt, so that `C c; c = 1; c == C(1,0);` + complex<_Tp> &operator=(const _Tp __t) { + _M_real = __t; + _M_imag = _Tp(0); + return *this; + } + + // 26.2.5/1 + // Add ap_fixed to this complex number. + complex<_Tp> &operator+=(const _Tp &__t) { + _M_real += __t; + return *this; + } + + // 26.2.5/3 + // Subtract ap_fixed from this complex number. + complex<_Tp> &operator-=(const _Tp &__t) { + _M_real -= __t; + return *this; + } + + // 26.2.5/5 + // Multiply this complex number by ap_fixed. + complex<_Tp> &operator*=(const _Tp &__t) { + _M_real *= __t; + _M_imag *= __t; + return *this; + } + + // 26.2.5/7 + // Divide this complex number by ap_fixed. + complex<_Tp> &operator/=(const _Tp &__t) { + _M_real /= __t; + _M_imag /= __t; + return *this; + } + + // Assign complex number to this complex number. + template + complex<_Tp> &operator=(const complex<_Up> &__z) { + _M_real = __z.real(); + _M_imag = __z.imag(); + return *this; + } + + // 26.2.5/9 + // Add complex number to this. + template + complex<_Tp> &operator+=(const complex<_Up> &__z) { + _M_real += __z.real(); + _M_imag += __z.imag(); + return *this; + } + + // 26.2.5/11 + // Subtract complex number from this. + template + complex<_Tp> &operator-=(const complex<_Up> &__z) { + _M_real -= __z.real(); + _M_imag -= __z.imag(); + return *this; + } + + // 26.2.5/13 + // Multiply this by complex number. + template + complex<_Tp> &operator*=(const complex<_Up> &__z) { + const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag(); + _M_imag = _M_real * __z.imag() + _M_imag * __z.real(); + _M_real = __r; + return *this; + } + + // 26.2.5/15 + // Divide this by complex number. + template + complex<_Tp> &operator/=(const complex<_Up> &__z) { + complex<_Tp> cj (__z.real(), -__z.imag()); + complex<_Tp> a = (*this) * cj; + complex<_Tp> b = cj * __z; + _M_real = a.real() / b.real(); + _M_imag = a.imag() / b.real(); + return *this; + } + + private: + _Tp _M_real; + _Tp _M_imag; + +}; // class complex > + +/* + Non-member operations + These operations are not required by standard in 26.2.6, but libstdc++ + defines them for + float, double or long double's specialization. +*/ +// Compare complex number with ap_fixed. +template +inline bool operator==( + const complex > &__x, + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { + return __x.real() == __y && + __x.imag() == 0; +} + +// Compare ap_fixed with complex number. +template +inline bool operator==( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, + const complex > &__y) { + return __x == __y.real() && + 0 == __y.imag(); +} + +// Compare complex number with ap_fixed. +template +inline bool operator!=( + const complex > &__x, + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__y) { + return __x.real() != __y || + __x.imag() != 0; +} + +// Compare ap_fixed with complex number. +template +inline bool operator!=( + const ap_fixed<_AP_W, _AP_I, _AP_Q, _AP_O, _AP_N> &__x, + const complex > &__y) { + return __x != __y.real() || + 0 != __y.imag(); +} + +} // namespace std + +#endif // ifndef __AP_FIXED_SPECIAL_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_int.h b/hls4ml/templates/vivado/ap_types/ap_int.h index d103795b46..db3044d48c 100644 --- a/hls4ml/templates/vivado/ap_types/ap_int.h +++ b/hls4ml/templates/vivado/ap_types/ap_int.h @@ -1,330 +1,330 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_INT_H__ -#define __AP_INT_H__ - -#include -#include -#include - -//--------------------------------------------------------------- - -/// Sign Arbitrary Precision Type. -template -struct ap_int : ap_int_base<_AP_W, true> { - typedef ap_int_base<_AP_W, true> Base; - // Constructor - INLINE ap_int() : Base() {} - - // Copy ctor - INLINE ap_int(const ap_int& op) { Base::V = op.V; } - - template - INLINE ap_int(const ap_int<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_int(const volatile ap_int<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_int(const ap_uint<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_int(const volatile ap_uint<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_int(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} - - template - INLINE ap_int(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} - - template - INLINE ap_int(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) - : Base(ref) {} - - template - INLINE ap_int(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} - - template - INLINE ap_int(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { - } - - template - INLINE ap_int( - const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} - - template - INLINE ap_int( - const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { - } - - template - INLINE ap_int(const ap_int_base<_AP_W2, _AP_S2>& op) { - Base::V = op.V; - } - - template - INLINE ap_int( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - - template - INLINE ap_int( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - - template - INLINE ap_int( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - -#define CTOR(TYPE) \ - INLINE ap_int(TYPE val) { Base::V = val; } - CTOR(bool) - CTOR(char) - CTOR(signed char) - CTOR(unsigned char) - CTOR(short) - CTOR(unsigned short) - CTOR(int) - CTOR(unsigned int) - CTOR(long) - CTOR(unsigned long) - CTOR(ap_slong) - CTOR(ap_ulong) -#undef CTOR - ap_int(double val) : Base(val) {} - ap_int(float val) : Base(val) {} -#if _AP_ENABLE_HALF_ == 1 - ap_int(half val) : Base(val) {} -#endif - - // ap_int_base will guess radix if radix is not provided. - INLINE ap_int(const char* s) : Base(s) {} - - INLINE ap_int(const char* s, signed char rd) : Base(s, rd) {} - - // Assignment - /* ctor will be used when right is not of proper type. */ - - INLINE ap_int& operator=(const ap_int<_AP_W>& op2) { - Base::V = op2.V; - return *this; - } - - /* cannot bind volatile reference to non-volatile type. 
*/ - INLINE ap_int& operator=(const volatile ap_int<_AP_W>& op2) { - Base::V = op2.V; - return *this; - } - - /* cannot return volatile *this. */ - INLINE void operator=(const ap_int<_AP_W>& op2) volatile { Base::V = op2.V; } - - INLINE void operator=(const volatile ap_int<_AP_W>& op2) volatile { - Base::V = op2.V; - } - -}; // struct ap_int. - -//--------------------------------------------------------------- - -/// Unsigned Arbitrary Precision Type. -template -struct ap_uint : ap_int_base<_AP_W, false> { - typedef ap_int_base<_AP_W, false> Base; - // Constructor - INLINE ap_uint() : Base() {} - - // Copy ctor - INLINE ap_uint(const ap_uint& op) { Base::V = op.V; } - - template - INLINE ap_uint(const ap_uint<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_uint(const ap_int<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_uint(const volatile ap_uint<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_uint(const volatile ap_int<_AP_W2>& op) { - Base::V = op.V; - } - - template - INLINE ap_uint(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} - - template - INLINE ap_uint(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} - - template - INLINE ap_uint(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) - : Base(ref) {} - - template - INLINE ap_uint(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} - - template - INLINE ap_uint(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { - } - - template - INLINE ap_uint( - const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} - - template - INLINE ap_uint( - const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { - } - - template - INLINE ap_uint(const ap_int_base<_AP_W2, _AP_S2>& op) { - Base::V = op.V; - } - - template - INLINE ap_uint( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - - template - INLINE ap_uint( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - - template - INLINE ap_uint( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) - : Base(op) {} - -#define CTOR(TYPE) \ - INLINE ap_uint(TYPE val) { Base::V = val; } - CTOR(bool) - CTOR(char) - CTOR(signed char) - CTOR(unsigned char) - CTOR(short) - CTOR(unsigned short) - CTOR(int) - CTOR(unsigned int) - CTOR(long) - CTOR(unsigned long) - CTOR(ap_slong) - CTOR(ap_ulong) -#undef CTOR - ap_uint(double val) : Base(val) {} - ap_uint(float val) : Base(val) {} -#if _AP_ENABLE_HALF_ == 1 - ap_uint(half val) : Base(val) {} -#endif - - // ap_int_base will guess radix if radix is not provided. - INLINE ap_uint(const char* s) : Base(s) {} - - INLINE ap_uint(const char* s, signed char rd) : Base(s, rd) {} - - // Assignment - /* XXX ctor will be used when right is not of proper type. */ - - INLINE ap_uint& operator=(const ap_uint<_AP_W>& op2) { - Base::V = op2.V; - return *this; - } - - /* cannot bind volatile reference to non-volatile type. */ - INLINE ap_uint& operator=(const volatile ap_uint<_AP_W>& op2) { - Base::V = op2.V; - return *this; - } - - /* cannot return volatile *this. 
*/ - INLINE void operator=(const ap_uint<_AP_W>& op2) volatile { Base::V = op2.V; } - - INLINE void operator=(const volatile ap_uint<_AP_W>& op2) volatile { - Base::V = op2.V; - } - -}; // struct ap_uint. - -#define ap_bigint ap_int -#define ap_biguint ap_uint - -#if !defined(__SYNTHESIS__) && (defined(SYSTEMC_H) || defined(SYSTEMC_INCLUDED)) -// XXX sc_trace overload for ap_fixed is already included in -// "ap_sysc/ap_sc_extras.h", so do not define in synthesis. -template -INLINE void sc_trace(sc_core::sc_trace_file* tf, const ap_int<_AP_W>& op, - const std::string& name) { - if (tf) tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); -} - -template -INLINE void sc_trace(sc_core::sc_trace_file* tf, const ap_uint<_AP_W>& op, - const std::string& name) { - if (tf) tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); -} -#endif // System C sim - -#include - -#endif // ifndef __AP_INT_H__ else - -// FIXME user should include ap_fixed.h when using ap_fixed. -// to avoid circular inclusion, must check whether this is required by -// ap_fixed.h -#ifndef __AP_FIXED_H__ -#include -#endif - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_INT_H__ +#define __AP_INT_H__ + +#include +#include +#include + +//--------------------------------------------------------------- + +/// Sign Arbitrary Precision Type. 
+template +struct ap_int : ap_int_base<_AP_W, true> { + typedef ap_int_base<_AP_W, true> Base; + // Constructor + INLINE ap_int() : Base() {} + + // Copy ctor + INLINE ap_int(const ap_int& op) { Base::V = op.V; } + + template + INLINE ap_int(const ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const volatile ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const volatile ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_int(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_int(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) + : Base(ref) {} + + template + INLINE ap_int(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_int(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_int( + const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_int( + const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_int(const ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + + template + INLINE ap_int( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_int( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_int( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +#define CTOR(TYPE) \ + INLINE ap_int(TYPE val) { Base::V = val; } + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#undef CTOR + ap_int(double val) : Base(val) {} + ap_int(float val) : Base(val) {} +#if _AP_ENABLE_HALF_ == 1 + ap_int(half val) : Base(val) {} +#endif + + // ap_int_base will guess radix if radix is not provided. + INLINE ap_int(const char* s) : Base(s) {} + + INLINE ap_int(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + /* ctor will be used when right is not of proper type. */ + + INLINE ap_int& operator=(const ap_int<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot bind volatile reference to non-volatile type. */ + INLINE ap_int& operator=(const volatile ap_int<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot return volatile *this. */ + INLINE void operator=(const ap_int<_AP_W>& op2) volatile { Base::V = op2.V; } + + INLINE void operator=(const volatile ap_int<_AP_W>& op2) volatile { + Base::V = op2.V; + } + +}; // struct ap_int. + +//--------------------------------------------------------------- + +/// Unsigned Arbitrary Precision Type. 
+template +struct ap_uint : ap_int_base<_AP_W, false> { + typedef ap_int_base<_AP_W, false> Base; + // Constructor + INLINE ap_uint() : Base() {} + + // Copy ctor + INLINE ap_uint(const ap_uint& op) { Base::V = op.V; } + + template + INLINE ap_uint(const ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const volatile ap_uint<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const volatile ap_int<_AP_W2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint(const ap_range_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_uint(const ap_bit_ref<_AP_W2, _AP_S2>& ref) : Base(ref) {} + + template + INLINE ap_uint(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) + : Base(ref) {} + + template + INLINE ap_uint(const ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_uint(const ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_uint( + const volatile ap_fixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, true, _AP_Q2, _AP_O2, _AP_N2>)op) {} + + template + INLINE ap_uint( + const volatile ap_ufixed<_AP_W2, _AP_I2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base((ap_fixed_base<_AP_W2, _AP_I2, false, _AP_Q2, _AP_O2, _AP_N2>)op) { + } + + template + INLINE ap_uint(const ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + + template + INLINE ap_uint( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_uint( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + + template + INLINE ap_uint( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) + : Base(op) {} + +#define CTOR(TYPE) \ + INLINE ap_uint(TYPE val) { Base::V = val; } + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#undef CTOR + ap_uint(double val) : Base(val) {} + ap_uint(float val) : Base(val) {} +#if _AP_ENABLE_HALF_ == 1 + ap_uint(half val) : Base(val) {} +#endif + + // ap_int_base will guess radix if radix is not provided. + INLINE ap_uint(const char* s) : Base(s) {} + + INLINE ap_uint(const char* s, signed char rd) : Base(s, rd) {} + + // Assignment + /* XXX ctor will be used when right is not of proper type. */ + + INLINE ap_uint& operator=(const ap_uint<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot bind volatile reference to non-volatile type. */ + INLINE ap_uint& operator=(const volatile ap_uint<_AP_W>& op2) { + Base::V = op2.V; + return *this; + } + + /* cannot return volatile *this. */ + INLINE void operator=(const ap_uint<_AP_W>& op2) volatile { Base::V = op2.V; } + + INLINE void operator=(const volatile ap_uint<_AP_W>& op2) volatile { + Base::V = op2.V; + } + +}; // struct ap_uint. + +#define ap_bigint ap_int +#define ap_biguint ap_uint + +#if !defined(__SYNTHESIS__) && (defined(SYSTEMC_H) || defined(SYSTEMC_INCLUDED)) +// XXX sc_trace overload for ap_fixed is already included in +// "ap_sysc/ap_sc_extras.h", so do not define in synthesis. 
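+//
+// Editorial sketch (an assumption, not part of the original header): these
+// overloads let a SystemC co-simulation dump ap_int/ap_uint signals to a
+// waveform file, e.g.
+//
+//   sc_core::sc_trace_file* tf = sc_core::sc_create_vcd_trace_file("wave");
+//   ap_uint<8> sig = 42;
+//   sc_trace(tf, sig, "sig");   // recorded as an 8-bit logic vector
+//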
+template +INLINE void sc_trace(sc_core::sc_trace_file* tf, const ap_int<_AP_W>& op, + const std::string& name) { + if (tf) tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} + +template +INLINE void sc_trace(sc_core::sc_trace_file* tf, const ap_uint<_AP_W>& op, + const std::string& name) { + if (tf) tf->trace(sc_dt::sc_lv<_AP_W>(op.to_string(2).c_str()), name); +} +#endif // System C sim + +#include + +#endif // ifndef __AP_INT_H__ else + +// FIXME user should include ap_fixed.h when using ap_fixed. +// to avoid circular inclusion, must check whether this is required by +// ap_fixed.h +#ifndef __AP_FIXED_H__ +#include +#endif + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_int_base.h b/hls4ml/templates/vivado/ap_types/ap_int_base.h index bb7e286ab6..091552a881 100644 --- a/hls4ml/templates/vivado/ap_types/ap_int_base.h +++ b/hls4ml/templates/vivado/ap_types/ap_int_base.h @@ -1,1885 +1,1885 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_INT_BASE_H__ -#define __AP_INT_BASE_H__ - -#ifndef __AP_INT_H__ -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." -#endif - -#ifndef __cplusplus -#error "C++ is required to include this header file" -#else - -#include -#ifndef __SYNTHESIS__ -#if _AP_ENABLE_HALF_ == 1 -#include -#endif -#include -#include -#endif - -/* ---------------------------------------------------------------- - * ap_int_base: AutoPilot integer/Arbitrary precision integer. - * ---------------------------------------------------------------- - */ - -/* helper trait. Selecting the smallest C type that can hold the value, - * return 64 bit C type if not possible. - */ -template -struct retval; - -// at least 64 bit -template -struct retval<_AP_N, true> { - typedef ap_slong Type; -}; - -template -struct retval<_AP_N, false> { - typedef ap_ulong Type; -}; - -// at least 8 bit -template <> -struct retval<1, true> { - typedef signed char Type; -}; - -template <> -struct retval<1, false> { - typedef unsigned char Type; -}; - -// at least 16 bit -template <> -struct retval<2, true> { - typedef short Type; -}; - -template <> -struct retval<2, false> { - typedef unsigned short Type; -}; - -// at least 32 bit -template <> -struct retval<3, true> { - typedef long Type; -}; - -template <> -struct retval<3, false> { - typedef unsigned long Type; -}; - -template <> -struct retval<4, true> { - typedef long Type; -}; - -template <> -struct retval<4, false> { - typedef unsigned long Type; -}; - -// trait for letting base class to return derived class. -// Notice that derived class template is incomplete, and we cannot use -// the member of the derived class. 
-template -struct _ap_int_factory; -template -struct _ap_int_factory<_AP_W2,true> { typedef ap_int<_AP_W2> type; }; -template -struct _ap_int_factory<_AP_W2,false> { typedef ap_uint<_AP_W2> type; }; - -template -struct ap_int_base : public _AP_ROOT_TYPE<_AP_W, _AP_S> { - public: - typedef _AP_ROOT_TYPE<_AP_W, _AP_S> Base; - - /* ap_int_base<_AP_W, _AP_S, true> - * typedef typename retval<(_AP_W + 7) / 8, _AP_S>::Type RetType; - * - * ap_int_base<_AP_W, _AP_S, false> - * typedef typename retval<8, _AP_S>::Type RetType; - */ - typedef typename retval::Type RetType; - - static const int width = _AP_W; - - template - struct RType { - enum { - mult_w = _AP_W + _AP_W2, - mult_s = _AP_S || _AP_S2, - plus_w = - AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, - plus_s = _AP_S || _AP_S2, - minus_w = - AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, - minus_s = true, - div_w = _AP_W + _AP_S2, - div_s = _AP_S || _AP_S2, - mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)), - mod_s = _AP_S, - logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)), - logic_s = _AP_S || _AP_S2 - }; - - - typedef ap_int_base mult_base; - typedef ap_int_base plus_base; - typedef ap_int_base minus_base; - typedef ap_int_base logic_base; - typedef ap_int_base div_base; - typedef ap_int_base mod_base; - typedef ap_int_base<_AP_W, _AP_S> arg1_base; - - typedef typename _ap_int_factory::type mult; - typedef typename _ap_int_factory::type plus; - typedef typename _ap_int_factory::type minus; - typedef typename _ap_int_factory::type logic; - typedef typename _ap_int_factory::type div; - typedef typename _ap_int_factory::type mod; - typedef typename _ap_int_factory<_AP_W, _AP_S>::type arg1; - typedef bool reduce; - }; - - /* Constructors. - * ---------------------------------------------------------------- - */ - /// default ctor - INLINE ap_int_base() { - /* - #ifdef __SC_COMPATIBLE__ - Base::V = 0; - #endif - */ - } - - /// copy ctor - template - INLINE ap_int_base(const ap_int_base<_AP_W2, _AP_S2>& op) { - Base::V = op.V; - } - - /// volatile copy ctor - template - INLINE ap_int_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) { - Base::V = op.V; - } - -// XXX C++11 feature. -// The explicit specifier specifies that a constructor or conversion function -// (since C++11) doesn't allow implicit conversions or copy-initialization. -// ap_int_base x = 1; -// ap_int_base foo() { return 1; } -// but allows -// ap_int_base x(1); -// ap_int_base y {1}; - -/// from all c types. -#define CTOR_FROM_INT(Type, Size, Signed) \ - INLINE ap_int_base(const Type op) { Base::V = op; } - - CTOR_FROM_INT(bool, 1, false) - CTOR_FROM_INT(char, 8, CHAR_IS_SIGNED) - CTOR_FROM_INT(signed char, 8, true) - CTOR_FROM_INT(unsigned char, 8, false) - CTOR_FROM_INT(short, _AP_SIZE_short, true) - CTOR_FROM_INT(unsigned short, _AP_SIZE_short, false) - CTOR_FROM_INT(int, _AP_SIZE_int, true) - CTOR_FROM_INT(unsigned int, _AP_SIZE_int, false) - CTOR_FROM_INT(long, _AP_SIZE_long, true) - CTOR_FROM_INT(unsigned long, _AP_SIZE_long, false) - CTOR_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) - CTOR_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) -#undef CTOR_FROM_INT - -#if _AP_ENABLE_HALF_ == 1 - /// ctor from half. - // TODO optimize - INLINE ap_int_base(half op) { - ap_int_base<_AP_W, _AP_S> t((float)op); - Base::V = t.V; - } -#endif - - /// ctor from float. 
- INLINE ap_int_base(float op) { - const int BITS = FLOAT_MAN + FLOAT_EXP + 1; - ap_int_base reg; - reg.V = floatToRawBits(op); - bool is_neg = _AP_ROOT_op_get_bit(reg.V, BITS - 1); - - ap_int_base exp = 0; - exp.V = _AP_ROOT_op_get_range(reg.V, FLOAT_MAN, BITS - 2); - exp = exp - FLOAT_BIAS; - - ap_int_base man; - man.V = _AP_ROOT_op_get_range(reg.V, 0, FLOAT_MAN - 1); - // check for NaN - _AP_WARNING(exp == ((unsigned char)(FLOAT_BIAS + 1)) && man.V != 0, - "assign NaN to ap integer value"); - // set leading 1. - man.V = _AP_ROOT_op_set_bit(man.V, FLOAT_MAN, 1); - //if (is_neg) man = -man; - - if ((reg.V & 0x7ffffffful) == 0) { - Base::V = 0; - } else { - int sh_amt = FLOAT_MAN - exp.V; - if (sh_amt == 0) { - Base::V = man.V; - } else if (sh_amt > 0) { - if (sh_amt < FLOAT_MAN + 2) { - Base::V = man.V >> sh_amt; - } else { - if (is_neg) - Base::V = -1; - else - Base::V = 0; - } - } else { - sh_amt = -sh_amt; - if (sh_amt < _AP_W) { - Base::V = man.V; - Base::V <<= sh_amt; - } else { - Base::V = 0; - } - } - } - if (is_neg) *this = -(*this); - } - - /// ctor from double. - INLINE ap_int_base(double op) { - const int BITS = DOUBLE_MAN + DOUBLE_EXP + 1; - ap_int_base reg; - reg.V = doubleToRawBits(op); - bool is_neg = _AP_ROOT_op_get_bit(reg.V, BITS - 1); - - ap_int_base exp = 0; - exp.V = _AP_ROOT_op_get_range(reg.V, DOUBLE_MAN, BITS - 2); - exp = exp - DOUBLE_BIAS; - - ap_int_base man; - man.V = _AP_ROOT_op_get_range(reg.V, 0, DOUBLE_MAN - 1); - // check for NaN - _AP_WARNING(exp == ((unsigned char)(DOUBLE_BIAS + 1)) && man.V != 0, - "assign NaN to ap integer value"); - // set leading 1. - man.V = _AP_ROOT_op_set_bit(man.V, DOUBLE_MAN, 1); - //if (is_neg) man = -man; - - if ((reg.V & 0x7fffffffffffffffull) == 0) { - Base::V = 0; - } else { - int sh_amt = DOUBLE_MAN - exp.V; - if (sh_amt == 0) { - Base::V = man.V; - } else if (sh_amt > 0) { - if (sh_amt < DOUBLE_MAN + 2) { - Base::V = man.V >> sh_amt; - } else { - if (is_neg) - Base::V = -1; - else - Base::V = 0; - } - } else { - sh_amt = -sh_amt; - if (sh_amt < _AP_W) { - Base::V = man.V; - Base::V <<= sh_amt; - } else { - Base::V = 0; - } - } - } - if (is_neg) *this = -(*this); - } - - /// from higer rank type. - template - INLINE ap_int_base( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - Base::V = op.to_ap_int_base().V; - } - - template - INLINE ap_int_base(const ap_range_ref<_AP_W2, _AP_S2>& ref) { - Base::V = (ref.get()).V; - } - - template - INLINE ap_int_base(const ap_bit_ref<_AP_W2, _AP_S2>& ref) { - Base::V = ref.operator bool(); - } - - template - INLINE ap_int_base(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { - const ap_int_base::_AP_WR, - false> - tmp = ref.get(); - Base::V = tmp.V; - } - - /* radix has default value in set */ - -#ifndef __SYNTHESIS__ - INLINE ap_int_base(const char* s, signed char rd = 0) { - if (rd == 0) - rd = guess_radix(s); - unsigned int length = strlen(s); - Base::V.fromString(s, length, rd); - } -#else - // XXX __builtin_bit_from_string(...) requires const C string and radix. 
- INLINE ap_int_base(const char* s) { - typeof(Base::V) t; - _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_W, _AP_S, - AP_TRN, AP_WRAP, 0, _AP_C99); - Base::V = t; - } - INLINE ap_int_base(const char* s, signed char rd) { - typeof(Base::V) t; - _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_W, _AP_S, - AP_TRN, AP_WRAP, 0, _AP_C99); - Base::V = t; - } -#endif - - template - INLINE ap_int_base( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - Base::V = (val.get()).V; - } - - template - INLINE ap_int_base( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - Base::V = val.operator bool(); - } - - INLINE ap_int_base read() volatile { - /*AP_DEBUG(printf("call read %d\n", Base::V););*/ - ap_int_base ret; - ret.V = Base::V; - return ret; - } - - INLINE void write(const ap_int_base<_AP_W, _AP_S>& op2) volatile { - /*AP_DEBUG(printf("call write %d\n", op2.V););*/ - Base::V = op2.V; - } - - /* Another form of "write".*/ - template - INLINE void operator=( - const volatile ap_int_base<_AP_W2, _AP_S2>& op2) volatile { - Base::V = op2.V; - } - - INLINE void operator=( - const volatile ap_int_base<_AP_W, _AP_S>& op2) volatile { - Base::V = op2.V; - } - - template - INLINE void operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) volatile { - Base::V = op2.V; - } - - INLINE void operator=(const ap_int_base<_AP_W, _AP_S>& op2) volatile { - Base::V = op2.V; - } - - template - INLINE ap_int_base& operator=( - const volatile ap_int_base<_AP_W2, _AP_S2>& op2) { - Base::V = op2.V; - return *this; - } - - template - INLINE ap_int_base& operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) { - Base::V = op2.V; - return *this; - } - - INLINE ap_int_base& operator=(const volatile ap_int_base<_AP_W, _AP_S>& op2) { - Base::V = op2.V; - return *this; - } - - INLINE ap_int_base& operator=(const ap_int_base<_AP_W, _AP_S>& op2) { - Base::V = op2.V; - return *this; - } - - -#define ASSIGN_OP_FROM_INT(Type, Size, Signed) \ - INLINE ap_int_base& operator=(Type op) { \ - Base::V = op; \ - return *this; \ - } - - ASSIGN_OP_FROM_INT(bool, 1, false) - ASSIGN_OP_FROM_INT(char, 8, CHAR_IS_SIGNED) - ASSIGN_OP_FROM_INT(signed char, 8, true) - ASSIGN_OP_FROM_INT(unsigned char, 8, false) - ASSIGN_OP_FROM_INT(short, _AP_SIZE_short, true) - ASSIGN_OP_FROM_INT(unsigned short, _AP_SIZE_short, false) - ASSIGN_OP_FROM_INT(int, _AP_SIZE_int, true) - ASSIGN_OP_FROM_INT(unsigned int, _AP_SIZE_int, false) - ASSIGN_OP_FROM_INT(long, _AP_SIZE_long, true) - ASSIGN_OP_FROM_INT(unsigned long, _AP_SIZE_long, false) - ASSIGN_OP_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) - ASSIGN_OP_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef ASSIGN_OP_FROM_INT - - template - INLINE ap_int_base& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& op2) { - Base::V = (bool)op2; - return *this; - } - - template - INLINE ap_int_base& operator=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - Base::V = (ap_int_base<_AP_W2, false>(op2)).V; - return *this; - } - - template - INLINE ap_int_base& operator=( - const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op2) { - Base::V = op2.get().V; - return *this; - } - - template - INLINE ap_int_base& operator=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - Base::V = op.to_ap_int_base().V; - return *this; - } - - template - INLINE ap_int_base& operator=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - Base::V = (bool)op; - return *this; - } - - template - INLINE ap_int_base& operator=( - const 
af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { - Base::V = ((const ap_int_base<_AP_W2, false>)(op)).V; - return *this; - } - - // FIXME: UG902 has clearly required user to use to_int() to convert to built-in - // types, but this implicit conversion is relied on in hls_cordic.h and hls_rsr.h. - // For example: - // int d_exp = fps_x.exp - fps_y.exp; - INLINE operator RetType() const { return (RetType)(Base::V); } - - /* Explicit conversions to C types. - * ---------------------------------------------------------------- - */ - INLINE bool to_bool() const { return (bool)(Base::V); } - INLINE char to_char() const { return (char)(Base::V); } - INLINE signed char to_schar() const { return (signed char)(Base::V); } - INLINE unsigned char to_uchar() const { return (unsigned char)(Base::V); } - INLINE short to_short() const { return (short)(Base::V); } - INLINE unsigned short to_ushort() const { return (unsigned short)(Base::V); } - INLINE int to_int() const { return (int)(Base::V); } - INLINE unsigned to_uint() const { return (unsigned)(Base::V); } - INLINE long to_long() const { return (long)(Base::V); } - INLINE unsigned long to_ulong() const { return (unsigned long)(Base::V); } - INLINE ap_slong to_int64() const { return (ap_slong)(Base::V); } - INLINE ap_ulong to_uint64() const { return (ap_ulong)(Base::V); } - INLINE float to_float() const { return (float)(Base::V); } - INLINE double to_double() const { return (double)(Base::V); } - - // TODO decide if user-defined conversion should be provided. -#if 0 - INLINE operator char() const { return (char)(Base::V); } - INLINE operator signed char() const { return (signed char)(Base::V); } - INLINE operator unsigned char() const { return (unsigned char)(Base::V); } - INLINE operator short() const { return (short)(Base::V); } - INLINE operator unsigned short() const { return (unsigned short)(Base::V); } - INLINE operator int() const { return (int)(Base::V); } - INLINE operator unsigned int () const { return (unsigned)(Base::V); } - INLINE operator long () const { return (long)(Base::V); } - INLINE operator unsigned long () const { return (unsigned long)(Base::V); } - INLINE operator ap_slong () { return (ap_slong)(Base::V); } - INLINE operator ap_ulong () { return (ap_ulong)(Base::V); } -#endif - - /* Helper methods. - ---------------------------------------------------------------- - */ - /* we cannot call a non-volatile function on a volatile instance. - * but calling a volatile function is ok. - * XXX deleted non-volatile version. 
- */ - INLINE int length() const volatile { return _AP_W; } - - /*Return true if the value of ap_int_base instance is zero*/ - INLINE bool iszero() const { return Base::V == 0; } - - /*Return true if the value of ap_int_base instance is zero*/ - INLINE bool is_zero() const { return Base::V == 0; } - - /* x < 0 */ - INLINE bool sign() const { - if (_AP_S && - _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)) - return true; - else - return false; - } - - /* x[i] = 0 */ - INLINE void clear(int i) { - AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); - Base::V = _AP_ROOT_op_set_bit(Base::V, i, 0); - } - - /* x[i] = !x[i]*/ - INLINE void invert(int i) { - AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); - bool val = _AP_ROOT_op_get_bit(Base::V, i); - if (val) - Base::V = _AP_ROOT_op_set_bit(Base::V, i, 0); - else - Base::V = _AP_ROOT_op_set_bit(Base::V, i, 1); - } - - INLINE bool test(int i) const { - AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); - return _AP_ROOT_op_get_bit(Base::V, i); - } - - // Get self. For ap_concat_ref expansion. - INLINE ap_int_base& get() { return *this; } - - // Set the ith bit into 1 - INLINE void set(int i) { - AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); - Base::V = _AP_ROOT_op_set_bit(Base::V, i, 1); - } - - // Set the ith bit into v - INLINE void set(int i, bool v) { - AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); - Base::V = _AP_ROOT_op_set_bit(Base::V, i, v); - } - - // This is used for sc_lv and sc_bv, which is implemented by sc_uint - // Rotate an ap_int_base object n places to the left - INLINE ap_int_base& lrotate(int n) { - AP_ASSERT(n >= 0 && n < _AP_W, "shift value out of range"); - // TODO unify this. -#ifdef __SYNTHESIS__ - typeof(Base::V) l_p = Base::V << n; - typeof(Base::V) r_p = Base::V >> (_AP_W - n); - Base::V = l_p | r_p; -#else - Base::V.lrotate(n); -#endif - return *this; - } - - // This is used for sc_lv and sc_bv, which is implemented by sc_uint - // Rotate an ap_int_base object n places to the right - INLINE ap_int_base& rrotate(int n) { - AP_ASSERT(n >= 0 && n < _AP_W, "shift value out of range"); - // TODO unify this. -#ifdef __SYNTHESIS__ - typeof(Base::V) l_p = Base::V << (_AP_W - n); - typeof(Base::V) r_p = Base::V >> n; - Base::V = l_p | r_p; -#else - Base::V.rrotate(n); -#endif - return *this; - } - - // Reverse the contents of ap_int_base instance. - // I.e. LSB becomes MSB and vise versa. - INLINE ap_int_base& reverse() { - Base::V = _AP_ROOT_op_get_range(Base::V, _AP_W - 1, 0); - return *this; - } - - // Set the ith bit into v - INLINE void set_bit(int i, bool v) { - Base::V = _AP_ROOT_op_set_bit(Base::V, i, v); - } - - // Get the value of ith bit - INLINE bool get_bit(int i) const { - return (bool)_AP_ROOT_op_get_bit(Base::V, i); - } - - // complements every bit - INLINE void b_not() { Base::V = ~Base::V; } - -#define OP_ASSIGN_AP(Sym) \ - template \ - INLINE ap_int_base& operator Sym(const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - Base::V Sym op2.V; \ - return *this; \ - } - - /* Arithmetic assign. - * ---------------------------------------------------------------- - */ - OP_ASSIGN_AP(*=) - OP_ASSIGN_AP(+=) - OP_ASSIGN_AP(-=) - OP_ASSIGN_AP(/=) - OP_ASSIGN_AP(%=) -#undef OP_ASSIGN_AP - - /* Bitwise assign: and, or, xor. 
- * ---------------------------------------------------------------- - */ -#define OP_ASSIGN_AP_CHK(Sym) \ - template \ - INLINE ap_int_base& operator Sym(const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - _AP_WARNING((_AP_W != _AP_W2), \ - "Bitsize mismatch for ap_[u]int" #Sym "ap_[u]int."); \ - Base::V Sym op2.V; \ - return *this; \ - } - OP_ASSIGN_AP_CHK(&=) - OP_ASSIGN_AP_CHK(|=) - OP_ASSIGN_AP_CHK(^=) -#undef OP_ASSIGN_AP_CHK - - /* Prefix increment, decrement. - * ---------------------------------------------------------------- - */ - INLINE ap_int_base& operator++() { - operator+=((ap_int_base<1, false>)1); - return *this; - } - INLINE ap_int_base& operator--() { - operator-=((ap_int_base<1, false>)1); - return *this; - } - - /* Postfix increment, decrement - * ---------------------------------------------------------------- - */ - INLINE const typename RType<_AP_W,_AP_S>::arg1 operator++(int) { - ap_int_base t = *this; - operator+=((ap_int_base<1, false>)1); - return t; - } - INLINE const typename RType<_AP_W,_AP_S>::arg1 operator--(int) { - ap_int_base t = *this; - operator-=((ap_int_base<1, false>)1); - return t; - } - - /* Unary arithmetic. - * ---------------------------------------------------------------- - */ - INLINE typename RType<_AP_W,_AP_S>::arg1 operator+() const { return *this; } - - // TODO used to be W>64 only... need check. - INLINE typename RType<1, false>::minus operator-() const { - return ap_int_base<1, false>(0) - *this; - } - - /* Not (!) - * ---------------------------------------------------------------- - */ - INLINE bool operator!() const { return Base::V == 0; } - - /* Bitwise (arithmetic) unary: complement - ---------------------------------------------------------------- - */ - // XXX different from Mentor's ac_int! - INLINE typename RType<_AP_W,_AP_S>::arg1 operator~() const { - ap_int_base<_AP_W, _AP_S> r; - r.V = ~Base::V; - return r; - } - - /* Shift (result constrained by left operand). - * ---------------------------------------------------------------- - */ - template - INLINE typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, true>& op2) const { - bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); - ap_int_base<_AP_W2, false> sh = op2; - if (isNeg) { - sh = -op2; - return operator>>(sh); - } else - return operator<<(sh); - } - - template - INLINE typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, false>& op2) const { - ap_int_base r; - r.V = Base::V << op2.to_uint(); - return r; - } - - template - INLINE typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, true>& op2) const { - bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); - ap_int_base<_AP_W2, false> sh = op2; - if (isNeg) { - sh = -op2; - return operator<<(sh); - } - return operator>>(sh); - } - - template - INLINE typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, false>& op2) const { - ap_int_base r; - r.V = Base::V >> op2.to_uint(); - return r; - } - - // FIXME we standalone operator>> for ap_int_base and ap_range_ref. 
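// Note on the signed-count shift overloads above: a negative shift count
// flips the shift direction instead of being undefined behaviour, and the
// result keeps the width of the left operand. A hedged sketch (assumes
// user code includes <ap_int.h>):
#if 0
void shift_sketch() {
  ap_uint<8> a = 4;
  ap_int<4> s = -1;
  ap_uint<8> l = a << s;  // negative count shifts right: l == 2
  ap_uint<8> r = a >> s;  // negative count shifts left:  r == 8
}
#endif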
-#if 0 - template - INLINE ap_int_base operator<<(const ap_range_ref<_AP_W2, _AP_S2>& op2) const { - return *this << (op2.operator ap_int_base<_AP_W2, false>()); - } - - template - INLINE ap_int_base operator>>(const ap_range_ref<_AP_W2, _AP_S2>& op2) const { - return *this >> (op2.operator ap_int_base<_AP_W2, false>()); - } -#endif - - /* Shift assign - * ---------------------------------------------------------------- - */ - template - INLINE ap_int_base& operator<<=(const ap_int_base<_AP_W2, true>& op2) { - bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); - ap_int_base<_AP_W2, false> sh = op2; - if (isNeg) { - sh = -op2; - return operator>>=(sh); - } else - return operator<<=(sh); - } - - template - INLINE ap_int_base& operator<<=(const ap_int_base<_AP_W2, false>& op2) { - Base::V <<= op2.to_uint(); - return *this; - } - - template - INLINE ap_int_base& operator>>=(const ap_int_base<_AP_W2, true>& op2) { - bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); - ap_int_base<_AP_W2, false> sh = op2; - if (isNeg) { - sh = -op2; - return operator<<=(sh); - } - return operator>>=(sh); - } - - template - INLINE ap_int_base& operator>>=(const ap_int_base<_AP_W2, false>& op2) { - Base::V >>= op2.to_uint(); - return *this; - } - - // FIXME we standalone operator>> for ap_int_base and ap_range_ref. -#if 0 - template - INLINE ap_int_base& operator<<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return *this <<= (op2.operator ap_int_base<_AP_W2, false>()); - } - template - INLINE ap_int_base& operator>>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return *this >>= (op2.operator ap_int_base<_AP_W2, false>()); - } -#endif - - /* Equality and Relational. - * ---------------------------------------------------------------- - */ - template - INLINE bool operator==(const ap_int_base<_AP_W2, _AP_S2>& op2) const { - return Base::V == op2.V; - } - template - INLINE bool operator!=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { - return !(Base::V == op2.V); - } - template - INLINE bool operator<(const ap_int_base<_AP_W2, _AP_S2>& op2) const { - return Base::V < op2.V; - } - template - INLINE bool operator>=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { - return Base::V >= op2.V; - } - template - INLINE bool operator>(const ap_int_base<_AP_W2, _AP_S2>& op2) const { - return Base::V > op2.V; - } - template - INLINE bool operator<=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { - return Base::V <= op2.V; - } - - /* Bit and Part Select - * ---------------------------------------------------------------- - */ - INLINE ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { - _AP_ERROR(Hi >= _AP_W, "Hi(%d)out of bound(%d) in range()", Hi, _AP_W); - _AP_ERROR(Lo >= _AP_W, "Lo(%d)out of bound(%d) in range()", Lo, _AP_W); - return ap_range_ref<_AP_W, _AP_S>(this, Hi, Lo); - } - - // This is a must to strip constness to produce reference type. 
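// A hedged sketch of bit and part selection (assumes user code includes
// <ap_int.h>; writability of ap_range_ref comes from ap_int_ref.h):
#if 0
void select_sketch() {
  ap_uint<8> x = 0xAB;
  ap_uint<4> hi = x.range(7, 4);  // hi == 0xA
  x(3, 0) = 0xC;                  // x == 0xAC
  bool msb = x[7];                // single-bit read, msb == true
}
#endif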
- INLINE ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { - _AP_ERROR(Hi >= _AP_W, "Hi(%d)out of bound(%d) in range()", Hi, _AP_W); - _AP_ERROR(Lo >= _AP_W, "Lo(%d)out of bound(%d) in range()", Lo, _AP_W); - return ap_range_ref<_AP_W, _AP_S>(const_cast(this), Hi, Lo); - } - - template - INLINE ap_range_ref<_AP_W, _AP_S> range( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - template - INLINE ap_range_ref<_AP_W, _AP_S> range( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - INLINE ap_range_ref<_AP_W, _AP_S> range() { - return this->range(_AP_W - 1, 0); - } - - INLINE ap_range_ref<_AP_W, _AP_S> range() const { - return this->range(_AP_W - 1, 0); - } - - INLINE ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { - return this->range(Hi, Lo); - } - - INLINE ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { - return this->range(Hi, Lo); - } - - template - INLINE ap_range_ref<_AP_W, _AP_S> operator()( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - - template - INLINE ap_range_ref<_AP_W, _AP_S> operator()( - const ap_int_base<_AP_W2, _AP_S2>& HiIdx, - const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { - int Hi = HiIdx.to_int(); - int Lo = LoIdx.to_int(); - return this->range(Hi, Lo); - } - -#if 0 - template - INLINE ap_int_base slice() const { - AP_ASSERT(Hi >= Lo && Hi < _AP_W && Lo < _AP_W, "Out of bounds in slice()"); - ap_int_base tmp ; - tmp.V = _AP_ROOT_op_get_range(Base::V, Lo, Hi); - return tmp; - } - - INLINE ap_bit_ref<_AP_W,_AP_S> operator [] ( unsigned int uindex) { - AP_ASSERT(uindex < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W,_AP_S> bvh( this, uindex ); - return bvh; - } -#endif - - INLINE ap_bit_ref<_AP_W, _AP_S> operator[](int index) { - AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> bvh(this, index); - return bvh; - } - - template - INLINE ap_bit_ref<_AP_W, _AP_S> operator[]( - const ap_int_base<_AP_W2, _AP_S2>& index) { - AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); - return bvh; - } - - INLINE bool operator[](int index) const { - AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> br(this, index); - return br.to_bool(); - } - template - INLINE bool operator[](const ap_int_base<_AP_W2, _AP_S2>& index) const { - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> br(this, index.to_int()); - return br.to_bool(); - } - - INLINE ap_bit_ref<_AP_W, _AP_S> bit(int index) { - AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> bvh(this, index); - return bvh; - } - template - INLINE ap_bit_ref<_AP_W, _AP_S> bit( - const ap_int_base<_AP_W2, _AP_S2>& index) { - AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); - 
AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); - return bvh; - } - - INLINE bool bit(int index) const { - AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W, _AP_S> br(this, index); - return br.to_bool(); - } - - template - INLINE bool bit(const ap_int_base<_AP_W2, _AP_S2>& index) const { - return bit(index.to_int()); - } - -#if 0 - template - INLINE bool operator[](_AP_T index) const { - AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); - ap_bit_ref<_AP_W,_AP_S> br = operator[](index); - return br.to_bool(); - } -#endif - - // Count the number of zeros from the most significant bit - // to the first one bit. - INLINE int countLeadingZeros() { -#ifdef __SYNTHESIS__ - if (_AP_W <= 32) { - ap_int_base<32, false> t(-1UL), x; - x.V = _AP_ROOT_op_get_range(this->V, _AP_W - 1, 0); // reverse - t.V = _AP_ROOT_op_set_range(t.V, 0, _AP_W - 1, x.V); - return __builtin_ctz(t.V); // count trailing zeros. - } else if (_AP_W <= 64) { - ap_int_base<64, false> t(-1ULL); - ap_int_base<64, false> x; - x.V = _AP_ROOT_op_get_range(this->V, _AP_W - 1, 0); // reverse - t.V = _AP_ROOT_op_set_range(t.V, 0, _AP_W - 1, x.V); - return __builtin_ctzll(t.V); // count trailing zeros. - } else { - enum { __N = (_AP_W + 63) / 64 }; - int NZeros = 0; - int i = 0; - bool hitNonZero = false; - for (i = 0; i < __N - 1; ++i) { - ap_int_base<64, false> t; - t.V = _AP_ROOT_op_get_range(this->V, _AP_W - i * 64 - 64, _AP_W - i * 64 - 1); - NZeros += hitNonZero ? 0 : __builtin_clzll(t.V); // count leading zeros. - hitNonZero |= (t.V != 0); - } - if (!hitNonZero) { - ap_int_base<64, false> t(-1ULL); - enum { REST = (_AP_W - 1) % 64 }; - ap_int_base<64, false> x; - x.V = _AP_ROOT_op_get_range(this->V, 0, REST); - t.V = _AP_ROOT_op_set_range(t.V, 63 - REST, 63, x.V); - NZeros += __builtin_clzll(t.V); - } - return NZeros; - } -#else - return (Base::V).countLeadingZeros(); -#endif - } // countLeadingZeros - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - concat(const ap_int_base<_AP_W2, _AP_S2>& a2) const { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - const_cast&>(*this), - const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - concat(ap_int_base<_AP_W2, _AP_S2>& a2) { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >(*this, a2); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > - operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) const { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_range_ref<_AP_W2, _AP_S2> >( - const_cast&>(*this), - const_cast&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > - operator,(ap_range_ref<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_range_ref<_AP_W2, _AP_S2> >(*this, a2); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(ap_int_base<_AP_W2, _AP_S2> &a2) const { - return 
ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - const_cast&>(*this), a2); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) const { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - const_cast&>(*this), - const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >(*this, a2); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > - operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) const { - return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( - const_cast&>(*this), - const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > - operator,(ap_bit_ref<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( - *this, a2); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > - operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( - const_cast&>(*this), - const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > - operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { - return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, - a2); - } - - template - INLINE ap_concat_ref< - _AP_W, ap_int_base, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> - &a2) const { - return ap_concat_ref< - _AP_W, ap_int_base, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - const_cast&>(*this), - const_cast< - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); - } - - template - INLINE ap_concat_ref< - _AP_W, ap_int_base, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { - return ap_concat_ref< - _AP_W, ap_int_base, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, - a2); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_int_base, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> - &a2) const { - return ap_concat_ref< - _AP_W, ap_int_base, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - const_cast&>(*this), - const_cast&>( - a2)); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_int_base, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,( - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { - return ap_concat_ref< - _AP_W, ap_int_base, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); - } - - template - INLINE ap_int_base operator&( - const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { - return *this & a2.get(); - } - - template - INLINE ap_int_base operator|( - const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, 
_AP_T3>& a2) { - return *this | a2.get(); - } - - template - INLINE ap_int_base operator^( - const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { - return *this ^ a2.get(); - } - - template - INLINE void set(const ap_int_base<_AP_W3, false>& val) { - Base::V = val.V; - } - - /* Reduce operations. - * ---------------------------------------------------------------- - */ - // XXX non-const version deleted. - INLINE bool and_reduce() const { return _AP_ROOT_op_reduce(and, Base::V); } - INLINE bool nand_reduce() const { return _AP_ROOT_op_reduce(nand, Base::V); } - INLINE bool or_reduce() const { return _AP_ROOT_op_reduce(or, Base::V); } - INLINE bool nor_reduce() const { return !(_AP_ROOT_op_reduce(or, Base::V)); } - INLINE bool xor_reduce() const { return _AP_ROOT_op_reduce (xor, Base::V); } - INLINE bool xnor_reduce() const { - return !(_AP_ROOT_op_reduce (xor, Base::V)); - } - - /* Output as a string. - * ---------------------------------------------------------------- - */ -#ifndef __SYNTHESIS__ - std::string to_string(signed char rd = 2, bool sign = _AP_S) const { - // XXX in autosim/autowrap.tcl "(${name}).to_string(2).c_str()" is used to - // initialize sc_lv, which seems incapable of handling format "-0b". - if (rd == 2) sign = false; - return (Base::V).to_string(rd, sign); - } -#else - INLINE char* to_string(signed char rd = 2, bool sign = _AP_S) const { - return 0; - } -#endif -}; // struct ap_int_base - -// XXX apcc cannot handle global std::ios_base::Init() brought in by -#ifndef AP_AUTOCC -#ifndef __SYNTHESIS__ -template -INLINE std::ostream& operator<<(std::ostream& os, - const ap_int_base<_AP_W, _AP_S>& x) { - std::ios_base::fmtflags ff = std::cout.flags(); - if (ff & std::cout.hex) { - os << x.to_string(16); // don't print sign - } else if (ff & std::cout.oct) { - os << x.to_string(8); // don't print sign - } else { - os << x.to_string(10); - } - return os; -} -#endif // ifndef __SYNTHESIS__ - -#ifndef __SYNTHESIS__ -template -INLINE std::istream& operator>>(std::istream& in, - ap_int_base<_AP_W, _AP_S>& op) { - std::string str; - in >> str; - const std::ios_base::fmtflags basefield = in.flags() & std::ios_base::basefield; - unsigned radix = (basefield == std::ios_base::dec) ? 0 : ( - (basefield == std::ios_base::oct) ? 8 : ( - (basefield == std::ios_base::hex) ? 16 : 0)); - op = ap_int_base<_AP_W, _AP_S>(str.c_str(), radix); - return in; -} -#endif // ifndef __SYNTHESIS__ -#endif // ifndef AP_AUTOCC - -/* Operators with another ap_int_base. 
- * ---------------------------------------------------------------- - */ -#define OP_BIN_AP(Sym, Rty) \ - template \ - INLINE \ - typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::Rty \ - operator Sym(const ap_int_base<_AP_W, _AP_S>& op, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - typename ap_int_base<_AP_W, _AP_S>::template RType< \ - _AP_W2, _AP_S2>::Rty##_base lhs(op); \ - typename ap_int_base<_AP_W, _AP_S>::template RType< \ - _AP_W2, _AP_S2>::Rty##_base rhs(op2); \ - typename ap_int_base<_AP_W, _AP_S>::template RType< \ - _AP_W2, _AP_S2>::Rty##_base ret; \ - ret.V = lhs.V Sym rhs.V; \ - return ret; \ - } - -OP_BIN_AP(*, mult) -OP_BIN_AP(+, plus) -OP_BIN_AP(-, minus) -OP_BIN_AP(&, logic) -OP_BIN_AP(|, logic) -OP_BIN_AP(^, logic) - -#define OP_BIN_AP2(Sym, Rty) \ - template \ - INLINE \ - typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::Rty \ - operator Sym(const ap_int_base<_AP_W, _AP_S>& op, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - typename ap_int_base<_AP_W, _AP_S>::template RType< \ - _AP_W2, _AP_S2>::Rty##_base ret; \ - ret.V = op.V Sym op2.V; \ - return ret; \ - } - -OP_BIN_AP2(/, div) -OP_BIN_AP2(%, mod) - -// shift operators are defined inside class. -// compound assignment operators are defined inside class. - -/* Operators with a pointer type. - * ---------------------------------------------------------------- - * char a[100]; - * char* ptr = a; - * ap_int<2> n = 3; - * char* ptr2 = ptr + n*2; - * avoid ambiguous errors. - */ -#define OP_BIN_WITH_PTR(BIN_OP) \ - template \ - INLINE PTR_TYPE* operator BIN_OP(PTR_TYPE* i_op, \ - const ap_int_base<_AP_W, _AP_S>& op) { \ - ap_slong op2 = op.to_int64(); /* Not all implementation */ \ - return i_op BIN_OP op2; \ - } \ - template \ - INLINE PTR_TYPE* operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, \ - PTR_TYPE* i_op) { \ - ap_slong op2 = op.to_int64(); /* Not all implementation */ \ - return op2 BIN_OP i_op; \ - } - -OP_BIN_WITH_PTR(+) -OP_BIN_WITH_PTR(-) - -/* Operators with a native floating point types. - * ---------------------------------------------------------------- - */ -// float OP ap_int -// when ap_int's width > 64, then trunc ap_int to ap_int<64> -#define OP_BIN_WITH_FLOAT(BIN_OP, C_TYPE) \ - template \ - INLINE C_TYPE operator BIN_OP(C_TYPE i_op, \ - const ap_int_base<_AP_W, _AP_S>& op) { \ - typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; \ - return i_op BIN_OP op2; \ - } \ - template \ - INLINE C_TYPE operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, \ - C_TYPE i_op) { \ - typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; \ - return op2 BIN_OP i_op; \ - } - -#define ALL_OP_WITH_FLOAT(C_TYPE) \ - OP_BIN_WITH_FLOAT(*, C_TYPE) \ - OP_BIN_WITH_FLOAT(/, C_TYPE) \ - OP_BIN_WITH_FLOAT(+, C_TYPE) \ - OP_BIN_WITH_FLOAT(-, C_TYPE) - -#if _AP_ENABLE_HALF_ == 1 -ALL_OP_WITH_FLOAT(half) -#endif -ALL_OP_WITH_FLOAT(float) -ALL_OP_WITH_FLOAT(double) - -// TODO no shift? - -/* Operators with a native integral types. - * ---------------------------------------------------------------- - */ -// arithmetic and bitwise operators. 
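// The macros below first wrap the native operand in an ap_int_base of its
// natural width, then the RType rules in the class body choose the result
// width. A hedged sketch (assumes user code includes <ap_int.h>):
#if 0
void promotion_sketch() {
  ap_uint<8> a = 200;
  ap_uint<9>  s = a + a;  // plus_w = 8 + 1 = 9, so 400 fits without overflow
  ap_uint<16> p = a * a;  // mult_w = 8 + 8 = 16
  ap_int<33>  q = a + 1;  // int enters as 32-bit signed -> 33-bit result
}
#endif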
-#define OP_BIN_WITH_INT(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ - template \ - INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(C_TYPE i_op, const ap_int_base<_AP_W, _AP_S>& op) { \ - return ap_int_base<_AP_W2, _AP_S2>(i_op) BIN_OP(op); \ - } \ - template \ - INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, C_TYPE i_op) { \ - return op BIN_OP ap_int_base<_AP_W2, _AP_S2>(i_op); \ - } - -#define ALL_OP_BIN_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ - OP_BIN_WITH_INT(*, C_TYPE, _AP_W2, _AP_S2, mult) \ - OP_BIN_WITH_INT(+, C_TYPE, _AP_W2, _AP_S2, plus) \ - OP_BIN_WITH_INT(-, C_TYPE, _AP_W2, _AP_S2, minus) \ - OP_BIN_WITH_INT(/, C_TYPE, _AP_W2, _AP_S2, div) \ - OP_BIN_WITH_INT(%, C_TYPE, _AP_W2, _AP_S2, mod) \ - OP_BIN_WITH_INT(&, C_TYPE, _AP_W2, _AP_S2, logic) \ - OP_BIN_WITH_INT(|, C_TYPE, _AP_W2, _AP_S2, logic) \ - OP_BIN_WITH_INT(^, C_TYPE, _AP_W2, _AP_S2, logic) - -ALL_OP_BIN_WITH_INT(bool, 1, false) -ALL_OP_BIN_WITH_INT(char, 8, CHAR_IS_SIGNED) -ALL_OP_BIN_WITH_INT(signed char, 8, true) -ALL_OP_BIN_WITH_INT(unsigned char, 8, false) -ALL_OP_BIN_WITH_INT(short, _AP_SIZE_short, true) -ALL_OP_BIN_WITH_INT(unsigned short, _AP_SIZE_short, false) -ALL_OP_BIN_WITH_INT(int, _AP_SIZE_int, true) -ALL_OP_BIN_WITH_INT(unsigned int, _AP_SIZE_int, false) -ALL_OP_BIN_WITH_INT(long, _AP_SIZE_long, true) -ALL_OP_BIN_WITH_INT(unsigned long, _AP_SIZE_long, false) -ALL_OP_BIN_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) -ALL_OP_BIN_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef OP_BIN_WITH_INT -#undef ALL_OP_BIN_WITH_INT - -// shift operators. -#define ALL_OP_SHIFT_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( \ - const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ - ap_int_base<_AP_W, _AP_S> r; \ - if (_AP_S2) \ - r.V = op2 >= 0 ? (op.V << op2) : (op.V >> (-op2)); \ - else \ - r.V = op.V << op2; \ - return r; \ - } \ - template \ - INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( \ - const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ - ap_int_base<_AP_W, _AP_S> r; \ - if (_AP_S2) \ - r.V = op2 >= 0 ? 
(op.V >> op2) : (op.V << (-op2)); \ - else \ - r.V = op.V >> op2; \ - return r; \ - } - -ALL_OP_SHIFT_WITH_INT(char, 8, CHAR_IS_SIGNED) -ALL_OP_SHIFT_WITH_INT(signed char, 8, true) -ALL_OP_SHIFT_WITH_INT(short, _AP_SIZE_short, true) -ALL_OP_SHIFT_WITH_INT(int, _AP_SIZE_int, true) -ALL_OP_SHIFT_WITH_INT(long, _AP_SIZE_long, true) -ALL_OP_SHIFT_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) - -#undef ALL_OP_SHIFT_WITH_INT - -#define ALL_OP_SHIFT_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( \ - const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ - ap_int_base<_AP_W, _AP_S> r; \ - r.V = op.V << op2; \ - return r; \ - } \ - template \ - INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( \ - const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ - ap_int_base<_AP_W, _AP_S> r; \ - r.V = op.V >> op2; \ - return r; \ - } -ALL_OP_SHIFT_WITH_INT(bool, 1, false) -ALL_OP_SHIFT_WITH_INT(unsigned char, 8, false) -ALL_OP_SHIFT_WITH_INT(unsigned short, _AP_SIZE_short, false) -ALL_OP_SHIFT_WITH_INT(unsigned int, _AP_SIZE_int, false) -ALL_OP_SHIFT_WITH_INT(unsigned long, _AP_SIZE_long, false) -ALL_OP_SHIFT_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef ALL_OP_SHIFT_WITH_INT - -// compound assign operators. -#define OP_ASSIGN_WITH_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE ap_int_base<_AP_W, _AP_S>& operator ASSIGN_OP( \ - ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ - return op ASSIGN_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ - } - -// TODO int a; ap_int<16> b; a += b; - -#define ALL_OP_ASSIGN_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(+=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(-=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(*=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(/=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(%=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(&=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(|=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(^=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(>>=, C_TYPE, _AP_W2, _AP_S2) \ - OP_ASSIGN_WITH_INT(<<=, C_TYPE, _AP_W2, _AP_S2) - -ALL_OP_ASSIGN_WITH_INT(bool, 1, false) -ALL_OP_ASSIGN_WITH_INT(char, 8, CHAR_IS_SIGNED) -ALL_OP_ASSIGN_WITH_INT(signed char, 8, true) -ALL_OP_ASSIGN_WITH_INT(unsigned char, 8, false) -ALL_OP_ASSIGN_WITH_INT(short, _AP_SIZE_short, true) -ALL_OP_ASSIGN_WITH_INT(unsigned short, _AP_SIZE_short, false) -ALL_OP_ASSIGN_WITH_INT(int, _AP_SIZE_int, true) -ALL_OP_ASSIGN_WITH_INT(unsigned int, _AP_SIZE_int, false) -ALL_OP_ASSIGN_WITH_INT(long, _AP_SIZE_long, true) -ALL_OP_ASSIGN_WITH_INT(unsigned long, _AP_SIZE_long, false) -ALL_OP_ASSIGN_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) -ALL_OP_ASSIGN_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef OP_ASSIGN_WITH_INT -#undef ALL_OP_ASSIGN_WITH_INT - -// equality and relational operators. 
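// For the relational overloads below, the native operand is widened to an
// ap_int_base of its natural size first, so comparison is by value rather
// than by bit pattern. Hedged sketch (assumes user code includes <ap_int.h>):
#if 0
void relational_sketch() {
  ap_int<4> m = -1;
  bool e1 = (m == -1);  // true: both sides compared as signed values
  ap_uint<4> u = 15;
  bool e2 = (u == -1);  // false: 15 != -1 after promotion
}
#endif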
-#define OP_REL_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE bool operator REL_OP(C_TYPE i_op, \ - const ap_int_base<_AP_W, _AP_S>& op) { \ - return ap_int_base<_AP_W2, _AP_S2>(i_op) REL_OP op; \ - } \ - template \ - INLINE bool operator REL_OP(const ap_int_base<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return op REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ - } - -#define ALL_OP_REL_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ - OP_REL_WITH_INT(>, C_TYPE, _AP_W2, _AP_S2) \ - OP_REL_WITH_INT(<, C_TYPE, _AP_W2, _AP_S2) \ - OP_REL_WITH_INT(>=, C_TYPE, _AP_W2, _AP_S2) \ - OP_REL_WITH_INT(<=, C_TYPE, _AP_W2, _AP_S2) \ - OP_REL_WITH_INT(==, C_TYPE, _AP_W2, _AP_S2) \ - OP_REL_WITH_INT(!=, C_TYPE, _AP_W2, _AP_S2) - -ALL_OP_REL_WITH_INT(bool, 1, false) -ALL_OP_REL_WITH_INT(char, 8, CHAR_IS_SIGNED) -ALL_OP_REL_WITH_INT(signed char, 8, true) -ALL_OP_REL_WITH_INT(unsigned char, 8, false) -ALL_OP_REL_WITH_INT(short, _AP_SIZE_short, true) -ALL_OP_REL_WITH_INT(unsigned short, _AP_SIZE_short, false) -ALL_OP_REL_WITH_INT(int, _AP_SIZE_int, true) -ALL_OP_REL_WITH_INT(unsigned int, _AP_SIZE_int, false) -ALL_OP_REL_WITH_INT(long, _AP_SIZE_long, true) -ALL_OP_REL_WITH_INT(unsigned long, _AP_SIZE_long, false) -ALL_OP_REL_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) -ALL_OP_REL_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef OP_REL_WITH_INT -#undef ALL_OP_BIN_WITH_INT - -#define OP_REL_WITH_DOUBLE_OR_FLOAT(Sym) \ - template \ - INLINE bool operator Sym(const ap_int_base<_AP_W, _AP_S>& op1, \ - double op2) { \ - return op1.to_double() Sym op2 ; \ - } \ - template \ - INLINE bool operator Sym(double op1, \ - const ap_int_base<_AP_W, _AP_S>& op2) { \ - return op1 Sym op2.to_double() ; \ - } \ - template \ - INLINE bool operator Sym(const ap_int_base<_AP_W, _AP_S>& op1, \ - float op2) { \ - return op1.to_double() Sym op2 ; \ - } \ - template \ - INLINE bool operator Sym(float op1, \ - const ap_int_base<_AP_W, _AP_S>& op2) { \ - return op1 Sym op2.to_double() ; \ - } - OP_REL_WITH_DOUBLE_OR_FLOAT(>) - OP_REL_WITH_DOUBLE_OR_FLOAT(<) - OP_REL_WITH_DOUBLE_OR_FLOAT(>=) - OP_REL_WITH_DOUBLE_OR_FLOAT(<=) - OP_REL_WITH_DOUBLE_OR_FLOAT(==) - OP_REL_WITH_DOUBLE_OR_FLOAT(!=) - -#undef OP_REL_WITH_DOUBLE_OR_FLOAT - - -/* Operators with ap_bit_ref. - * ------------------------------------------------------------ - */ -// arithmetic, bitwise and shift operators. -#define OP_BIN_WITH_RANGE(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(const ap_range_ref<_AP_W1, _AP_S1>& op1, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - return ap_int_base<_AP_W1, false>(op1) BIN_OP op2; \ - } \ - template \ - INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ - const ap_range_ref<_AP_W2, _AP_S2>& op2) { \ - return op1 BIN_OP ap_int_base<_AP_W2, false>(op2); \ - } - -OP_BIN_WITH_RANGE(+, plus) -OP_BIN_WITH_RANGE(-, minus) -OP_BIN_WITH_RANGE(*, mult) -OP_BIN_WITH_RANGE(/, div) -OP_BIN_WITH_RANGE(%, mod) -OP_BIN_WITH_RANGE(&, logic) -OP_BIN_WITH_RANGE(|, logic) -OP_BIN_WITH_RANGE(^, logic) -OP_BIN_WITH_RANGE(>>, arg1) -OP_BIN_WITH_RANGE(<<, arg1) - -#undef OP_BIN_WITH_RANGE - -// compound assignment operators. 
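// For the range-reference compound assignments defined below, the slice is
// copied into a temporary ap_int_base, updated, then written back, so the
// result wraps to the slice width. Hedged sketch (assumes <ap_int.h>):
#if 0
void range_assign_sketch() {
  ap_uint<8> x = 0x0F;
  ap_uint<4> one = 1, msk = 0xA;
  x.range(3, 0) += one;  // low nibble wraps 0xF -> 0x0, so x == 0x00
  x.range(7, 4) |= msk;  // x == 0xA0
}
#endif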
-#define OP_ASSIGN_WITH_RANGE(ASSIGN_OP) \ - template \ - INLINE ap_int_base<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - ap_int_base<_AP_W1, _AP_S1>& op1, ap_range_ref<_AP_W2, _AP_S2>& op2) { \ - return op1 ASSIGN_OP ap_int_base<_AP_W2, false>(op2); \ - } \ - template \ - INLINE ap_range_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - ap_range_ref<_AP_W1, _AP_S1>& op1, ap_int_base<_AP_W2, _AP_S2>& op2) { \ - ap_int_base<_AP_W1, false> tmp(op1); \ - tmp ASSIGN_OP op2; \ - op1 = tmp; \ - return op1; \ - } - -OP_ASSIGN_WITH_RANGE(+=) -OP_ASSIGN_WITH_RANGE(-=) -OP_ASSIGN_WITH_RANGE(*=) -OP_ASSIGN_WITH_RANGE(/=) -OP_ASSIGN_WITH_RANGE(%=) -OP_ASSIGN_WITH_RANGE(&=) -OP_ASSIGN_WITH_RANGE(|=) -OP_ASSIGN_WITH_RANGE(^=) -OP_ASSIGN_WITH_RANGE(>>=) -OP_ASSIGN_WITH_RANGE(<<=) - -#undef OP_ASSIGN_WITH_RANGE - -// equality and relational operators -#define OP_REL_WITH_RANGE(REL_OP) \ - template \ - INLINE bool operator REL_OP(const ap_range_ref<_AP_W1, _AP_S1>& op1, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - return ap_int_base<_AP_W1, false>(op1).operator REL_OP(op2); \ - } \ - template \ - INLINE bool operator REL_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ - const ap_range_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator REL_OP(op2.operator ap_int_base<_AP_W2, false>()); \ - } - -OP_REL_WITH_RANGE(==) -OP_REL_WITH_RANGE(!=) -OP_REL_WITH_RANGE(>) -OP_REL_WITH_RANGE(>=) -OP_REL_WITH_RANGE(<) -OP_REL_WITH_RANGE(<=) - -#undef OP_REL_WITH_RANGE - -/* Operators with ap_bit_ref. - * ------------------------------------------------------------ - */ -// arithmetic, bitwise and shift operators. -#define OP_BIN_WITH_BIT(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::RTYPE \ - operator BIN_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ - const ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ - return op1 BIN_OP ap_int_base<1, false>(op2); \ - } \ - template \ - INLINE typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP(const ap_bit_ref<_AP_W1, _AP_S1>& op1, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - return ap_int_base<1, false>(op1) BIN_OP op2; \ - } - -OP_BIN_WITH_BIT(+, plus) -OP_BIN_WITH_BIT(-, minus) -OP_BIN_WITH_BIT(*, mult) -OP_BIN_WITH_BIT(/, div) -OP_BIN_WITH_BIT(%, mod) -OP_BIN_WITH_BIT(&, logic) -OP_BIN_WITH_BIT(|, logic) -OP_BIN_WITH_BIT(^, logic) -OP_BIN_WITH_BIT(>>, arg1) -OP_BIN_WITH_BIT(<<, arg1) - -#undef OP_BIN_WITH_BIT - -// compound assignment operators. -#define OP_ASSIGN_WITH_BIT(ASSIGN_OP) \ - template \ - INLINE ap_int_base<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - ap_int_base<_AP_W1, _AP_S1>& op1, ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ - return op1 ASSIGN_OP ap_int_base<1, false>(op2); \ - } \ - template \ - INLINE ap_bit_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - ap_bit_ref<_AP_W1, _AP_S1>& op1, ap_int_base<_AP_W2, _AP_S2>& op2) { \ - ap_int_base<1, false> tmp(op1); \ - tmp ASSIGN_OP op2; \ - op1 = tmp; \ - return op1; \ - } - -OP_ASSIGN_WITH_BIT(+=) -OP_ASSIGN_WITH_BIT(-=) -OP_ASSIGN_WITH_BIT(*=) -OP_ASSIGN_WITH_BIT(/=) -OP_ASSIGN_WITH_BIT(%=) -OP_ASSIGN_WITH_BIT(&=) -OP_ASSIGN_WITH_BIT(|=) -OP_ASSIGN_WITH_BIT(^=) -OP_ASSIGN_WITH_BIT(>>=) -OP_ASSIGN_WITH_BIT(<<=) - -#undef OP_ASSIGN_WITH_BIT - -// equality and relational operators. 
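// ap_bit_ref operands act as 1-bit unsigned values in the mixed operators
// around this point, which makes popcount-style folds convenient. Hedged
// sketch (assumes user code includes <ap_int.h>):
#if 0
void bit_ref_sketch() {
  ap_uint<4> v = 0xA;  // 0b1010
  ap_uint<3> ones = 0;
  for (int i = 0; i < 4; ++i)
    ones += v[i];      // bit converts via operator bool; ones == 2
  bool msb_set = (v[3] == ap_uint<1>(1));  // relational form below
}
#endif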
-#define OP_REL_WITH_BIT(REL_OP) \ - template \ - INLINE bool operator REL_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ - const ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ - return op1 REL_OP ap_int_base<1, false>(op2); \ - } \ - template \ - INLINE bool operator REL_OP(const ap_bit_ref<_AP_W1, _AP_S1>& op1, \ - const ap_int_base<_AP_W2, _AP_S2>& op2) { \ - return ap_int_base<1, false>(op1) REL_OP op2; \ - } - -OP_REL_WITH_BIT(==) -OP_REL_WITH_BIT(!=) -OP_REL_WITH_BIT(>) -OP_REL_WITH_BIT(>=) -OP_REL_WITH_BIT(<) -OP_REL_WITH_BIT(<=) - -#undef OP_REL_WITH_BIT - - -/* Operators with ap_concat_ref. - * ------------------------------------------------------------ - */ -// arithmetic, bitwise and shift operators. -// bitwise operators are defined in struct. -// TODO specify whether to define arithmetic and bitwise operators. -#if 0 -#define OP_BIN_WITH_CONCAT(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_int_base<_AP_W3, _AP_S3>::template RType<_AP_W1 + _AP_W2, \ - false>::RTYPE \ - operator BIN_OP(const ap_int_base<_AP_W3, _AP_S3>& op1, \ - const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ - /* convert ap_concat_ref to ap_int_base */ \ - return op1 BIN_OP op2.get(); \ - } \ - template \ - INLINE typename ap_int_base<_AP_W1 + _AP_W2, \ - false>::template RType<_AP_W3, _AP_S3>::RTYPE \ - operator BIN_OP(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ - const ap_int_base<_AP_W3, _AP_S3>& op2) { \ - /* convert ap_concat_ref to ap_int_base */ \ - return op1.get() BIN_OP op2; \ - } - -OP_BIN_WITH_CONCAT(+, plus) -OP_BIN_WITH_CONCAT(-, minus) -OP_BIN_WITH_CONCAT(*, mult) -OP_BIN_WITH_CONCAT(/, div) -OP_BIN_WITH_CONCAT(%, mod) -OP_BIN_WITH_CONCAT(&, logic) -OP_BIN_WITH_CONCAT(|, logic) -OP_BIN_WITH_CONCAT(^, logic) -OP_BIN_WITH_CONCAT(>>, arg1) -OP_BIN_WITH_CONCAT(<<, arg1) - -#undef OP_BIN_WITH_CONCAT - -// compound assignment operators. -#define OP_ASSIGN_WITH_CONCAT(ASSIGN_OP) \ - template \ - INLINE typename ap_int_base<_AP_W3, _AP_S3>::template RType<_AP_W1 + _AP_W2, \ - false>::RTYPE \ - operator ASSIGN_OP( \ - const ap_int_base<_AP_W3, _AP_S3>& op1, \ - const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ - /* convert ap_concat_ref to ap_int_base */ \ - return op1 ASSIGN_OP op2.get(); \ - } \ - template \ - INLINE typename ap_int_base<_AP_W1 + _AP_W2, \ - false>::template RType<_AP_W3, _AP_S3>::RTYPE \ - operator ASSIGN_OP(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ - const ap_int_base<_AP_W3, _AP_S3>& op2) { \ - /* convert ap_concat_ref to ap_int_base */ \ - ap_int_base<_AP_W1 + _AP_W2, false> tmp = op1.get(); \ - tmp ASSIGN_OP op2; \ - op1 = tmp; \ - return op1; \ - } - -OP_ASSIGN_WITH_CONCAT(+=) -OP_ASSIGN_WITH_CONCAT(-=) -OP_ASSIGN_WITH_CONCAT(*=) -OP_ASSIGN_WITH_CONCAT(/=) -OP_ASSIGN_WITH_CONCAT(%=) -OP_ASSIGN_WITH_CONCAT(&=) -OP_ASSIGN_WITH_CONCAT(|=) -OP_ASSIGN_WITH_CONCAT(^=) -OP_ASSIGN_WITH_CONCAT(>>=) -OP_ASSIGN_WITH_CONCAT(<<=) - -#undef OP_ASSIGN_WITH_CONCAT -#endif - -// equality and relational operators. 
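// For the concat-reference relationals defined just below: (hi, lo) builds
// an ap_concat_ref whose value is hi:lo, and .get() materializes it as an
// ap_int_base for the comparison. Hedged sketch (assumes <ap_int.h>):
#if 0
void concat_rel_sketch() {
  ap_uint<4> hi = 0xA, lo = 0x5;
  bool eq = ((hi, lo) == ap_uint<8>(0xA5));  // true
}
#endif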
-#define OP_REL_WITH_CONCAT(REL_OP) \
-  template <int _AP_W1, typename _AP_T1, int _AP_W2, typename _AP_T2, int _AP_W3, bool _AP_S3> \
-  INLINE bool operator REL_OP( \
-      const ap_int_base<_AP_W3, _AP_S3>& op1, \
-      const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \
-    /* convert ap_concat_ref to ap_int_base */ \
-    return op1 REL_OP op2.get(); \
-  } \
-  template <int _AP_W1, typename _AP_T1, int _AP_W2, typename _AP_T2, int _AP_W3, bool _AP_S3> \
-  INLINE bool operator REL_OP( \
-      const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \
-      const ap_int_base<_AP_W3, _AP_S3>& op2) { \
-    /* convert ap_concat_ref to ap_int_base */ \
-    return op1.get() REL_OP op2; \
-  }
-
-OP_REL_WITH_CONCAT(==)
-OP_REL_WITH_CONCAT(!=)
-OP_REL_WITH_CONCAT(>)
-OP_REL_WITH_CONCAT(>=)
-OP_REL_WITH_CONCAT(<)
-OP_REL_WITH_CONCAT(<=)
-
-#undef OP_REL_WITH_CONCAT
-
-#endif // ifndef __cplusplus
-#endif // ifndef __AP_INT_BASE_H__
-
-// -*- cpp -*-
+/*
+ * Copyright 2011-2019 Xilinx, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __AP_INT_BASE_H__
+#define __AP_INT_BASE_H__
+
+#ifndef __AP_INT_H__
+#error "Only ap_fixed.h and ap_int.h can be included directly in user code."
+#endif
+
+#ifndef __cplusplus
+#error "C++ is required to include this header file"
+#else
+
+#include <ap_common.h>
+#ifndef __SYNTHESIS__
+#if _AP_ENABLE_HALF_ == 1
+#include <hls_half.h>
+#endif
+#include <iostream>
+#include <string.h>
+#endif
+
+/* ----------------------------------------------------------------
+ * ap_int_base: AutoPilot integer/Arbitrary precision integer.
+ * ----------------------------------------------------------------
+ */
+
+/* helper trait. Selecting the smallest C type that can hold the value,
+ * return 64 bit C type if not possible.
+ */
+template <int _AP_N, bool _AP_S>
+struct retval;
+
+// at least 64 bit
+template <int _AP_N>
+struct retval<_AP_N, true> {
+  typedef ap_slong Type;
+};
+
+template <int _AP_N>
+struct retval<_AP_N, false> {
+  typedef ap_ulong Type;
+};
+
+// at least 8 bit
+template <>
+struct retval<1, true> {
+  typedef signed char Type;
+};
+
+template <>
+struct retval<1, false> {
+  typedef unsigned char Type;
+};
+
+// at least 16 bit
+template <>
+struct retval<2, true> {
+  typedef short Type;
+};
+
+template <>
+struct retval<2, false> {
+  typedef unsigned short Type;
+};
+
+// at least 32 bit
+template <>
+struct retval<3, true> {
+  typedef long Type;
+};
+
+template <>
+struct retval<3, false> {
+  typedef unsigned long Type;
+};
+
+template <>
+struct retval<4, true> {
+  typedef long Type;
+};
+
+template <>
+struct retval<4, false> {
+  typedef unsigned long Type;
+};
+
+// trait for letting base class to return derived class.
+// Notice that derived class template is incomplete, and we cannot use
+// the member of the derived class.
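// For instance, with the factory defined just below (hedged sketch,
// assuming <ap_int.h> is included by user code):
#if 0
typedef _ap_int_factory<8, true>::type  t1;  // yields ap_int<8>
typedef _ap_int_factory<8, false>::type t2;  // yields ap_uint<8>
#endif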
+template +struct _ap_int_factory; +template +struct _ap_int_factory<_AP_W2,true> { typedef ap_int<_AP_W2> type; }; +template +struct _ap_int_factory<_AP_W2,false> { typedef ap_uint<_AP_W2> type; }; + +template +struct ap_int_base : public _AP_ROOT_TYPE<_AP_W, _AP_S> { + public: + typedef _AP_ROOT_TYPE<_AP_W, _AP_S> Base; + + /* ap_int_base<_AP_W, _AP_S, true> + * typedef typename retval<(_AP_W + 7) / 8, _AP_S>::Type RetType; + * + * ap_int_base<_AP_W, _AP_S, false> + * typedef typename retval<8, _AP_S>::Type RetType; + */ + typedef typename retval::Type RetType; + + static const int width = _AP_W; + + template + struct RType { + enum { + mult_w = _AP_W + _AP_W2, + mult_s = _AP_S || _AP_S2, + plus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + plus_s = _AP_S || _AP_S2, + minus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + minus_s = true, + div_w = _AP_W + _AP_S2, + div_s = _AP_S || _AP_S2, + mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)), + mod_s = _AP_S, + logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)), + logic_s = _AP_S || _AP_S2 + }; + + + typedef ap_int_base mult_base; + typedef ap_int_base plus_base; + typedef ap_int_base minus_base; + typedef ap_int_base logic_base; + typedef ap_int_base div_base; + typedef ap_int_base mod_base; + typedef ap_int_base<_AP_W, _AP_S> arg1_base; + + typedef typename _ap_int_factory::type mult; + typedef typename _ap_int_factory::type plus; + typedef typename _ap_int_factory::type minus; + typedef typename _ap_int_factory::type logic; + typedef typename _ap_int_factory::type div; + typedef typename _ap_int_factory::type mod; + typedef typename _ap_int_factory<_AP_W, _AP_S>::type arg1; + typedef bool reduce; + }; + + /* Constructors. + * ---------------------------------------------------------------- + */ + /// default ctor + INLINE ap_int_base() { + /* + #ifdef __SC_COMPATIBLE__ + Base::V = 0; + #endif + */ + } + + /// copy ctor + template + INLINE ap_int_base(const ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + + /// volatile copy ctor + template + INLINE ap_int_base(const volatile ap_int_base<_AP_W2, _AP_S2>& op) { + Base::V = op.V; + } + +// XXX C++11 feature. +// The explicit specifier specifies that a constructor or conversion function +// (since C++11) doesn't allow implicit conversions or copy-initialization. +// ap_int_base x = 1; +// ap_int_base foo() { return 1; } +// but allows +// ap_int_base x(1); +// ap_int_base y {1}; + +/// from all c types. +#define CTOR_FROM_INT(Type, Size, Signed) \ + INLINE ap_int_base(const Type op) { Base::V = op; } + + CTOR_FROM_INT(bool, 1, false) + CTOR_FROM_INT(char, 8, CHAR_IS_SIGNED) + CTOR_FROM_INT(signed char, 8, true) + CTOR_FROM_INT(unsigned char, 8, false) + CTOR_FROM_INT(short, _AP_SIZE_short, true) + CTOR_FROM_INT(unsigned short, _AP_SIZE_short, false) + CTOR_FROM_INT(int, _AP_SIZE_int, true) + CTOR_FROM_INT(unsigned int, _AP_SIZE_int, false) + CTOR_FROM_INT(long, _AP_SIZE_long, true) + CTOR_FROM_INT(unsigned long, _AP_SIZE_long, false) + CTOR_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) + CTOR_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) +#undef CTOR_FROM_INT + +#if _AP_ENABLE_HALF_ == 1 + /// ctor from half. + // TODO optimize + INLINE ap_int_base(half op) { + ap_int_base<_AP_W, _AP_S> t((float)op); + Base::V = t.V; + } +#endif + + /// ctor from float. 
+ INLINE ap_int_base(float op) { + const int BITS = FLOAT_MAN + FLOAT_EXP + 1; + ap_int_base reg; + reg.V = floatToRawBits(op); + bool is_neg = _AP_ROOT_op_get_bit(reg.V, BITS - 1); + + ap_int_base exp = 0; + exp.V = _AP_ROOT_op_get_range(reg.V, FLOAT_MAN, BITS - 2); + exp = exp - FLOAT_BIAS; + + ap_int_base man; + man.V = _AP_ROOT_op_get_range(reg.V, 0, FLOAT_MAN - 1); + // check for NaN + _AP_WARNING(exp == ((unsigned char)(FLOAT_BIAS + 1)) && man.V != 0, + "assign NaN to ap integer value"); + // set leading 1. + man.V = _AP_ROOT_op_set_bit(man.V, FLOAT_MAN, 1); + //if (is_neg) man = -man; + + if ((reg.V & 0x7ffffffful) == 0) { + Base::V = 0; + } else { + int sh_amt = FLOAT_MAN - exp.V; + if (sh_amt == 0) { + Base::V = man.V; + } else if (sh_amt > 0) { + if (sh_amt < FLOAT_MAN + 2) { + Base::V = man.V >> sh_amt; + } else { + if (is_neg) + Base::V = -1; + else + Base::V = 0; + } + } else { + sh_amt = -sh_amt; + if (sh_amt < _AP_W) { + Base::V = man.V; + Base::V <<= sh_amt; + } else { + Base::V = 0; + } + } + } + if (is_neg) *this = -(*this); + } + + /// ctor from double. + INLINE ap_int_base(double op) { + const int BITS = DOUBLE_MAN + DOUBLE_EXP + 1; + ap_int_base reg; + reg.V = doubleToRawBits(op); + bool is_neg = _AP_ROOT_op_get_bit(reg.V, BITS - 1); + + ap_int_base exp = 0; + exp.V = _AP_ROOT_op_get_range(reg.V, DOUBLE_MAN, BITS - 2); + exp = exp - DOUBLE_BIAS; + + ap_int_base man; + man.V = _AP_ROOT_op_get_range(reg.V, 0, DOUBLE_MAN - 1); + // check for NaN + _AP_WARNING(exp == ((unsigned char)(DOUBLE_BIAS + 1)) && man.V != 0, + "assign NaN to ap integer value"); + // set leading 1. + man.V = _AP_ROOT_op_set_bit(man.V, DOUBLE_MAN, 1); + //if (is_neg) man = -man; + + if ((reg.V & 0x7fffffffffffffffull) == 0) { + Base::V = 0; + } else { + int sh_amt = DOUBLE_MAN - exp.V; + if (sh_amt == 0) { + Base::V = man.V; + } else if (sh_amt > 0) { + if (sh_amt < DOUBLE_MAN + 2) { + Base::V = man.V >> sh_amt; + } else { + if (is_neg) + Base::V = -1; + else + Base::V = 0; + } + } else { + sh_amt = -sh_amt; + if (sh_amt < _AP_W) { + Base::V = man.V; + Base::V <<= sh_amt; + } else { + Base::V = 0; + } + } + } + if (is_neg) *this = -(*this); + } + + /// from higer rank type. + template + INLINE ap_int_base( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = op.to_ap_int_base().V; + } + + template + INLINE ap_int_base(const ap_range_ref<_AP_W2, _AP_S2>& ref) { + Base::V = (ref.get()).V; + } + + template + INLINE ap_int_base(const ap_bit_ref<_AP_W2, _AP_S2>& ref) { + Base::V = ref.operator bool(); + } + + template + INLINE ap_int_base(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { + const ap_int_base::_AP_WR, + false> + tmp = ref.get(); + Base::V = tmp.V; + } + + /* radix has default value in set */ + +#ifndef __SYNTHESIS__ + INLINE ap_int_base(const char* s, signed char rd = 0) { + if (rd == 0) + rd = guess_radix(s); + unsigned int length = strlen(s); + Base::V.fromString(s, length, rd); + } +#else + // XXX __builtin_bit_from_string(...) requires const C string and radix. 
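// A hedged sketch of the string constructors (assumes user code includes
// <ap_int.h>). With the default radix of 0, the non-synthesis constructor
// above calls guess_radix(), so a prefix such as "0x" should select the
// base; an explicit radix argument skips the guess:
#if 0
void string_ctor_sketch() {
  ap_int<16> a("0x1A");    // radix presumably guessed as 16 -> 26
  ap_int<16> b("-12");     // decimal guess -> -12
  ap_int<16> c("101", 2);  // explicit binary -> 5
}
#endif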
+ INLINE ap_int_base(const char* s) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), 10, _AP_W, _AP_S, + AP_TRN, AP_WRAP, 0, _AP_C99); + Base::V = t; + } + INLINE ap_int_base(const char* s, signed char rd) { + typeof(Base::V) t; + _ssdm_string2bits((void*)(&t), (const char*)(s), rd, _AP_W, _AP_S, + AP_TRN, AP_WRAP, 0, _AP_C99); + Base::V = t; + } +#endif + + template + INLINE ap_int_base( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + Base::V = (val.get()).V; + } + + template + INLINE ap_int_base( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + Base::V = val.operator bool(); + } + + INLINE ap_int_base read() volatile { + /*AP_DEBUG(printf("call read %d\n", Base::V););*/ + ap_int_base ret; + ret.V = Base::V; + return ret; + } + + INLINE void write(const ap_int_base<_AP_W, _AP_S>& op2) volatile { + /*AP_DEBUG(printf("call write %d\n", op2.V););*/ + Base::V = op2.V; + } + + /* Another form of "write".*/ + template + INLINE void operator=( + const volatile ap_int_base<_AP_W2, _AP_S2>& op2) volatile { + Base::V = op2.V; + } + + INLINE void operator=( + const volatile ap_int_base<_AP_W, _AP_S>& op2) volatile { + Base::V = op2.V; + } + + template + INLINE void operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) volatile { + Base::V = op2.V; + } + + INLINE void operator=(const ap_int_base<_AP_W, _AP_S>& op2) volatile { + Base::V = op2.V; + } + + template + INLINE ap_int_base& operator=( + const volatile ap_int_base<_AP_W2, _AP_S2>& op2) { + Base::V = op2.V; + return *this; + } + + template + INLINE ap_int_base& operator=(const ap_int_base<_AP_W2, _AP_S2>& op2) { + Base::V = op2.V; + return *this; + } + + INLINE ap_int_base& operator=(const volatile ap_int_base<_AP_W, _AP_S>& op2) { + Base::V = op2.V; + return *this; + } + + INLINE ap_int_base& operator=(const ap_int_base<_AP_W, _AP_S>& op2) { + Base::V = op2.V; + return *this; + } + + +#define ASSIGN_OP_FROM_INT(Type, Size, Signed) \ + INLINE ap_int_base& operator=(Type op) { \ + Base::V = op; \ + return *this; \ + } + + ASSIGN_OP_FROM_INT(bool, 1, false) + ASSIGN_OP_FROM_INT(char, 8, CHAR_IS_SIGNED) + ASSIGN_OP_FROM_INT(signed char, 8, true) + ASSIGN_OP_FROM_INT(unsigned char, 8, false) + ASSIGN_OP_FROM_INT(short, _AP_SIZE_short, true) + ASSIGN_OP_FROM_INT(unsigned short, _AP_SIZE_short, false) + ASSIGN_OP_FROM_INT(int, _AP_SIZE_int, true) + ASSIGN_OP_FROM_INT(unsigned int, _AP_SIZE_int, false) + ASSIGN_OP_FROM_INT(long, _AP_SIZE_long, true) + ASSIGN_OP_FROM_INT(unsigned long, _AP_SIZE_long, false) + ASSIGN_OP_FROM_INT(ap_slong, _AP_SIZE_ap_slong, true) + ASSIGN_OP_FROM_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef ASSIGN_OP_FROM_INT + + template + INLINE ap_int_base& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& op2) { + Base::V = (bool)op2; + return *this; + } + + template + INLINE ap_int_base& operator=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + Base::V = (ap_int_base<_AP_W2, false>(op2)).V; + return *this; + } + + template + INLINE ap_int_base& operator=( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& op2) { + Base::V = op2.get().V; + return *this; + } + + template + INLINE ap_int_base& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = op.to_ap_int_base().V; + return *this; + } + + template + INLINE ap_int_base& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = (bool)op; + return *this; + } + + template + INLINE ap_int_base& operator=( + const 
af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& op) { + Base::V = ((const ap_int_base<_AP_W2, false>)(op)).V; + return *this; + } + + // FIXME: UG902 has clearly required user to use to_int() to convert to built-in + // types, but this implicit conversion is relied on in hls_cordic.h and hls_rsr.h. + // For example: + // int d_exp = fps_x.exp - fps_y.exp; + INLINE operator RetType() const { return (RetType)(Base::V); } + + /* Explicit conversions to C types. + * ---------------------------------------------------------------- + */ + INLINE bool to_bool() const { return (bool)(Base::V); } + INLINE char to_char() const { return (char)(Base::V); } + INLINE signed char to_schar() const { return (signed char)(Base::V); } + INLINE unsigned char to_uchar() const { return (unsigned char)(Base::V); } + INLINE short to_short() const { return (short)(Base::V); } + INLINE unsigned short to_ushort() const { return (unsigned short)(Base::V); } + INLINE int to_int() const { return (int)(Base::V); } + INLINE unsigned to_uint() const { return (unsigned)(Base::V); } + INLINE long to_long() const { return (long)(Base::V); } + INLINE unsigned long to_ulong() const { return (unsigned long)(Base::V); } + INLINE ap_slong to_int64() const { return (ap_slong)(Base::V); } + INLINE ap_ulong to_uint64() const { return (ap_ulong)(Base::V); } + INLINE float to_float() const { return (float)(Base::V); } + INLINE double to_double() const { return (double)(Base::V); } + + // TODO decide if user-defined conversion should be provided. +#if 0 + INLINE operator char() const { return (char)(Base::V); } + INLINE operator signed char() const { return (signed char)(Base::V); } + INLINE operator unsigned char() const { return (unsigned char)(Base::V); } + INLINE operator short() const { return (short)(Base::V); } + INLINE operator unsigned short() const { return (unsigned short)(Base::V); } + INLINE operator int() const { return (int)(Base::V); } + INLINE operator unsigned int () const { return (unsigned)(Base::V); } + INLINE operator long () const { return (long)(Base::V); } + INLINE operator unsigned long () const { return (unsigned long)(Base::V); } + INLINE operator ap_slong () { return (ap_slong)(Base::V); } + INLINE operator ap_ulong () { return (ap_ulong)(Base::V); } +#endif + + /* Helper methods. + ---------------------------------------------------------------- + */ + /* we cannot call a non-volatile function on a volatile instance. + * but calling a volatile function is ok. + * XXX deleted non-volatile version. 
+ */ + INLINE int length() const volatile { return _AP_W; } + + /*Return true if the value of ap_int_base instance is zero*/ + INLINE bool iszero() const { return Base::V == 0; } + + /*Return true if the value of ap_int_base instance is zero*/ + INLINE bool is_zero() const { return Base::V == 0; } + + /* x < 0 */ + INLINE bool sign() const { + if (_AP_S && + _AP_ROOT_op_get_bit(Base::V, _AP_W - 1)) + return true; + else + return false; + } + + /* x[i] = 0 */ + INLINE void clear(int i) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 0); + } + + /* x[i] = !x[i]*/ + INLINE void invert(int i) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + bool val = _AP_ROOT_op_get_bit(Base::V, i); + if (val) + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 0); + else + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 1); + } + + INLINE bool test(int i) const { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + return _AP_ROOT_op_get_bit(Base::V, i); + } + + // Get self. For ap_concat_ref expansion. + INLINE ap_int_base& get() { return *this; } + + // Set the ith bit into 1 + INLINE void set(int i) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + Base::V = _AP_ROOT_op_set_bit(Base::V, i, 1); + } + + // Set the ith bit into v + INLINE void set(int i, bool v) { + AP_ASSERT(i >= 0 && i < _AP_W, "position out of range"); + Base::V = _AP_ROOT_op_set_bit(Base::V, i, v); + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_int_base object n places to the left + INLINE ap_int_base& lrotate(int n) { + AP_ASSERT(n >= 0 && n < _AP_W, "shift value out of range"); + // TODO unify this. +#ifdef __SYNTHESIS__ + typeof(Base::V) l_p = Base::V << n; + typeof(Base::V) r_p = Base::V >> (_AP_W - n); + Base::V = l_p | r_p; +#else + Base::V.lrotate(n); +#endif + return *this; + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_int_base object n places to the right + INLINE ap_int_base& rrotate(int n) { + AP_ASSERT(n >= 0 && n < _AP_W, "shift value out of range"); + // TODO unify this. +#ifdef __SYNTHESIS__ + typeof(Base::V) l_p = Base::V << (_AP_W - n); + typeof(Base::V) r_p = Base::V >> n; + Base::V = l_p | r_p; +#else + Base::V.rrotate(n); +#endif + return *this; + } + + // Reverse the contents of ap_int_base instance. + // I.e. LSB becomes MSB and vise versa. + INLINE ap_int_base& reverse() { + Base::V = _AP_ROOT_op_get_range(Base::V, _AP_W - 1, 0); + return *this; + } + + // Set the ith bit into v + INLINE void set_bit(int i, bool v) { + Base::V = _AP_ROOT_op_set_bit(Base::V, i, v); + } + + // Get the value of ith bit + INLINE bool get_bit(int i) const { + return (bool)_AP_ROOT_op_get_bit(Base::V, i); + } + + // complements every bit + INLINE void b_not() { Base::V = ~Base::V; } + +#define OP_ASSIGN_AP(Sym) \ + template \ + INLINE ap_int_base& operator Sym(const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + Base::V Sym op2.V; \ + return *this; \ + } + + /* Arithmetic assign. + * ---------------------------------------------------------------- + */ + OP_ASSIGN_AP(*=) + OP_ASSIGN_AP(+=) + OP_ASSIGN_AP(-=) + OP_ASSIGN_AP(/=) + OP_ASSIGN_AP(%=) +#undef OP_ASSIGN_AP + + /* Bitwise assign: and, or, xor. 
+ * ---------------------------------------------------------------- + */ +#define OP_ASSIGN_AP_CHK(Sym) \ + template \ + INLINE ap_int_base& operator Sym(const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + _AP_WARNING((_AP_W != _AP_W2), \ + "Bitsize mismatch for ap_[u]int" #Sym "ap_[u]int."); \ + Base::V Sym op2.V; \ + return *this; \ + } + OP_ASSIGN_AP_CHK(&=) + OP_ASSIGN_AP_CHK(|=) + OP_ASSIGN_AP_CHK(^=) +#undef OP_ASSIGN_AP_CHK + + /* Prefix increment, decrement. + * ---------------------------------------------------------------- + */ + INLINE ap_int_base& operator++() { + operator+=((ap_int_base<1, false>)1); + return *this; + } + INLINE ap_int_base& operator--() { + operator-=((ap_int_base<1, false>)1); + return *this; + } + + /* Postfix increment, decrement + * ---------------------------------------------------------------- + */ + INLINE const typename RType<_AP_W,_AP_S>::arg1 operator++(int) { + ap_int_base t = *this; + operator+=((ap_int_base<1, false>)1); + return t; + } + INLINE const typename RType<_AP_W,_AP_S>::arg1 operator--(int) { + ap_int_base t = *this; + operator-=((ap_int_base<1, false>)1); + return t; + } + + /* Unary arithmetic. + * ---------------------------------------------------------------- + */ + INLINE typename RType<_AP_W,_AP_S>::arg1 operator+() const { return *this; } + + // TODO used to be W>64 only... need check. + INLINE typename RType<1, false>::minus operator-() const { + return ap_int_base<1, false>(0) - *this; + } + + /* Not (!) + * ---------------------------------------------------------------- + */ + INLINE bool operator!() const { return Base::V == 0; } + + /* Bitwise (arithmetic) unary: complement + ---------------------------------------------------------------- + */ + // XXX different from Mentor's ac_int! + INLINE typename RType<_AP_W,_AP_S>::arg1 operator~() const { + ap_int_base<_AP_W, _AP_S> r; + r.V = ~Base::V; + return r; + } + + /* Shift (result constrained by left operand). + * ---------------------------------------------------------------- + */ + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, true>& op2) const { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator>>(sh); + } else + return operator<<(sh); + } + + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator<<(const ap_int_base<_AP_W2, false>& op2) const { + ap_int_base r; + r.V = Base::V << op2.to_uint(); + return r; + } + + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, true>& op2) const { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator<<(sh); + } + return operator>>(sh); + } + + template + INLINE typename RType<_AP_W,_AP_S>::arg1 operator>>(const ap_int_base<_AP_W2, false>& op2) const { + ap_int_base r; + r.V = Base::V >> op2.to_uint(); + return r; + } + + // FIXME we standalone operator>> for ap_int_base and ap_range_ref. 
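Note the convention in the shift operators just above: a signed shift amount is never left undefined; a negative count simply shifts the other way (operator<< with a negative op2 dispatches to operator>>, and vice versa). The same rule as a standalone one-liner, with my own function name:

#include <cstdio>

// Left shift by n; a negative n shifts right instead, matching the
// signed-amount overloads of operator<< / operator>> above (|n| < 32).
unsigned shl_signed(unsigned v, int n) {
    return n >= 0 ? (v << n) : (v >> -n);
}

int main() {
    std::printf("%u %u\n", shl_signed(4, 2), shl_signed(4, -1)); // 16 2
}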
+#if 0 + template + INLINE ap_int_base operator<<(const ap_range_ref<_AP_W2, _AP_S2>& op2) const { + return *this << (op2.operator ap_int_base<_AP_W2, false>()); + } + + template + INLINE ap_int_base operator>>(const ap_range_ref<_AP_W2, _AP_S2>& op2) const { + return *this >> (op2.operator ap_int_base<_AP_W2, false>()); + } +#endif + + /* Shift assign + * ---------------------------------------------------------------- + */ + template + INLINE ap_int_base& operator<<=(const ap_int_base<_AP_W2, true>& op2) { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator>>=(sh); + } else + return operator<<=(sh); + } + + template + INLINE ap_int_base& operator<<=(const ap_int_base<_AP_W2, false>& op2) { + Base::V <<= op2.to_uint(); + return *this; + } + + template + INLINE ap_int_base& operator>>=(const ap_int_base<_AP_W2, true>& op2) { + bool isNeg = _AP_ROOT_op_get_bit(op2.V, _AP_W2 - 1); + ap_int_base<_AP_W2, false> sh = op2; + if (isNeg) { + sh = -op2; + return operator<<=(sh); + } + return operator>>=(sh); + } + + template + INLINE ap_int_base& operator>>=(const ap_int_base<_AP_W2, false>& op2) { + Base::V >>= op2.to_uint(); + return *this; + } + + // FIXME we standalone operator>> for ap_int_base and ap_range_ref. +#if 0 + template + INLINE ap_int_base& operator<<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return *this <<= (op2.operator ap_int_base<_AP_W2, false>()); + } + template + INLINE ap_int_base& operator>>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return *this >>= (op2.operator ap_int_base<_AP_W2, false>()); + } +#endif + + /* Equality and Relational. + * ---------------------------------------------------------------- + */ + template + INLINE bool operator==(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V == op2.V; + } + template + INLINE bool operator!=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return !(Base::V == op2.V); + } + template + INLINE bool operator<(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V < op2.V; + } + template + INLINE bool operator>=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V >= op2.V; + } + template + INLINE bool operator>(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V > op2.V; + } + template + INLINE bool operator<=(const ap_int_base<_AP_W2, _AP_S2>& op2) const { + return Base::V <= op2.V; + } + + /* Bit and Part Select + * ---------------------------------------------------------------- + */ + INLINE ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { + _AP_ERROR(Hi >= _AP_W, "Hi(%d)out of bound(%d) in range()", Hi, _AP_W); + _AP_ERROR(Lo >= _AP_W, "Lo(%d)out of bound(%d) in range()", Lo, _AP_W); + return ap_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + // This is a must to strip constness to produce reference type. 
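range(Hi, Lo) above does not copy bits out; it hands back an ap_range_ref proxy that records the host object and the two indices, so the selected bits are only materialized when the proxy is read or written (the proxy itself lives in the ap_int_ref.h hunk further down). The read side ultimately reduces to a mask-and-shift; a standalone sketch with my own helper name:

#include <cstdint>
#include <cstdio>

// Extract bits [hi:lo] of v, roughly what _AP_ROOT_op_get_range
// performs when an ap_range_ref proxy is read.
uint64_t get_range(uint64_t v, int hi, int lo) {
    int w = hi - lo + 1;
    uint64_t mask = w >= 64 ? ~0ull : (1ull << w) - 1;
    return (v >> lo) & mask;
}

int main() {
    std::printf("%llx\n", (unsigned long long)get_range(0xABCD, 11, 4)); // bc
}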
+ INLINE ap_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { + _AP_ERROR(Hi >= _AP_W, "Hi(%d)out of bound(%d) in range()", Hi, _AP_W); + _AP_ERROR(Lo >= _AP_W, "Lo(%d)out of bound(%d) in range()", Lo, _AP_W); + return ap_range_ref<_AP_W, _AP_S>(const_cast(this), Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> range( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE ap_range_ref<_AP_W, _AP_S> range() { + return this->range(_AP_W - 1, 0); + } + + INLINE ap_range_ref<_AP_W, _AP_S> range() const { + return this->range(_AP_W - 1, 0); + } + + INLINE ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { + return this->range(Hi, Lo); + } + + INLINE ap_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { + return this->range(Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S> operator()( + const ap_int_base<_AP_W2, _AP_S2>& HiIdx, + const ap_int_base<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + +#if 0 + template + INLINE ap_int_base slice() const { + AP_ASSERT(Hi >= Lo && Hi < _AP_W && Lo < _AP_W, "Out of bounds in slice()"); + ap_int_base tmp ; + tmp.V = _AP_ROOT_op_get_range(Base::V, Lo, Hi); + return tmp; + } + + INLINE ap_bit_ref<_AP_W,_AP_S> operator [] ( unsigned int uindex) { + AP_ASSERT(uindex < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W,_AP_S> bvh( this, uindex ); + return bvh; + } +#endif + + INLINE ap_bit_ref<_AP_W, _AP_S> operator[](int index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index); + return bvh; + } + + template + INLINE ap_bit_ref<_AP_W, _AP_S> operator[]( + const ap_int_base<_AP_W2, _AP_S2>& index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); + return bvh; + } + + INLINE bool operator[](int index) const { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> br(this, index); + return br.to_bool(); + } + template + INLINE bool operator[](const ap_int_base<_AP_W2, _AP_S2>& index) const { + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> br(this, index.to_int()); + return br.to_bool(); + } + + INLINE ap_bit_ref<_AP_W, _AP_S> bit(int index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index); + return bvh; + } + template + INLINE ap_bit_ref<_AP_W, _AP_S> bit( + const ap_int_base<_AP_W2, _AP_S2>& index) { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + 
AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> bvh(this, index.to_int()); + return bvh; + } + + INLINE bool bit(int index) const { + AP_ASSERT(index >= 0, "Attempting to read bit with negative index"); + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W, _AP_S> br(this, index); + return br.to_bool(); + } + + template + INLINE bool bit(const ap_int_base<_AP_W2, _AP_S2>& index) const { + return bit(index.to_int()); + } + +#if 0 + template + INLINE bool operator[](_AP_T index) const { + AP_ASSERT(index < _AP_W, "Attempting to read bit beyond MSB"); + ap_bit_ref<_AP_W,_AP_S> br = operator[](index); + return br.to_bool(); + } +#endif + + // Count the number of zeros from the most significant bit + // to the first one bit. + INLINE int countLeadingZeros() { +#ifdef __SYNTHESIS__ + if (_AP_W <= 32) { + ap_int_base<32, false> t(-1UL), x; + x.V = _AP_ROOT_op_get_range(this->V, _AP_W - 1, 0); // reverse + t.V = _AP_ROOT_op_set_range(t.V, 0, _AP_W - 1, x.V); + return __builtin_ctz(t.V); // count trailing zeros. + } else if (_AP_W <= 64) { + ap_int_base<64, false> t(-1ULL); + ap_int_base<64, false> x; + x.V = _AP_ROOT_op_get_range(this->V, _AP_W - 1, 0); // reverse + t.V = _AP_ROOT_op_set_range(t.V, 0, _AP_W - 1, x.V); + return __builtin_ctzll(t.V); // count trailing zeros. + } else { + enum { __N = (_AP_W + 63) / 64 }; + int NZeros = 0; + int i = 0; + bool hitNonZero = false; + for (i = 0; i < __N - 1; ++i) { + ap_int_base<64, false> t; + t.V = _AP_ROOT_op_get_range(this->V, _AP_W - i * 64 - 64, _AP_W - i * 64 - 1); + NZeros += hitNonZero ? 0 : __builtin_clzll(t.V); // count leading zeros. + hitNonZero |= (t.V != 0); + } + if (!hitNonZero) { + ap_int_base<64, false> t(-1ULL); + enum { REST = (_AP_W - 1) % 64 }; + ap_int_base<64, false> x; + x.V = _AP_ROOT_op_get_range(this->V, 0, REST); + t.V = _AP_ROOT_op_set_range(t.V, 63 - REST, 63, x.V); + NZeros += __builtin_clzll(t.V); + } + return NZeros; + } +#else + return (Base::V).countLeadingZeros(); +#endif + } // countLeadingZeros + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + concat(const ap_int_base<_AP_W2, _AP_S2>& a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + concat(ap_int_base<_AP_W2, _AP_S2>& a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >(*this, a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) const { + return 
ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) const { + return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, a2); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + const_cast&>(*this), + const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_int_base, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, + a2); + } + + template + INLINE ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) const { + return ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + const_cast&>(*this), + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + INLINE ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + _AP_W, ap_int_base, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, + a2); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) const { + return ap_concat_ref< + _AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + const_cast&>(*this), + const_cast&>( + a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + _AP_W, ap_int_base, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); + } + + template + INLINE ap_int_base operator&( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { + return *this & a2.get(); + } + + template + INLINE ap_int_base operator|( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, 
_AP_T3>& a2) { + return *this | a2.get(); + } + + template + INLINE ap_int_base operator^( + const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { + return *this ^ a2.get(); + } + + template + INLINE void set(const ap_int_base<_AP_W3, false>& val) { + Base::V = val.V; + } + + /* Reduce operations. + * ---------------------------------------------------------------- + */ + // XXX non-const version deleted. + INLINE bool and_reduce() const { return _AP_ROOT_op_reduce(and, Base::V); } + INLINE bool nand_reduce() const { return _AP_ROOT_op_reduce(nand, Base::V); } + INLINE bool or_reduce() const { return _AP_ROOT_op_reduce(or, Base::V); } + INLINE bool nor_reduce() const { return !(_AP_ROOT_op_reduce(or, Base::V)); } + INLINE bool xor_reduce() const { return _AP_ROOT_op_reduce (xor, Base::V); } + INLINE bool xnor_reduce() const { + return !(_AP_ROOT_op_reduce (xor, Base::V)); + } + + /* Output as a string. + * ---------------------------------------------------------------- + */ +#ifndef __SYNTHESIS__ + std::string to_string(signed char rd = 2, bool sign = _AP_S) const { + // XXX in autosim/autowrap.tcl "(${name}).to_string(2).c_str()" is used to + // initialize sc_lv, which seems incapable of handling format "-0b". + if (rd == 2) sign = false; + return (Base::V).to_string(rd, sign); + } +#else + INLINE char* to_string(signed char rd = 2, bool sign = _AP_S) const { + return 0; + } +#endif +}; // struct ap_int_base + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<(std::ostream& os, + const ap_int_base<_AP_W, _AP_S>& x) { + std::ios_base::fmtflags ff = std::cout.flags(); + if (ff & std::cout.hex) { + os << x.to_string(16); // don't print sign + } else if (ff & std::cout.oct) { + os << x.to_string(8); // don't print sign + } else { + os << x.to_string(10); + } + return os; +} +#endif // ifndef __SYNTHESIS__ + +#ifndef __SYNTHESIS__ +template +INLINE std::istream& operator>>(std::istream& in, + ap_int_base<_AP_W, _AP_S>& op) { + std::string str; + in >> str; + const std::ios_base::fmtflags basefield = in.flags() & std::ios_base::basefield; + unsigned radix = (basefield == std::ios_base::dec) ? 0 : ( + (basefield == std::ios_base::oct) ? 8 : ( + (basefield == std::ios_base::hex) ? 16 : 0)); + op = ap_int_base<_AP_W, _AP_S>(str.c_str(), radix); + return in; +} +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_AUTOCC + +/* Operators with another ap_int_base. 
+ * ---------------------------------------------------------------- + */ +#define OP_BIN_AP(Sym, Rty) \ + template \ + INLINE \ + typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::Rty \ + operator Sym(const ap_int_base<_AP_W, _AP_S>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base lhs(op); \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base rhs(op2); \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base ret; \ + ret.V = lhs.V Sym rhs.V; \ + return ret; \ + } + +OP_BIN_AP(*, mult) +OP_BIN_AP(+, plus) +OP_BIN_AP(-, minus) +OP_BIN_AP(&, logic) +OP_BIN_AP(|, logic) +OP_BIN_AP(^, logic) + +#define OP_BIN_AP2(Sym, Rty) \ + template \ + INLINE \ + typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, _AP_S2>::Rty \ + operator Sym(const ap_int_base<_AP_W, _AP_S>& op, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + typename ap_int_base<_AP_W, _AP_S>::template RType< \ + _AP_W2, _AP_S2>::Rty##_base ret; \ + ret.V = op.V Sym op2.V; \ + return ret; \ + } + +OP_BIN_AP2(/, div) +OP_BIN_AP2(%, mod) + +// shift operators are defined inside class. +// compound assignment operators are defined inside class. + +/* Operators with a pointer type. + * ---------------------------------------------------------------- + * char a[100]; + * char* ptr = a; + * ap_int<2> n = 3; + * char* ptr2 = ptr + n*2; + * avoid ambiguous errors. + */ +#define OP_BIN_WITH_PTR(BIN_OP) \ + template \ + INLINE PTR_TYPE* operator BIN_OP(PTR_TYPE* i_op, \ + const ap_int_base<_AP_W, _AP_S>& op) { \ + ap_slong op2 = op.to_int64(); /* Not all implementation */ \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE PTR_TYPE* operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, \ + PTR_TYPE* i_op) { \ + ap_slong op2 = op.to_int64(); /* Not all implementation */ \ + return op2 BIN_OP i_op; \ + } + +OP_BIN_WITH_PTR(+) +OP_BIN_WITH_PTR(-) + +/* Operators with a native floating point types. + * ---------------------------------------------------------------- + */ +// float OP ap_int +// when ap_int's width > 64, then trunc ap_int to ap_int<64> +#define OP_BIN_WITH_FLOAT(BIN_OP, C_TYPE) \ + template \ + INLINE C_TYPE operator BIN_OP(C_TYPE i_op, \ + const ap_int_base<_AP_W, _AP_S>& op) { \ + typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE C_TYPE operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, \ + C_TYPE i_op) { \ + typename ap_int_base<_AP_W, _AP_S>::RetType op2 = op; \ + return op2 BIN_OP i_op; \ + } + +#define ALL_OP_WITH_FLOAT(C_TYPE) \ + OP_BIN_WITH_FLOAT(*, C_TYPE) \ + OP_BIN_WITH_FLOAT(/, C_TYPE) \ + OP_BIN_WITH_FLOAT(+, C_TYPE) \ + OP_BIN_WITH_FLOAT(-, C_TYPE) + +#if _AP_ENABLE_HALF_ == 1 +ALL_OP_WITH_FLOAT(half) +#endif +ALL_OP_WITH_FLOAT(float) +ALL_OP_WITH_FLOAT(double) + +// TODO no shift? + +/* Operators with a native integral types. + * ---------------------------------------------------------------- + */ +// arithmetic and bitwise operators. 
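The OP_BIN_AP expansion above first converts both operands into the common result type that the RType machinery (defined earlier in this header) computes, and only then performs the native operation at full width, so the result cannot overflow before assignment. Per UG902's arbitrary-precision rules this means, e.g., max(W1,W2)+1 bits for addition of two signed operands and W1+W2 bits for multiplication. A hedged usage sketch, assuming Vivado's ap_int.h from this directory is on the include path:

#include "ap_int.h"
#include <cstdio>

int main() {
    ap_int<8>  a = 100;
    ap_int<12> b = 2000;
    ap_int<13> s = a + b;  // plus -> max(8,12)+1 = 13 bits, lossless
    ap_int<20> p = a * b;  // mult -> 8+12       = 20 bits, lossless
    std::printf("%d %d\n", s.to_int(), p.to_int()); // 2100 200000
}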
+#define OP_BIN_WITH_INT(BIN_OP, C_TYPE, _AP_W2, _AP_S2, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(C_TYPE i_op, const ap_int_base<_AP_W, _AP_S>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(i_op) BIN_OP(op); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W, _AP_S>& op, C_TYPE i_op) { \ + return op BIN_OP ap_int_base<_AP_W2, _AP_S2>(i_op); \ + } + +#define ALL_OP_BIN_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + OP_BIN_WITH_INT(*, C_TYPE, _AP_W2, _AP_S2, mult) \ + OP_BIN_WITH_INT(+, C_TYPE, _AP_W2, _AP_S2, plus) \ + OP_BIN_WITH_INT(-, C_TYPE, _AP_W2, _AP_S2, minus) \ + OP_BIN_WITH_INT(/, C_TYPE, _AP_W2, _AP_S2, div) \ + OP_BIN_WITH_INT(%, C_TYPE, _AP_W2, _AP_S2, mod) \ + OP_BIN_WITH_INT(&, C_TYPE, _AP_W2, _AP_S2, logic) \ + OP_BIN_WITH_INT(|, C_TYPE, _AP_W2, _AP_S2, logic) \ + OP_BIN_WITH_INT(^, C_TYPE, _AP_W2, _AP_S2, logic) + +ALL_OP_BIN_WITH_INT(bool, 1, false) +ALL_OP_BIN_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_OP_BIN_WITH_INT(signed char, 8, true) +ALL_OP_BIN_WITH_INT(unsigned char, 8, false) +ALL_OP_BIN_WITH_INT(short, _AP_SIZE_short, true) +ALL_OP_BIN_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_OP_BIN_WITH_INT(int, _AP_SIZE_int, true) +ALL_OP_BIN_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_OP_BIN_WITH_INT(long, _AP_SIZE_long, true) +ALL_OP_BIN_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_OP_BIN_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +ALL_OP_BIN_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef OP_BIN_WITH_INT +#undef ALL_OP_BIN_WITH_INT + +// shift operators. +#define ALL_OP_SHIFT_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( \ + const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + ap_int_base<_AP_W, _AP_S> r; \ + if (_AP_S2) \ + r.V = op2 >= 0 ? (op.V << op2) : (op.V >> (-op2)); \ + else \ + r.V = op.V << op2; \ + return r; \ + } \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( \ + const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + ap_int_base<_AP_W, _AP_S> r; \ + if (_AP_S2) \ + r.V = op2 >= 0 ? 
(op.V >> op2) : (op.V << (-op2)); \ + else \ + r.V = op.V >> op2; \ + return r; \ + } + +ALL_OP_SHIFT_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_OP_SHIFT_WITH_INT(signed char, 8, true) +ALL_OP_SHIFT_WITH_INT(short, _AP_SIZE_short, true) +ALL_OP_SHIFT_WITH_INT(int, _AP_SIZE_int, true) +ALL_OP_SHIFT_WITH_INT(long, _AP_SIZE_long, true) +ALL_OP_SHIFT_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) + +#undef ALL_OP_SHIFT_WITH_INT + +#define ALL_OP_SHIFT_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator<<( \ + const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + ap_int_base<_AP_W, _AP_S> r; \ + r.V = op.V << op2; \ + return r; \ + } \ + template \ + INLINE typename ap_int_base<_AP_W, _AP_S>::template RType<_AP_W,_AP_S>::arg1 operator>>( \ + const ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + ap_int_base<_AP_W, _AP_S> r; \ + r.V = op.V >> op2; \ + return r; \ + } +ALL_OP_SHIFT_WITH_INT(bool, 1, false) +ALL_OP_SHIFT_WITH_INT(unsigned char, 8, false) +ALL_OP_SHIFT_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_OP_SHIFT_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_OP_SHIFT_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_OP_SHIFT_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef ALL_OP_SHIFT_WITH_INT + +// compound assign operators. +#define OP_ASSIGN_WITH_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_int_base<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_int_base<_AP_W, _AP_S>& op, C_TYPE op2) { \ + return op ASSIGN_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } + +// TODO int a; ap_int<16> b; a += b; + +#define ALL_OP_ASSIGN_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(+=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(-=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(*=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(/=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(%=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(&=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(|=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(^=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(>>=, C_TYPE, _AP_W2, _AP_S2) \ + OP_ASSIGN_WITH_INT(<<=, C_TYPE, _AP_W2, _AP_S2) + +ALL_OP_ASSIGN_WITH_INT(bool, 1, false) +ALL_OP_ASSIGN_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_OP_ASSIGN_WITH_INT(signed char, 8, true) +ALL_OP_ASSIGN_WITH_INT(unsigned char, 8, false) +ALL_OP_ASSIGN_WITH_INT(short, _AP_SIZE_short, true) +ALL_OP_ASSIGN_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_OP_ASSIGN_WITH_INT(int, _AP_SIZE_int, true) +ALL_OP_ASSIGN_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_OP_ASSIGN_WITH_INT(long, _AP_SIZE_long, true) +ALL_OP_ASSIGN_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_OP_ASSIGN_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +ALL_OP_ASSIGN_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef OP_ASSIGN_WITH_INT +#undef ALL_OP_ASSIGN_WITH_INT + +// equality and relational operators. 
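The pattern above repeats throughout the rest of the file: an inner macro defines one operator, an outer ALL_* macro stamps it out for every native type, and both are #undef'd immediately afterward. The same nesting in a standalone toy (the type and all names are mine):

#include <cstdio>

struct Toy { int V; };

// Inner macro: one compound-assign operator, as a free function
// (compound assignments, unlike operator=, may be non-members).
#define OP_ASSIGN(Sym)                 \
    Toy& operator Sym(Toy& a, int b) { \
        a.V Sym b;                     \
        return a;                      \
    }
OP_ASSIGN(+=)
OP_ASSIGN(-=)
OP_ASSIGN(*=)
#undef OP_ASSIGN

int main() {
    Toy t{6};
    t += 4; t *= 3; t -= 5;
    std::printf("%d\n", t.V); // 25
}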
+#define OP_REL_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(C_TYPE i_op, \ + const ap_int_base<_AP_W, _AP_S>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(i_op) REL_OP op; \ + } \ + template \ + INLINE bool operator REL_OP(const ap_int_base<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return op REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } + +#define ALL_OP_REL_WITH_INT(C_TYPE, _AP_W2, _AP_S2) \ + OP_REL_WITH_INT(>, C_TYPE, _AP_W2, _AP_S2) \ + OP_REL_WITH_INT(<, C_TYPE, _AP_W2, _AP_S2) \ + OP_REL_WITH_INT(>=, C_TYPE, _AP_W2, _AP_S2) \ + OP_REL_WITH_INT(<=, C_TYPE, _AP_W2, _AP_S2) \ + OP_REL_WITH_INT(==, C_TYPE, _AP_W2, _AP_S2) \ + OP_REL_WITH_INT(!=, C_TYPE, _AP_W2, _AP_S2) + +ALL_OP_REL_WITH_INT(bool, 1, false) +ALL_OP_REL_WITH_INT(char, 8, CHAR_IS_SIGNED) +ALL_OP_REL_WITH_INT(signed char, 8, true) +ALL_OP_REL_WITH_INT(unsigned char, 8, false) +ALL_OP_REL_WITH_INT(short, _AP_SIZE_short, true) +ALL_OP_REL_WITH_INT(unsigned short, _AP_SIZE_short, false) +ALL_OP_REL_WITH_INT(int, _AP_SIZE_int, true) +ALL_OP_REL_WITH_INT(unsigned int, _AP_SIZE_int, false) +ALL_OP_REL_WITH_INT(long, _AP_SIZE_long, true) +ALL_OP_REL_WITH_INT(unsigned long, _AP_SIZE_long, false) +ALL_OP_REL_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +ALL_OP_REL_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef OP_REL_WITH_INT +#undef ALL_OP_BIN_WITH_INT + +#define OP_REL_WITH_DOUBLE_OR_FLOAT(Sym) \ + template \ + INLINE bool operator Sym(const ap_int_base<_AP_W, _AP_S>& op1, \ + double op2) { \ + return op1.to_double() Sym op2 ; \ + } \ + template \ + INLINE bool operator Sym(double op1, \ + const ap_int_base<_AP_W, _AP_S>& op2) { \ + return op1 Sym op2.to_double() ; \ + } \ + template \ + INLINE bool operator Sym(const ap_int_base<_AP_W, _AP_S>& op1, \ + float op2) { \ + return op1.to_double() Sym op2 ; \ + } \ + template \ + INLINE bool operator Sym(float op1, \ + const ap_int_base<_AP_W, _AP_S>& op2) { \ + return op1 Sym op2.to_double() ; \ + } + OP_REL_WITH_DOUBLE_OR_FLOAT(>) + OP_REL_WITH_DOUBLE_OR_FLOAT(<) + OP_REL_WITH_DOUBLE_OR_FLOAT(>=) + OP_REL_WITH_DOUBLE_OR_FLOAT(<=) + OP_REL_WITH_DOUBLE_OR_FLOAT(==) + OP_REL_WITH_DOUBLE_OR_FLOAT(!=) + +#undef OP_REL_WITH_DOUBLE_OR_FLOAT + + +/* Operators with ap_bit_ref. + * ------------------------------------------------------------ + */ +// arithmetic, bitwise and shift operators. +#define OP_BIN_WITH_RANGE(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<_AP_W1, false>(op1) BIN_OP op2; \ + } \ + template \ + INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 BIN_OP ap_int_base<_AP_W2, false>(op2); \ + } + +OP_BIN_WITH_RANGE(+, plus) +OP_BIN_WITH_RANGE(-, minus) +OP_BIN_WITH_RANGE(*, mult) +OP_BIN_WITH_RANGE(/, div) +OP_BIN_WITH_RANGE(%, mod) +OP_BIN_WITH_RANGE(&, logic) +OP_BIN_WITH_RANGE(|, logic) +OP_BIN_WITH_RANGE(^, logic) +OP_BIN_WITH_RANGE(>>, arg1) +OP_BIN_WITH_RANGE(<<, arg1) + +#undef OP_BIN_WITH_RANGE + +// compound assignment operators. 
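OP_REL_WITH_DOUBLE_OR_FLOAT above compares through to_double() on the ap_int side, so once the width exceeds the 53-bit double mantissa, distinct values can compare equal. The effect shown in plain 64-bit arithmetic (example values are my own):

#include <cstdint>
#include <cstdio>

int main() {
    // 2^53 and 2^53 + 1 round to the same double -- exactly what
    // happens to two wide ap_int values after to_double() in the
    // relational operators above.
    int64_t a = 1LL << 53, b = (1LL << 53) + 1;
    std::printf("%d\n", (double)a == (double)b); // 1
}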
+#define OP_ASSIGN_WITH_RANGE(ASSIGN_OP) \ + template \ + INLINE ap_int_base<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_int_base<_AP_W1, _AP_S1>& op1, ap_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 ASSIGN_OP ap_int_base<_AP_W2, false>(op2); \ + } \ + template \ + INLINE ap_range_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_range_ref<_AP_W1, _AP_S1>& op1, ap_int_base<_AP_W2, _AP_S2>& op2) { \ + ap_int_base<_AP_W1, false> tmp(op1); \ + tmp ASSIGN_OP op2; \ + op1 = tmp; \ + return op1; \ + } + +OP_ASSIGN_WITH_RANGE(+=) +OP_ASSIGN_WITH_RANGE(-=) +OP_ASSIGN_WITH_RANGE(*=) +OP_ASSIGN_WITH_RANGE(/=) +OP_ASSIGN_WITH_RANGE(%=) +OP_ASSIGN_WITH_RANGE(&=) +OP_ASSIGN_WITH_RANGE(|=) +OP_ASSIGN_WITH_RANGE(^=) +OP_ASSIGN_WITH_RANGE(>>=) +OP_ASSIGN_WITH_RANGE(<<=) + +#undef OP_ASSIGN_WITH_RANGE + +// equality and relational operators +#define OP_REL_WITH_RANGE(REL_OP) \ + template \ + INLINE bool operator REL_OP(const ap_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<_AP_W1, false>(op1).operator REL_OP(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator REL_OP(op2.operator ap_int_base<_AP_W2, false>()); \ + } + +OP_REL_WITH_RANGE(==) +OP_REL_WITH_RANGE(!=) +OP_REL_WITH_RANGE(>) +OP_REL_WITH_RANGE(>=) +OP_REL_WITH_RANGE(<) +OP_REL_WITH_RANGE(<=) + +#undef OP_REL_WITH_RANGE + +/* Operators with ap_bit_ref. + * ------------------------------------------------------------ + */ +// arithmetic, bitwise and shift operators. +#define OP_BIN_WITH_BIT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W1, _AP_S1>::template RType<1, false>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 BIN_OP ap_int_base<1, false>(op2); \ + } \ + template \ + INLINE typename ap_int_base<1, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP(const ap_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<1, false>(op1) BIN_OP op2; \ + } + +OP_BIN_WITH_BIT(+, plus) +OP_BIN_WITH_BIT(-, minus) +OP_BIN_WITH_BIT(*, mult) +OP_BIN_WITH_BIT(/, div) +OP_BIN_WITH_BIT(%, mod) +OP_BIN_WITH_BIT(&, logic) +OP_BIN_WITH_BIT(|, logic) +OP_BIN_WITH_BIT(^, logic) +OP_BIN_WITH_BIT(>>, arg1) +OP_BIN_WITH_BIT(<<, arg1) + +#undef OP_BIN_WITH_BIT + +// compound assignment operators. +#define OP_ASSIGN_WITH_BIT(ASSIGN_OP) \ + template \ + INLINE ap_int_base<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_int_base<_AP_W1, _AP_S1>& op1, ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 ASSIGN_OP ap_int_base<1, false>(op2); \ + } \ + template \ + INLINE ap_bit_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_bit_ref<_AP_W1, _AP_S1>& op1, ap_int_base<_AP_W2, _AP_S2>& op2) { \ + ap_int_base<1, false> tmp(op1); \ + tmp ASSIGN_OP op2; \ + op1 = tmp; \ + return op1; \ + } + +OP_ASSIGN_WITH_BIT(+=) +OP_ASSIGN_WITH_BIT(-=) +OP_ASSIGN_WITH_BIT(*=) +OP_ASSIGN_WITH_BIT(/=) +OP_ASSIGN_WITH_BIT(%=) +OP_ASSIGN_WITH_BIT(&=) +OP_ASSIGN_WITH_BIT(|=) +OP_ASSIGN_WITH_BIT(^=) +OP_ASSIGN_WITH_BIT(>>=) +OP_ASSIGN_WITH_BIT(<<=) + +#undef OP_ASSIGN_WITH_BIT + +// equality and relational operators. 
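When the left-hand side is a range proxy, OP_ASSIGN_WITH_RANGE above cannot operate in place: it copies the selected bits into a temporary ap_int_base, applies the operator, and assigns the temporary back through the proxy. The same read-modify-write on a plain 32-bit word, with my own helper name:

#include <cstdint>
#include <cstdio>

// Add delta to bit-field [hi:lo] of *word: read into a temporary,
// operate, write back -- the tmp / op / op1 = tmp sequence above.
void range_add(uint32_t* word, int hi, int lo, uint32_t delta) {
    uint32_t mask = (hi - lo + 1 < 32) ? (1u << (hi - lo + 1)) - 1 : ~0u;
    uint32_t tmp = (*word >> lo) & mask;            // read field
    tmp = (tmp + delta) & mask;                     // wraps in-field
    *word = (*word & ~(mask << lo)) | (tmp << lo);  // write back
}

int main() {
    uint32_t w = 0x00F0;      // field [7:4] holds 0xF
    range_add(&w, 7, 4, 1);   // 0xF + 1 wraps to 0x0
    std::printf("%04x\n", w); // 0000
}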
+#define OP_REL_WITH_BIT(REL_OP) \ + template \ + INLINE bool operator REL_OP(const ap_int_base<_AP_W1, _AP_S1>& op1, \ + const ap_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1 REL_OP ap_int_base<1, false>(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_int_base<_AP_W2, _AP_S2>& op2) { \ + return ap_int_base<1, false>(op1) REL_OP op2; \ + } + +OP_REL_WITH_BIT(==) +OP_REL_WITH_BIT(!=) +OP_REL_WITH_BIT(>) +OP_REL_WITH_BIT(>=) +OP_REL_WITH_BIT(<) +OP_REL_WITH_BIT(<=) + +#undef OP_REL_WITH_BIT + + +/* Operators with ap_concat_ref. + * ------------------------------------------------------------ + */ +// arithmetic, bitwise and shift operators. +// bitwise operators are defined in struct. +// TODO specify whether to define arithmetic and bitwise operators. +#if 0 +#define OP_BIN_WITH_CONCAT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_W3, _AP_S3>::template RType<_AP_W1 + _AP_W2, \ + false>::RTYPE \ + operator BIN_OP(const ap_int_base<_AP_W3, _AP_S3>& op1, \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1 BIN_OP op2.get(); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W1 + _AP_W2, \ + false>::template RType<_AP_W3, _AP_S3>::RTYPE \ + operator BIN_OP(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ + const ap_int_base<_AP_W3, _AP_S3>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1.get() BIN_OP op2; \ + } + +OP_BIN_WITH_CONCAT(+, plus) +OP_BIN_WITH_CONCAT(-, minus) +OP_BIN_WITH_CONCAT(*, mult) +OP_BIN_WITH_CONCAT(/, div) +OP_BIN_WITH_CONCAT(%, mod) +OP_BIN_WITH_CONCAT(&, logic) +OP_BIN_WITH_CONCAT(|, logic) +OP_BIN_WITH_CONCAT(^, logic) +OP_BIN_WITH_CONCAT(>>, arg1) +OP_BIN_WITH_CONCAT(<<, arg1) + +#undef OP_BIN_WITH_CONCAT + +// compound assignment operators. +#define OP_ASSIGN_WITH_CONCAT(ASSIGN_OP) \ + template \ + INLINE typename ap_int_base<_AP_W3, _AP_S3>::template RType<_AP_W1 + _AP_W2, \ + false>::RTYPE \ + operator ASSIGN_OP( \ + const ap_int_base<_AP_W3, _AP_S3>& op1, \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1 ASSIGN_OP op2.get(); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W1 + _AP_W2, \ + false>::template RType<_AP_W3, _AP_S3>::RTYPE \ + operator ASSIGN_OP(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ + const ap_int_base<_AP_W3, _AP_S3>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + ap_int_base<_AP_W1 + _AP_W2, false> tmp = op1.get(); \ + tmp ASSIGN_OP op2; \ + op1 = tmp; \ + return op1; \ + } + +OP_ASSIGN_WITH_CONCAT(+=) +OP_ASSIGN_WITH_CONCAT(-=) +OP_ASSIGN_WITH_CONCAT(*=) +OP_ASSIGN_WITH_CONCAT(/=) +OP_ASSIGN_WITH_CONCAT(%=) +OP_ASSIGN_WITH_CONCAT(&=) +OP_ASSIGN_WITH_CONCAT(|=) +OP_ASSIGN_WITH_CONCAT(^=) +OP_ASSIGN_WITH_CONCAT(>>=) +OP_ASSIGN_WITH_CONCAT(<<=) + +#undef OP_ASSIGN_WITH_CONCAT +#endif + +// equality and relational operators. 
+#define OP_REL_WITH_CONCAT(REL_OP) \ + template \ + INLINE bool operator REL_OP( \ + const ap_int_base<_AP_W3, _AP_S3>& op1, \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1 REL_OP op2.get(); \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& op1, \ + const ap_int_base<_AP_W3, _AP_S3>& op2) { \ + /* convert ap_concat_ref to ap_int_base */ \ + return op1.get() REL_OP op2; \ + } + +OP_REL_WITH_CONCAT(==) +OP_REL_WITH_CONCAT(!=) +OP_REL_WITH_CONCAT(>) +OP_REL_WITH_CONCAT(>=) +OP_REL_WITH_CONCAT(<) +OP_REL_WITH_CONCAT(<=) + +#undef OP_REL_WITH_CONCAT + +#endif // ifndef __cplusplus +#endif // ifndef __AP_INT_BASE_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_int_ref.h b/hls4ml/templates/vivado/ap_types/ap_int_ref.h index c675ddd4b6..421f09fda6 100644 --- a/hls4ml/templates/vivado/ap_types/ap_int_ref.h +++ b/hls4ml/templates/vivado/ap_types/ap_int_ref.h @@ -1,1346 +1,1346 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_INT_REF_H__ -#define __AP_INT_REF_H__ - -#ifndef __AP_INT_H__ -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." -#endif - -#ifndef __cplusplus -#error "C++ is required to include this header file" - -#else - -#ifndef __SYNTHESIS__ -#include -#endif - -/* Concatination reference. - ---------------------------------------------------------------- -*/ -template -struct ap_concat_ref { - enum { - _AP_WR = _AP_W1 + _AP_W2, - }; - - _AP_T1& mbv1; - _AP_T2& mbv2; - - INLINE ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& ref) - : mbv1(ref.mbv1), mbv2(ref.mbv2) {} - - INLINE ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {} - - template - INLINE ap_concat_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { - ap_int_base<_AP_W1 + _AP_W2, false> vval(val); - int W_ref1 = mbv1.length(); - int W_ref2 = mbv2.length(); - ap_int_base<_AP_W1, false> Part1; - Part1.V = _AP_ROOT_op_get_range(vval.V, W_ref2, W_ref1 + W_ref2 - 1); - mbv1.set(Part1); - ap_int_base<_AP_W2, false> Part2; - Part2.V = _AP_ROOT_op_get_range(vval.V, 0, W_ref2 - 1); - mbv2.set(Part2); - return *this; - } - - // assign op from hls supported C integral types. 
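The ap_concat_ref being rewritten in this hunk is the proxy behind (a, b) concatenation: its operator= splits the incoming value at the boundary between the two referenced widths and writes each half back through set(). The split step on plain integers, with my own names:

#include <cstdint>
#include <cstdio>

// Split a (w1+w2)-bit value into its hi (w1-bit) and lo (w2-bit)
// parts, as ap_concat_ref::operator= does before mbv1/mbv2.set().
void split(uint32_t v, int w1, int w2, uint32_t* hi, uint32_t* lo) {
    *lo = v & ((1u << w2) - 1);
    *hi = (v >> w2) & ((1u << w1) - 1);
}

int main() {
    uint32_t hi, lo;
    split(0x3A5, 4, 6, &hi, &lo);   // 10 bits -> 4-bit hi, 6-bit lo
    std::printf("%x %x\n", hi, lo); // e 25
}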
- // FIXME disabled to support legacy code directly assign from sc_signal - //template - //INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, - // ap_concat_ref&>::type - //operator=(T val) { - // ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); - // return operator=(tmpVal); - //} -#define ASSIGN_WITH_CTYPE(_Tp) \ - INLINE ap_concat_ref& operator=(_Tp val) { \ - ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); \ - return operator=(tmpVal); \ - } - - ASSIGN_WITH_CTYPE(bool) - ASSIGN_WITH_CTYPE(char) - ASSIGN_WITH_CTYPE(signed char) - ASSIGN_WITH_CTYPE(unsigned char) - ASSIGN_WITH_CTYPE(short) - ASSIGN_WITH_CTYPE(unsigned short) - ASSIGN_WITH_CTYPE(int) - ASSIGN_WITH_CTYPE(unsigned int) - ASSIGN_WITH_CTYPE(long) - ASSIGN_WITH_CTYPE(unsigned long) - ASSIGN_WITH_CTYPE(ap_slong) - ASSIGN_WITH_CTYPE(ap_ulong) -#if _AP_ENABLE_HALF_ == 1 - ASSIGN_WITH_CTYPE(half) -#endif - ASSIGN_WITH_CTYPE(float) - ASSIGN_WITH_CTYPE(double) - -#undef ASSIGN_WITH_CTYPE - - // Be explicit to prevent it from being deleted, as field d_bv - // is of reference type. - INLINE ap_concat_ref& operator=( - const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) { - ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); - return operator=(tmpVal); - } - - template - INLINE ap_concat_ref& operator=( - const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { - ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); - return operator=(tmpVal); - } - - template - INLINE ap_concat_ref& operator=(const ap_bit_ref<_AP_W3, _AP_S3>& val) { - ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); - return operator=(tmpVal); - } - template - INLINE ap_concat_ref& operator=(const ap_range_ref<_AP_W3, _AP_S3>& val) { - ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); - return operator=(tmpVal); - } - - template - INLINE ap_concat_ref& operator=( - const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { - return operator=((const ap_int_base<_AP_W3, false>)(val)); - } - - template - INLINE ap_concat_ref& operator=( - const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& - val) { - return operator=(val.to_ap_int_base()); - } - - template - INLINE ap_concat_ref& operator=( - const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { - return operator=((ap_ulong)(bool)(val)); - } - - INLINE operator ap_int_base<_AP_WR, false>() const { return get(); } - - INLINE operator ap_ulong() const { return get().to_uint64(); } - - template - INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, - ap_range_ref<_AP_W3, _AP_S3> > - operator,(const ap_range_ref<_AP_W3, _AP_S3> &a2) { - return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, - ap_range_ref<_AP_W3, _AP_S3> >( - *this, const_cast&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > - operator,(ap_int_base<_AP_W3, _AP_S3> &a2) { - return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, - ap_int_base<_AP_W3, _AP_S3> >(*this, a2); - } - - template - INLINE - ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > - operator,(volatile ap_int_base<_AP_W3, _AP_S3> &a2) { - return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, - ap_int_base<_AP_W3, _AP_S3> >( - *this, const_cast&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > - operator,(const ap_int_base<_AP_W3, _AP_S3> &a2) { - return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, - ap_int_base<_AP_W3, _AP_S3> >( - *this, const_cast&>(a2)); - } - - template - INLINE - 
ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > - operator,(const volatile ap_int_base<_AP_W3, _AP_S3> &a2) { - // FIXME op's life does not seem long enough - ap_int_base<_AP_W3, _AP_S3> op(a2); - return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, - ap_int_base<_AP_W3, _AP_S3> >( - *this, const_cast&>(op)); - } - - template - INLINE ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> > - operator,(const ap_bit_ref<_AP_W3, _AP_S3> &a2) { - return ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, - ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > - operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { - return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, - ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref< - _AP_WR, ap_concat_ref, _AP_W3, - af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > - operator,( - const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) { - return ap_concat_ref< - _AP_WR, ap_concat_ref, _AP_W3, - af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( - *this, - const_cast< - af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_WR, ap_concat_ref, 1, - af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > - operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> - &a2) { - return ap_concat_ref< - _AP_WR, ap_concat_ref, 1, - af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( - *this, - const_cast&>( - a2)); - } - - template - INLINE ap_int_base operator&( - const ap_int_base<_AP_W3, _AP_S3>& a2) { - return get() & a2; - } - - template - INLINE ap_int_base operator|( - const ap_int_base<_AP_W3, _AP_S3>& a2) { - return get() | a2; - } - - template - INLINE ap_int_base operator^( - const ap_int_base<_AP_W3, _AP_S3>& a2) { - return get() ^ a2; - } - -#if 0 - template - INLINE ap_int_base slice() { - ap_int_base<_AP_WR, false> bv = get(); - return bv.slice(); - } -#endif - - INLINE ap_int_base<_AP_WR, false> get() const { - ap_int_base<_AP_WR, false> tmpVal(0); - int W_ref1 = mbv1.length(); - int W_ref2 = mbv2.length(); - ap_int_base<_AP_W2, false> v2(mbv2); - ap_int_base<_AP_W1, false> v1(mbv1); - tmpVal.V = _AP_ROOT_op_set_range(tmpVal.V, 0, W_ref2 - 1, v2.V); - tmpVal.V = - _AP_ROOT_op_set_range(tmpVal.V, W_ref2, W_ref1 + W_ref2 - 1, v1.V); - return tmpVal; - } - - template - INLINE void set(const ap_int_base<_AP_W3, false>& val) { - ap_int_base<_AP_W1 + _AP_W2, false> vval(val); - int W_ref1 = mbv1.length(); - int W_ref2 = mbv2.length(); - ap_int_base<_AP_W1, false> tmpVal1; - tmpVal1.V = _AP_ROOT_op_get_range(vval.V, W_ref2, W_ref1 + W_ref2 - 1); - mbv1.set(tmpVal1); - ap_int_base<_AP_W2, false> tmpVal2; - tmpVal2.V = _AP_ROOT_op_get_range(vval.V, 0, W_ref2 - 1); - mbv2.set(tmpVal2); - } - - INLINE int length() const { return mbv1.length() + mbv2.length(); } -}; // struct ap_concat_ref - -/* Range (slice) reference. - ---------------------------------------------------------------- -*/ -template -struct ap_range_ref { - // struct ssdm_int or its sim model. - // TODO make it possible to reference to ap_fixed_base/ap_fixed/ap_ufixed - // and then we can retire af_range_ref. 
- typedef ap_int_base<_AP_W, _AP_S> ref_type; - ref_type& d_bv; - int l_index; - int h_index; - - public: - INLINE ap_range_ref(const ap_range_ref<_AP_W, _AP_S>& ref) - : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} - - INLINE ap_range_ref(ref_type* bv, int h, int l) - : d_bv(*bv), l_index(l), h_index(h) {} - - INLINE ap_range_ref(const ref_type* bv, int h, int l) - : d_bv(*const_cast(bv)), l_index(l), h_index(h) {} - - INLINE operator ap_int_base<_AP_W, false>() const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret; - } - - INLINE operator ap_ulong() const { return to_uint64(); } - - /// @name assign operators - // @{ - - // FIXME disabled to work-around lagacy code assigning from sc_signal, - // which dependes on implicit type conversion. - // - // /// assign from hls supported C integral types. - // template - // INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, - // ap_range_ref&>::type - // operator=(T val) { - // ap_int_base<_AP_W, false> tmp(val); - // d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); - // return *this; - // } -#define ASSIGN_WITH_CTYPE(_Tp) \ - INLINE ap_range_ref& operator=(_Tp val) { \ - ap_int_base<_AP_W, false> tmp(val); \ - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); \ - return *this; \ - } - - ASSIGN_WITH_CTYPE(bool) - ASSIGN_WITH_CTYPE(char) - ASSIGN_WITH_CTYPE(signed char) - ASSIGN_WITH_CTYPE(unsigned char) - ASSIGN_WITH_CTYPE(short) - ASSIGN_WITH_CTYPE(unsigned short) - ASSIGN_WITH_CTYPE(int) - ASSIGN_WITH_CTYPE(unsigned int) - ASSIGN_WITH_CTYPE(long) - ASSIGN_WITH_CTYPE(unsigned long) - ASSIGN_WITH_CTYPE(ap_slong) - ASSIGN_WITH_CTYPE(ap_ulong) -#if _AP_ENABLE_HALF_ == 1 - ASSIGN_WITH_CTYPE(half) -#endif - ASSIGN_WITH_CTYPE(float) - ASSIGN_WITH_CTYPE(double) - -#undef ASSIGN_WITH_CTYPE - - /// assign using string. XXX crucial for cosim. - INLINE ap_range_ref& operator=(const char* val) { - const ap_int_base<_AP_W, false> tmp(val); // XXX figure out radix - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); - return *this; - } - - /// assign from ap_int_base. - template - INLINE ap_range_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { - ap_int_base<_AP_W, false> tmp(val); - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); - return *this; - } - - /// copy assign operator - // XXX Be explicit to prevent it from being deleted, as field d_bv - // is of reference type. - INLINE ap_range_ref& operator=(const ap_range_ref& val) { - return operator=((const ap_int_base<_AP_W, false>)val); - } - - /// assign from range reference to ap_int_base. - template - INLINE ap_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { - return operator=((const ap_int_base<_AP_W2, false>)val); - } - - /// assign from bit reference to ap_int_base. - template - INLINE ap_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { - return operator=((ap_ulong)(bool)(val)); - } - - /// assign from ap_fixed_base. - template - INLINE ap_range_ref& operator=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& - val) { - return operator=(val.to_ap_int_base()); - } - - /// assign from range reference to ap_fixed_base. - template - INLINE ap_range_ref& operator=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=((const ap_int_base<_AP_W2, false>)val); - } - - /// assign from bit reference to ap_fixed_base. 
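ap_range_ref is a textbook proxy reference: it stores a reference to the host word plus the two indices, reads materialize the bits on demand, and writes go through the stored reference; hence also the comment above about spelling out copy-assignment explicitly, since a reference member would otherwise leave it implicitly deleted. A miniature of the idiom in standalone C++ (all names are mine):

#include <cstdint>
#include <cstdio>

// Tiny slice proxy over a uint32_t, after ap_range_ref: assignment
// writes through d_bv, conversion reads the field back out.
struct Slice {
    uint32_t& d_bv;
    int h, l;
    Slice& operator=(uint32_t v) {
        uint32_t mask = ((1u << (h - l + 1)) - 1) << l;
        d_bv = (d_bv & ~mask) | ((v << l) & mask);
        return *this;
    }
    operator uint32_t() const {
        return (d_bv >> l) & ((1u << (h - l + 1)) - 1);
    }
};

int main() {
    uint32_t w = 0;
    Slice s{w, 7, 4};
    s = 0xA;                                  // writes through to w
    std::printf("%02x %x\n", w, (uint32_t)s); // a0 a
}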
- template - INLINE ap_range_ref& operator=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=((ap_ulong)(bool)(val)); - } - - /// assign from compound reference. - template - INLINE ap_range_ref& operator=( - const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { - return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)(val)); - } - // @} - - template - INLINE - ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > - operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, - ap_range_ref<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >(*this, a2); - } - - INLINE - ap_concat_ref<_AP_W, ap_range_ref, _AP_W, ap_int_base<_AP_W, _AP_S> > - operator,(ap_int_base<_AP_W, _AP_S>& a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W, - ap_int_base<_AP_W, _AP_S> >(*this, a2); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, - ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > - operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > - operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { - return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref< - _AP_W, ap_range_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> a2) { - return ap_concat_ref< - _AP_W, ap_range_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast< - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); - } - - template - INLINE - ap_concat_ref<_AP_W, ap_range_ref, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> - &a2) { - return ap_concat_ref< - _AP_W, ap_range_ref, 1, - af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast&>( - a2)); - } - - template - INLINE bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> hop(op2); - return lop == hop; - } - - template - INLINE bool 
operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return !(operator==(op2)); - } - - template - INLINE bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> hop(op2); - return lop < hop; - } - - template - INLINE bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - ap_int_base<_AP_W, false> lop(*this); - ap_int_base<_AP_W2, false> hop(op2); - return lop <= hop; - } - - template - INLINE bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return !(operator<=(op2)); - } - - template - INLINE bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { - return !(operator<(op2)); - } - - template - INLINE ap_range_ref<_AP_W, _AP_S>& operator|=( - const ap_range_ref<_AP_W2, _AP_S2>& op2) { - (this->d_bv).V |= (op2.d_bv).V; - return *this; - }; - - template - INLINE ap_range_ref<_AP_W, _AP_S>& operator|=( - const ap_int_base<_AP_W2, _AP_S2>& op2) { - (this->d_bv).V |= op2.V; - return *this; - }; - - template - INLINE ap_range_ref<_AP_W, _AP_S>& operator&=( - const ap_range_ref<_AP_W2, _AP_S2>& op2) { - (this->d_bv).V &= (op2.d_bv).V; - return *this; - }; - - template - INLINE ap_range_ref<_AP_W, _AP_S>& operator&=( - const ap_int_base<_AP_W2, _AP_S2>& op2) { - (this->d_bv).V &= op2.V; - return *this; - }; - - template - INLINE ap_range_ref<_AP_W, _AP_S>& operator^=( - const ap_range_ref<_AP_W2, _AP_S2>& op2) { - (this->d_bv).V ^= (op2.d_bv).V; - return *this; - }; - - template - INLINE ap_range_ref<_AP_W, _AP_S>& operator^=( - const ap_int_base<_AP_W2, _AP_S2>& op2) { - (this->d_bv).V ^= op2.V; - return *this; - }; - - INLINE ap_int_base<_AP_W, false> get() const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret; - } - - template - INLINE void set(const ap_int_base<_AP_W2, false>& val) { - d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); - } - - INLINE int length() const { - return h_index >= l_index ? h_index - l_index + 1 : l_index - h_index + 1; - } - - INLINE int to_int() const { - return (int)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE unsigned to_uint() const { - return (unsigned)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE long to_long() const { - return (long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE unsigned long to_ulong() const { - return (unsigned long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE ap_slong to_int64() const { - return (ap_slong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE ap_ulong to_uint64() const { - return (ap_ulong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); - } - - INLINE bool and_reduce() const { - bool ret = true; - bool reverse = l_index > h_index; - unsigned low = reverse ? h_index : l_index; - unsigned high = reverse ? l_index : h_index; - for (unsigned i = low; i != high; ++i) { -#ifdef __SYNTHESIS__ -#pragma HLS unroll -#endif - ret &= _AP_ROOT_op_get_bit(d_bv.V, i); - } - return ret; - } - - INLINE bool or_reduce() const { - bool ret = false; - bool reverse = l_index > h_index; - unsigned low = reverse ? h_index : l_index; - unsigned high = reverse ? l_index : h_index; - for (unsigned i = low; i != high; ++i) { -#ifdef __SYNTHESIS__ -#pragma HLS unroll -#endif - ret |= _AP_ROOT_op_get_bit(d_bv.V, i); - } - return ret; - } - - INLINE bool xor_reduce() const { - bool ret = false; - bool reverse = l_index > h_index; - unsigned low = reverse ? 
h_index : l_index; - unsigned high = reverse ? l_index : h_index; - for (unsigned i = low; i != high; ++i) { -#ifdef __SYNTHESIS__ -#pragma HLS unroll -#endif - ret ^= _AP_ROOT_op_get_bit(d_bv.V, i); - } - return ret; - } -#ifndef __SYNTHESIS__ - std::string to_string(signed char radix = 2) const { - ap_int_base<_AP_W, false> ret; - ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); - return ret.to_string(radix); - } -#else - // XXX HLS will delete this in synthesis - INLINE char* to_string(signed char radix = 2) const { - return 0; - } -#endif -}; // struct ap_range_ref - -// XXX apcc cannot handle global std::ios_base::Init() brought in by -#ifndef AP_AUTOCC -#ifndef __SYNTHESIS__ -template -INLINE std::ostream& operator<<(std::ostream& os, - const ap_range_ref<_AP_W, _AP_S>& x) { - std::ios_base::fmtflags ff = std::cout.flags(); - if (ff & std::cout.hex) { - os << x.to_string(16); // don't print sign - } else if (ff & std::cout.oct) { - os << x.to_string(8); // don't print sign - } else { - os << x.to_string(10); - } - return os; -} -#endif // ifndef __SYNTHESIS__ - -#ifndef __SYNTHESIS__ -template -INLINE std::istream& operator>>(std::istream& in, - ap_range_ref<_AP_W, _AP_S>& op) { - std::string str; - in >> str; - op = ap_int_base<_AP_W, _AP_S>(str.c_str()); - return in; -} -#endif // ifndef __SYNTHESIS__ -#endif // ifndef AP_AUTOCC - -/* Bit reference. - ---------------------------------------------------------------- -*/ -template -struct ap_bit_ref { - // struct ssdm_int or its sim model. - // TODO make it possible to reference to ap_fixed_base/ap_fixed/ap_ufixed - // and then we can retire af_bit_ref. - typedef ap_int_base<_AP_W, _AP_S> ref_type; - ref_type& d_bv; - int d_index; - - public: - // copy ctor - INLINE ap_bit_ref(const ap_bit_ref<_AP_W, _AP_S>& ref) - : d_bv(ref.d_bv), d_index(ref.d_index) {} - - INLINE ap_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} - - INLINE ap_bit_ref(const ref_type* bv, int index = 0) - : d_bv(*const_cast(bv)), d_index(index) {} - - INLINE operator bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } - INLINE bool to_bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } - - // assign op from hls supported C integral types. - // FIXME disabled to support sc_signal. - // NOTE this used to be unsigned long long. 
- //template - //INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, - // ap_bit_ref&>::type - //operator=(T val) { - // d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); - // return *this; - //} -#define ASSIGN_WITH_CTYPE(_Tp) \ - INLINE ap_bit_ref& operator=(_Tp val) { \ - d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); \ - return *this; \ - } - - ASSIGN_WITH_CTYPE(bool) - ASSIGN_WITH_CTYPE(char) - ASSIGN_WITH_CTYPE(signed char) - ASSIGN_WITH_CTYPE(unsigned char) - ASSIGN_WITH_CTYPE(short) - ASSIGN_WITH_CTYPE(unsigned short) - ASSIGN_WITH_CTYPE(int) - ASSIGN_WITH_CTYPE(unsigned int) - ASSIGN_WITH_CTYPE(long) - ASSIGN_WITH_CTYPE(unsigned long) - ASSIGN_WITH_CTYPE(ap_slong) - ASSIGN_WITH_CTYPE(ap_ulong) - -#undef ASSIGN_WITH_CTYPE - -#define ASSIGN_WITH_CTYPE_FP(_Tp) \ - INLINE ap_bit_ref& operator=(_Tp val) { \ - bool tmp_val = val; \ - d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index,tmp_val); \ - return *this; \ - } - -#if _AP_ENABLE_HALF_ == 1 - ASSIGN_WITH_CTYPE_FP(half) -#endif - ASSIGN_WITH_CTYPE_FP(float) - ASSIGN_WITH_CTYPE_FP(double) - -#undef ASSIGN_WITH_CTYPE_FP - - - template - INLINE ap_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { - return operator=((ap_ulong)(val.V != 0)); - } - - template - INLINE ap_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { - return operator=((ap_int_base<_AP_W2, false>)val); - } - - // Be explicit to prevent it from being deleted, as field d_bv - // is of reference type. - INLINE ap_bit_ref& operator=(const ap_bit_ref& val) { - return operator=((ap_ulong)(bool)val); - } - - template - INLINE ap_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { - return operator=((ap_ulong)(bool)val); - } - - template - INLINE ap_bit_ref& operator=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=((const ap_int_base<_AP_W2, false>)val); - } - - template - INLINE ap_bit_ref& operator=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=((ap_ulong)(bool)val); - } - - template - INLINE ap_bit_ref& operator=( - const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { - return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)val); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( - *this, a2); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { - ap_int_base<_AP_W2, _AP_S2> op(a2); - return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(op)); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > - operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { - ap_int_base<_AP_W2, _AP_S2> op(a2); - return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( - *this, const_cast&>(op)); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > - operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( - *this, 
const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( - const ap_bit_ref<_AP_W2, _AP_S2> &a2) { - return ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > - operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { - return ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, - ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( - *this, const_cast&>(a2)); - } - - template - INLINE ap_concat_ref< - 1, ap_bit_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > - operator,( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { - return ap_concat_ref< - 1, ap_bit_ref, _AP_W2, - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast< - af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); - } - - template - INLINE ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, - _AP_Q2, _AP_O2, _AP_N2> > - operator,( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { - return ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, - _AP_Q2, _AP_O2, _AP_N2> >( - *this, - const_cast&>( - a2)); - } - - template - INLINE bool operator==(const ap_bit_ref<_AP_W2, _AP_S2>& op) { - return get() == op.get(); - } - - template - INLINE bool operator!=(const ap_bit_ref<_AP_W2, _AP_S2>& op) { - return get() != op.get(); - } - - INLINE bool get() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } - - INLINE bool get() { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } - - template - INLINE void set(const ap_int_base<_AP_W3, false>& val) { - operator=(val); - } - - INLINE bool operator~() const { - bool bit = _AP_ROOT_op_get_bit(d_bv.V, d_index); - return bit ? false : true; - } - - INLINE int length() const { return 1; } - -#ifndef __SYNTHESIS__ - std::string to_string() const { return get() ? "1" : "0"; } -#else - // XXX HLS will delete this in synthesis - INLINE char* to_string() const { return 0; } -#endif -}; // struct ap_bit_ref - -/* ap_range_ref with int. - * ------------------------------------------------------------ - */ -// equality and relational operators. 
-#define REF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE bool operator REL_OP(const ap_range_ref<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return ap_int_base<_AP_W, false>(op) \ - REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ - } \ - template \ - INLINE bool operator REL_OP(const ap_bit_ref<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return bool(op) REL_OP op2; \ - } \ - template \ - INLINE bool operator REL_OP(C_TYPE op2, \ - const ap_bit_ref<_AP_W, _AP_S>& op) { \ - return op2 REL_OP bool(op); \ - } \ - template \ - INLINE bool operator REL_OP( \ - const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, C_TYPE op2) { \ - return ap_int_base<_AP_W + _AP_W1, false>(op) \ - REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ - } - -// Make the line shorter than 5000 chars -#define REF_REL_WITH_INT_1(C_TYPE, _AP_WI, _AP_SI) \ - REF_REL_OP_WITH_INT(>, C_TYPE, _AP_WI, _AP_SI) \ - REF_REL_OP_WITH_INT(<, C_TYPE, _AP_WI, _AP_SI) \ - REF_REL_OP_WITH_INT(>=, C_TYPE, _AP_WI, _AP_SI) \ - REF_REL_OP_WITH_INT(<=, C_TYPE, _AP_WI, _AP_SI) - -REF_REL_WITH_INT_1(bool, 1, false) -REF_REL_WITH_INT_1(char, 8, CHAR_IS_SIGNED) -REF_REL_WITH_INT_1(signed char, 8, true) -REF_REL_WITH_INT_1(unsigned char, 8, false) -REF_REL_WITH_INT_1(short, _AP_SIZE_short, true) -REF_REL_WITH_INT_1(unsigned short, _AP_SIZE_short, false) -REF_REL_WITH_INT_1(int, _AP_SIZE_int, true) -REF_REL_WITH_INT_1(unsigned int, _AP_SIZE_int, false) -REF_REL_WITH_INT_1(long, _AP_SIZE_long, true) -REF_REL_WITH_INT_1(unsigned long, _AP_SIZE_long, false) -REF_REL_WITH_INT_1(ap_slong, _AP_SIZE_ap_slong, true) -REF_REL_WITH_INT_1(ap_ulong, _AP_SIZE_ap_slong, false) - -// Make the line shorter than 5000 chars -#define REF_REL_WITH_INT_2(C_TYPE, _AP_WI, _AP_SI) \ - REF_REL_OP_WITH_INT(==, C_TYPE, _AP_WI, _AP_SI) \ - REF_REL_OP_WITH_INT(!=, C_TYPE, _AP_WI, _AP_SI) - -REF_REL_WITH_INT_2(bool, 1, false) -REF_REL_WITH_INT_2(char, 8, CHAR_IS_SIGNED) -REF_REL_WITH_INT_2(signed char, 8, true) -REF_REL_WITH_INT_2(unsigned char, 8, false) -REF_REL_WITH_INT_2(short, _AP_SIZE_short, true) -REF_REL_WITH_INT_2(unsigned short, _AP_SIZE_short, false) -REF_REL_WITH_INT_2(int, _AP_SIZE_int, true) -REF_REL_WITH_INT_2(unsigned int, _AP_SIZE_int, false) -REF_REL_WITH_INT_2(long, _AP_SIZE_long, true) -REF_REL_WITH_INT_2(unsigned long, _AP_SIZE_long, false) -REF_REL_WITH_INT_2(ap_slong, _AP_SIZE_ap_slong, true) -REF_REL_WITH_INT_2(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef REF_REL_OP_WITH_INT -#undef REF_REL_WITH_INT_1 -#undef REF_REL_WITH_INT_2 - -#define REF_BIN_OP_WITH_INT(BIN_OP, RTYPE, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE typename ap_int_base<_AP_W, false>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(const ap_range_ref<_AP_W, _AP_S>& op, C_TYPE op2) { \ - return ap_int_base<_AP_W, false>(op) \ - BIN_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ - } \ - template \ - INLINE typename ap_int_base<_AP_W2, _AP_S2>::template RType<_AP_W, \ - false>::RTYPE \ - operator BIN_OP(C_TYPE op2, const ap_range_ref<_AP_W, _AP_S>& op) { \ - return ap_int_base<_AP_W2, _AP_S2>(op2) \ - BIN_OP ap_int_base<_AP_W, false>(op); \ - } - -// arithmetic operators. 
-#define REF_BIN_OP_WITH_INT_ARITH(C_TYPE, _AP_W2, _AP_S2) \ - REF_BIN_OP_WITH_INT(+, plus, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(-, minus, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(*, mult, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(/, div, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(%, mod, C_TYPE, (_AP_W2), (_AP_S2)) - -REF_BIN_OP_WITH_INT_ARITH(bool, 1, false) -REF_BIN_OP_WITH_INT_ARITH(char, 8, CHAR_IS_SIGNED) -REF_BIN_OP_WITH_INT_ARITH(signed char, 8, true) -REF_BIN_OP_WITH_INT_ARITH(unsigned char, 8, false) -REF_BIN_OP_WITH_INT_ARITH(short, _AP_SIZE_short, true) -REF_BIN_OP_WITH_INT_ARITH(unsigned short, _AP_SIZE_short, false) -REF_BIN_OP_WITH_INT_ARITH(int, _AP_SIZE_int, true) -REF_BIN_OP_WITH_INT_ARITH(unsigned int, _AP_SIZE_int, false) -REF_BIN_OP_WITH_INT_ARITH(long, _AP_SIZE_long, true) -REF_BIN_OP_WITH_INT_ARITH(unsigned long, _AP_SIZE_long, false) -REF_BIN_OP_WITH_INT_ARITH(ap_slong, _AP_SIZE_ap_slong, true) -REF_BIN_OP_WITH_INT_ARITH(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef REF_BIN_OP_WITH_INT_ARITH - -// bitwise and shift operators -#define REF_BIN_OP_WITH_INT_BITS(C_TYPE, _AP_W2, _AP_S2) \ - REF_BIN_OP_WITH_INT(&, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(|, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(^, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(>>, arg1, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_WITH_INT(<<, arg1, C_TYPE, (_AP_W2), (_AP_S2)) - -REF_BIN_OP_WITH_INT_BITS(bool, 1, false) -REF_BIN_OP_WITH_INT_BITS(char, 8, CHAR_IS_SIGNED) -REF_BIN_OP_WITH_INT_BITS(signed char, 8, true) -REF_BIN_OP_WITH_INT_BITS(unsigned char, 8, false) -REF_BIN_OP_WITH_INT_BITS(short, _AP_SIZE_short, true) -REF_BIN_OP_WITH_INT_BITS(unsigned short, _AP_SIZE_short, false) -REF_BIN_OP_WITH_INT_BITS(int, _AP_SIZE_int, true) -REF_BIN_OP_WITH_INT_BITS(unsigned int, _AP_SIZE_int, false) -REF_BIN_OP_WITH_INT_BITS(long, _AP_SIZE_long, true) -REF_BIN_OP_WITH_INT_BITS(unsigned long, _AP_SIZE_long, false) -REF_BIN_OP_WITH_INT_BITS(ap_slong, _AP_SIZE_ap_slong, true) -REF_BIN_OP_WITH_INT_BITS(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef REF_BIN_OP_WITH_INT_BITS - -/* ap_range_ref with ap_range_ref - * ------------------------------------------------------------ - */ -#define REF_BIN_OP(BIN_OP, RTYPE) \ - template \ - INLINE \ - typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::RTYPE \ - operator BIN_OP(const ap_range_ref<_AP_W, _AP_S>& lhs, \ - const ap_range_ref<_AP_W2, _AP_S2>& rhs) { \ - return (lhs.operator ap_int_base<_AP_W, false>())BIN_OP( \ - rhs.operator ap_int_base<_AP_W2, false>()); \ - } - -REF_BIN_OP(+, plus) -REF_BIN_OP(-, minus) -REF_BIN_OP(*, mult) -REF_BIN_OP(/, div) -REF_BIN_OP(%, mod) -REF_BIN_OP(&, logic) -REF_BIN_OP(|, logic) -REF_BIN_OP(^, logic) -REF_BIN_OP(>>, arg1) -REF_BIN_OP(<<, arg1) - -/* ap_concat_ref with ap_concat_ref. - * ------------------------------------------------------------ - */ - -//************************************************************************ -// Implement -// ap_int_base = ap_concat_ref OP ap_concat_ref -// for operators +, -, *, /, %, >>, <<, &, |, ^ -// Without these operators the operands are converted to int64 and -// larger results lose informations (higher order bits). 
-// -// operand OP -// / | -// left-concat right-concat -// / | / | -// -// -// _AP_LW1, _AP_LT1 (width and type of left-concat's left side) -// _AP_LW2, _AP_LT2 (width and type of left-concat's right side) -// Similarly for RHS of operand OP: _AP_RW1, AP_RW2, _AP_RT1, _AP_RT2 -// -// In Verilog 2001 result of concatenation is always unsigned even -// when both sides are signed. -//************************************************************************ - -#undef SYN_CONCAT_REF_BIN_OP - -#define SYN_CONCAT_REF_BIN_OP(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< \ - _AP_RW1 + _AP_RW2, false>::RTYPE \ - operator BIN_OP( \ - const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, \ - const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { \ - return lhs.get() BIN_OP rhs.get(); \ - } - -SYN_CONCAT_REF_BIN_OP(+, plus) -SYN_CONCAT_REF_BIN_OP(-, minus) -SYN_CONCAT_REF_BIN_OP(*, mult) -SYN_CONCAT_REF_BIN_OP(/, div) -SYN_CONCAT_REF_BIN_OP(%, mod) -SYN_CONCAT_REF_BIN_OP(&, logic) -SYN_CONCAT_REF_BIN_OP(|, logic) -SYN_CONCAT_REF_BIN_OP(^, logic) -SYN_CONCAT_REF_BIN_OP(>>, arg1) -SYN_CONCAT_REF_BIN_OP(<<, arg1) - -#undef SYN_CONCAT_REF_BIN_OP - -#define CONCAT_OP_WITH_INT(C_TYPE, _AP_WI, _AP_SI) \ - template \ - INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ - const ap_int_base<_AP_W, _AP_S> &op1, C_TYPE op2) { \ - ap_int_base<_AP_WI + _AP_W, false> val(op2); \ - ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ - ret <<= _AP_WI; \ - if (_AP_SI) { \ - val <<= _AP_W; \ - val >>= _AP_W; \ - } \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ - C_TYPE op1, const ap_int_base<_AP_W, _AP_S> &op2) { \ - ap_int_base<_AP_WI + _AP_W, false> val(op1); \ - ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ - if (_AP_S) { \ - ret <<= _AP_WI; \ - ret >>= _AP_WI; \ - } \ - ret |= val << _AP_W; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ - const ap_range_ref<_AP_W, _AP_S> &op1, C_TYPE op2) { \ - ap_int_base<_AP_WI + _AP_W, false> val(op2); \ - ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ - ret <<= _AP_WI; \ - if (_AP_SI) { \ - val <<= _AP_W; \ - val >>= _AP_W; \ - } \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ - C_TYPE op1, const ap_range_ref<_AP_W, _AP_S> &op2) { \ - ap_int_base<_AP_WI + _AP_W, false> val(op1); \ - ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ - int len = op2.length(); \ - val <<= len; \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_WI + 1, false> operator,( \ - const ap_bit_ref<_AP_W, _AP_S> &op1, C_TYPE op2) { \ - ap_int_base<_AP_WI + 1, false> val(op2); \ - val[_AP_WI] = op1; \ - return val; \ - } \ - template \ - INLINE ap_int_base<_AP_WI + 1, false> operator,( \ - C_TYPE op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { \ - ap_int_base<_AP_WI + 1, false> val(op1); \ - val <<= 1; \ - val[0] = op2; \ - return val; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_W2 + _AP_WI, false> operator,( \ - const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, C_TYPE op2) { \ - ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> val(op2); \ - ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> ret(op1); \ - if (_AP_SI) { \ - val <<= _AP_W + _AP_W2; \ - val >>= _AP_W + _AP_W2; \ - } \ - ret <<= _AP_WI; \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_W2 + _AP_WI, false> operator,( \ - C_TYPE op1, const 
ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { \ - ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> val(op1); \ - ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> ret(op2); \ - int len = op2.length(); \ - val <<= len; \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, \ - C_TYPE op2) { \ - ap_int_base<_AP_WI + _AP_W, false> val(op2); \ - ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ - if (_AP_SI) { \ - val <<= _AP_W; \ - val >>= _AP_W; \ - } \ - ret <<= _AP_WI; \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ - C_TYPE op1, \ - const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { \ - ap_int_base<_AP_WI + _AP_W, false> val(op1); \ - ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ - int len = op2.length(); \ - val <<= len; \ - ret |= val; \ - return ret; \ - } \ - template \ - INLINE ap_int_base<1 + _AP_WI, false> operator,( \ - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, \ - C_TYPE op2) { \ - ap_int_base<_AP_WI + 1, _AP_SI> val(op2); \ - val[_AP_WI] = op1; \ - return val; \ - } \ - template \ - INLINE ap_int_base<1 + _AP_WI, false> operator,( \ - C_TYPE op1, \ - const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { \ - ap_int_base<_AP_WI + 1, _AP_SI> val(op1); \ - val <<= 1; \ - val[0] = op2; \ - return val; \ - } - -CONCAT_OP_WITH_INT(bool, 1, false) -CONCAT_OP_WITH_INT(char, 8, CHAR_IS_SIGNED) -CONCAT_OP_WITH_INT(signed char, 8, true) -CONCAT_OP_WITH_INT(unsigned char, 8, false) -CONCAT_OP_WITH_INT(short, _AP_SIZE_short, true) -CONCAT_OP_WITH_INT(unsigned short, _AP_SIZE_short, false) -CONCAT_OP_WITH_INT(int, _AP_SIZE_int, true) -CONCAT_OP_WITH_INT(unsigned int, _AP_SIZE_int, false) -CONCAT_OP_WITH_INT(long, _AP_SIZE_long, true) -CONCAT_OP_WITH_INT(unsigned long, _AP_SIZE_long, false) -CONCAT_OP_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) -CONCAT_OP_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) - -#undef CONCAT_OP_WITH_INT - -#define CONCAT_SHIFT_WITH_INT(C_TYPE, OP) \ - template \ - INLINE ap_uint<_AP_W + _AP_W1> operator OP( \ - const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, C_TYPE rhs) { \ - return ap_uint<_AP_W + _AP_W1>(lhs).get() OP int(rhs); \ - } - -// FIXME int(rhs) may loose precision. - -CONCAT_SHIFT_WITH_INT(int, <<) -CONCAT_SHIFT_WITH_INT(unsigned int, <<) -CONCAT_SHIFT_WITH_INT(long, <<) -CONCAT_SHIFT_WITH_INT(unsigned long, <<) -CONCAT_SHIFT_WITH_INT(ap_slong, <<) -CONCAT_SHIFT_WITH_INT(ap_ulong, <<) - -CONCAT_SHIFT_WITH_INT(int, >>) -CONCAT_SHIFT_WITH_INT(unsigned int, >>) -CONCAT_SHIFT_WITH_INT(long, >>) -CONCAT_SHIFT_WITH_INT(unsigned long, >>) -CONCAT_SHIFT_WITH_INT(ap_slong, >>) -CONCAT_SHIFT_WITH_INT(ap_ulong, >>) - -#endif // ifndef __cplusplus -#endif // ifndef __AP_INT_REF_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef __AP_INT_REF_H__ +#define __AP_INT_REF_H__ + +#ifndef __AP_INT_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __cplusplus +#error "C++ is required to include this header file" + +#else + +#ifndef __SYNTHESIS__ +#include +#endif + +/* Concatination reference. + ---------------------------------------------------------------- +*/ +template +struct ap_concat_ref { + enum { + _AP_WR = _AP_W1 + _AP_W2, + }; + + _AP_T1& mbv1; + _AP_T2& mbv2; + + INLINE ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& ref) + : mbv1(ref.mbv1), mbv2(ref.mbv2) {} + + INLINE ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {} + + template + INLINE ap_concat_ref& operator=(const ap_int_base<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> vval(val); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W1, false> Part1; + Part1.V = _AP_ROOT_op_get_range(vval.V, W_ref2, W_ref1 + W_ref2 - 1); + mbv1.set(Part1); + ap_int_base<_AP_W2, false> Part2; + Part2.V = _AP_ROOT_op_get_range(vval.V, 0, W_ref2 - 1); + mbv2.set(Part2); + return *this; + } + + // assign op from hls supported C integral types. + // FIXME disabled to support legacy code directly assign from sc_signal + //template + //INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, + // ap_concat_ref&>::type + //operator=(T val) { + // ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + // return operator=(tmpVal); + //} +#define ASSIGN_WITH_CTYPE(_Tp) \ + INLINE ap_concat_ref& operator=(_Tp val) { \ + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); \ + return operator=(tmpVal); \ + } + + ASSIGN_WITH_CTYPE(bool) + ASSIGN_WITH_CTYPE(char) + ASSIGN_WITH_CTYPE(signed char) + ASSIGN_WITH_CTYPE(unsigned char) + ASSIGN_WITH_CTYPE(short) + ASSIGN_WITH_CTYPE(unsigned short) + ASSIGN_WITH_CTYPE(int) + ASSIGN_WITH_CTYPE(unsigned int) + ASSIGN_WITH_CTYPE(long) + ASSIGN_WITH_CTYPE(unsigned long) + ASSIGN_WITH_CTYPE(ap_slong) + ASSIGN_WITH_CTYPE(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_WITH_CTYPE(half) +#endif + ASSIGN_WITH_CTYPE(float) + ASSIGN_WITH_CTYPE(double) + +#undef ASSIGN_WITH_CTYPE + + // Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. 
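+// [editor's note] Illustrative usage sketch — an editorial addition, not
+// part of the original Xilinx header. A concat reference lets two words be
+// read or written through a single value, high part (mbv1) first:
+//
+//   ap_uint<4> hi = 0, lo = 0;
+//   (hi, lo) = 0xAB;            // writes hi = 0xA, lo = 0xB
+//   ap_uint<8> w = (hi, lo);    // reads back 0xAB via get()
+//
+// The overloads below implement this assignment for the various source
+// types: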
+ INLINE ap_concat_ref& operator=( + const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + INLINE ap_concat_ref& operator=( + const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + INLINE ap_concat_ref& operator=(const ap_bit_ref<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + template + INLINE ap_concat_ref& operator=(const ap_range_ref<_AP_W3, _AP_S3>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> tmpVal(val); + return operator=(tmpVal); + } + + template + INLINE ap_concat_ref& operator=( + const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { + return operator=((const ap_int_base<_AP_W3, false>)(val)); + } + + template + INLINE ap_concat_ref& operator=( + const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& + val) { + return operator=(val.to_ap_int_base()); + } + + template + INLINE ap_concat_ref& operator=( + const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + INLINE operator ap_int_base<_AP_WR, false>() const { return get(); } + + INLINE operator ap_ulong() const { return get().to_uint64(); } + + template + INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_range_ref<_AP_W3, _AP_S3> > + operator,(const ap_range_ref<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_range_ref<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >(*this, a2); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(volatile ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(const ap_int_base<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_int_base<_AP_W3, _AP_S3> > + operator,(const volatile ap_int_base<_AP_W3, _AP_S3> &a2) { + // FIXME op's life does not seem long enough + ap_int_base<_AP_W3, _AP_S3> op(a2); + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, + ap_int_base<_AP_W3, _AP_S3> >( + *this, const_cast&>(op)); + } + + template + INLINE ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> > + operator,(const ap_bit_ref<_AP_W3, _AP_S3> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, 1, ap_bit_ref<_AP_W3, _AP_S3> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, + ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > + operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { + return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, + ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref< + _AP_WR, ap_concat_ref, _AP_W3, + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > + 
operator,( + const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) { + return ap_concat_ref< + _AP_WR, ap_concat_ref, _AP_W3, + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( + *this, + const_cast< + af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_WR, ap_concat_ref, 1, + af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > + operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> + &a2) { + return ap_concat_ref< + _AP_WR, ap_concat_ref, 1, + af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( + *this, + const_cast&>( + a2)); + } + + template + INLINE ap_int_base operator&( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() & a2; + } + + template + INLINE ap_int_base operator|( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() | a2; + } + + template + INLINE ap_int_base operator^( + const ap_int_base<_AP_W3, _AP_S3>& a2) { + return get() ^ a2; + } + +#if 0 + template + INLINE ap_int_base slice() { + ap_int_base<_AP_WR, false> bv = get(); + return bv.slice(); + } +#endif + + INLINE ap_int_base<_AP_WR, false> get() const { + ap_int_base<_AP_WR, false> tmpVal(0); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W2, false> v2(mbv2); + ap_int_base<_AP_W1, false> v1(mbv1); + tmpVal.V = _AP_ROOT_op_set_range(tmpVal.V, 0, W_ref2 - 1, v2.V); + tmpVal.V = + _AP_ROOT_op_set_range(tmpVal.V, W_ref2, W_ref1 + W_ref2 - 1, v1.V); + return tmpVal; + } + + template + INLINE void set(const ap_int_base<_AP_W3, false>& val) { + ap_int_base<_AP_W1 + _AP_W2, false> vval(val); + int W_ref1 = mbv1.length(); + int W_ref2 = mbv2.length(); + ap_int_base<_AP_W1, false> tmpVal1; + tmpVal1.V = _AP_ROOT_op_get_range(vval.V, W_ref2, W_ref1 + W_ref2 - 1); + mbv1.set(tmpVal1); + ap_int_base<_AP_W2, false> tmpVal2; + tmpVal2.V = _AP_ROOT_op_get_range(vval.V, 0, W_ref2 - 1); + mbv2.set(tmpVal2); + } + + INLINE int length() const { return mbv1.length() + mbv2.length(); } +}; // struct ap_concat_ref + +/* Range (slice) reference. + ---------------------------------------------------------------- +*/ +template +struct ap_range_ref { + // struct ssdm_int or its sim model. + // TODO make it possible to reference to ap_fixed_base/ap_fixed/ap_ufixed + // and then we can retire af_range_ref. + typedef ap_int_base<_AP_W, _AP_S> ref_type; + ref_type& d_bv; + int l_index; + int h_index; + + public: + INLINE ap_range_ref(const ap_range_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + INLINE ap_range_ref(ref_type* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) {} + + INLINE ap_range_ref(const ref_type* bv, int h, int l) + : d_bv(*const_cast(bv)), l_index(l), h_index(h) {} + + INLINE operator ap_int_base<_AP_W, false>() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + INLINE operator ap_ulong() const { return to_uint64(); } + + /// @name assign operators + // @{ + + // FIXME disabled to work-around lagacy code assigning from sc_signal, + // which dependes on implicit type conversion. + // + // /// assign from hls supported C integral types. 
+ // template + // INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, + // ap_range_ref&>::type + // operator=(T val) { + // ap_int_base<_AP_W, false> tmp(val); + // d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + // return *this; + // } +#define ASSIGN_WITH_CTYPE(_Tp) \ + INLINE ap_range_ref& operator=(_Tp val) { \ + ap_int_base<_AP_W, false> tmp(val); \ + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); \ + return *this; \ + } + + ASSIGN_WITH_CTYPE(bool) + ASSIGN_WITH_CTYPE(char) + ASSIGN_WITH_CTYPE(signed char) + ASSIGN_WITH_CTYPE(unsigned char) + ASSIGN_WITH_CTYPE(short) + ASSIGN_WITH_CTYPE(unsigned short) + ASSIGN_WITH_CTYPE(int) + ASSIGN_WITH_CTYPE(unsigned int) + ASSIGN_WITH_CTYPE(long) + ASSIGN_WITH_CTYPE(unsigned long) + ASSIGN_WITH_CTYPE(ap_slong) + ASSIGN_WITH_CTYPE(ap_ulong) +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_WITH_CTYPE(half) +#endif + ASSIGN_WITH_CTYPE(float) + ASSIGN_WITH_CTYPE(double) + +#undef ASSIGN_WITH_CTYPE + + /// assign using string. XXX crucial for cosim. + INLINE ap_range_ref& operator=(const char* val) { + const ap_int_base<_AP_W, false> tmp(val); // XXX figure out radix + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + return *this; + } + + /// assign from ap_int_base. + template + INLINE ap_range_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + ap_int_base<_AP_W, false> tmp(val); + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, tmp.V); + return *this; + } + + /// copy assign operator + // XXX Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. + INLINE ap_range_ref& operator=(const ap_range_ref& val) { + return operator=((const ap_int_base<_AP_W, false>)val); + } + + /// assign from range reference to ap_int_base. + template + INLINE ap_range_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + /// assign from bit reference to ap_int_base. + template + INLINE ap_range_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + /// assign from ap_fixed_base. + template + INLINE ap_range_ref& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& + val) { + return operator=(val.to_ap_int_base()); + } + + /// assign from range reference to ap_fixed_base. + template + INLINE ap_range_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + /// assign from bit reference to ap_fixed_base. + template + INLINE ap_range_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((ap_ulong)(bool)(val)); + } + + /// assign from compound reference. 
+ template + INLINE ap_range_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)(val)); + } + // @} + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_range_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >(*this, a2); + } + + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W, ap_int_base<_AP_W, _AP_S> > + operator,(ap_int_base<_AP_W, _AP_S>& a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W, + ap_int_base<_AP_W, _AP_S> >(*this, a2); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2, + ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > + operator,(const ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<_AP_W, ap_range_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref< + _AP_W, ap_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> a2) { + return ap_concat_ref< + _AP_W, ap_range_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + INLINE + ap_concat_ref<_AP_W, ap_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> + &a2) { + return ap_concat_ref< + _AP_W, ap_range_ref, 1, + af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + a2)); + } + + template + INLINE bool operator==(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop == hop; + } + + template + INLINE bool operator!=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator==(op2)); + } + + template + INLINE bool operator<(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + 
ap_int_base<_AP_W2, false> hop(op2); + return lop < hop; + } + + template + INLINE bool operator<=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + ap_int_base<_AP_W, false> lop(*this); + ap_int_base<_AP_W2, false> hop(op2); + return lop <= hop; + } + + template + INLINE bool operator>(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<=(op2)); + } + + template + INLINE bool operator>=(const ap_range_ref<_AP_W2, _AP_S2>& op2) { + return !(operator<(op2)); + } + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator|=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V |= (op2.d_bv).V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator|=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V |= op2.V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator&=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V &= (op2.d_bv).V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator&=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V &= op2.V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator^=( + const ap_range_ref<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V ^= (op2.d_bv).V; + return *this; + }; + + template + INLINE ap_range_ref<_AP_W, _AP_S>& operator^=( + const ap_int_base<_AP_W2, _AP_S2>& op2) { + (this->d_bv).V ^= op2.V; + return *this; + }; + + INLINE ap_int_base<_AP_W, false> get() const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret; + } + + template + INLINE void set(const ap_int_base<_AP_W2, false>& val) { + d_bv.V = _AP_ROOT_op_set_range(d_bv.V, l_index, h_index, val.V); + } + + INLINE int length() const { + return h_index >= l_index ? h_index - l_index + 1 : l_index - h_index + 1; + } + + INLINE int to_int() const { + return (int)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned to_uint() const { + return (unsigned)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE long to_long() const { + return (long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE unsigned long to_ulong() const { + return (unsigned long)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_slong to_int64() const { + return (ap_slong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE ap_ulong to_uint64() const { + return (ap_ulong)(_AP_ROOT_op_get_range(d_bv.V, l_index, h_index)); + } + + INLINE bool and_reduce() const { + bool ret = true; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) { +#ifdef __SYNTHESIS__ +#pragma HLS unroll +#endif + ret &= _AP_ROOT_op_get_bit(d_bv.V, i); + } + return ret; + } + + INLINE bool or_reduce() const { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? l_index : h_index; + for (unsigned i = low; i != high; ++i) { +#ifdef __SYNTHESIS__ +#pragma HLS unroll +#endif + ret |= _AP_ROOT_op_get_bit(d_bv.V, i); + } + return ret; + } + + INLINE bool xor_reduce() const { + bool ret = false; + bool reverse = l_index > h_index; + unsigned low = reverse ? h_index : l_index; + unsigned high = reverse ? 
l_index : h_index; + for (unsigned i = low; i != high; ++i) { +#ifdef __SYNTHESIS__ +#pragma HLS unroll +#endif + ret ^= _AP_ROOT_op_get_bit(d_bv.V, i); + } + return ret; + } +#ifndef __SYNTHESIS__ + std::string to_string(signed char radix = 2) const { + ap_int_base<_AP_W, false> ret; + ret.V = _AP_ROOT_op_get_range(d_bv.V, l_index, h_index); + return ret.to_string(radix); + } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string(signed char radix = 2) const { + return 0; + } +#endif +}; // struct ap_range_ref + +// XXX apcc cannot handle global std::ios_base::Init() brought in by +#ifndef AP_AUTOCC +#ifndef __SYNTHESIS__ +template +INLINE std::ostream& operator<<(std::ostream& os, + const ap_range_ref<_AP_W, _AP_S>& x) { + std::ios_base::fmtflags ff = std::cout.flags(); + if (ff & std::cout.hex) { + os << x.to_string(16); // don't print sign + } else if (ff & std::cout.oct) { + os << x.to_string(8); // don't print sign + } else { + os << x.to_string(10); + } + return os; +} +#endif // ifndef __SYNTHESIS__ + +#ifndef __SYNTHESIS__ +template +INLINE std::istream& operator>>(std::istream& in, + ap_range_ref<_AP_W, _AP_S>& op) { + std::string str; + in >> str; + op = ap_int_base<_AP_W, _AP_S>(str.c_str()); + return in; +} +#endif // ifndef __SYNTHESIS__ +#endif // ifndef AP_AUTOCC + +/* Bit reference. + ---------------------------------------------------------------- +*/ +template +struct ap_bit_ref { + // struct ssdm_int or its sim model. + // TODO make it possible to reference to ap_fixed_base/ap_fixed/ap_ufixed + // and then we can retire af_bit_ref. + typedef ap_int_base<_AP_W, _AP_S> ref_type; + ref_type& d_bv; + int d_index; + + public: + // copy ctor + INLINE ap_bit_ref(const ap_bit_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), d_index(ref.d_index) {} + + INLINE ap_bit_ref(ref_type* bv, int index = 0) : d_bv(*bv), d_index(index) {} + + INLINE ap_bit_ref(const ref_type* bv, int index = 0) + : d_bv(*const_cast(bv)), d_index(index) {} + + INLINE operator bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + INLINE bool to_bool() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + // assign op from hls supported C integral types. + // FIXME disabled to support sc_signal. + // NOTE this used to be unsigned long long. 
+ //template + //INLINE typename _ap_type::enable_if<_ap_type::is_integral::value, + // ap_bit_ref&>::type + //operator=(T val) { + // d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); + // return *this; + //} +#define ASSIGN_WITH_CTYPE(_Tp) \ + INLINE ap_bit_ref& operator=(_Tp val) { \ + d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index, val); \ + return *this; \ + } + + ASSIGN_WITH_CTYPE(bool) + ASSIGN_WITH_CTYPE(char) + ASSIGN_WITH_CTYPE(signed char) + ASSIGN_WITH_CTYPE(unsigned char) + ASSIGN_WITH_CTYPE(short) + ASSIGN_WITH_CTYPE(unsigned short) + ASSIGN_WITH_CTYPE(int) + ASSIGN_WITH_CTYPE(unsigned int) + ASSIGN_WITH_CTYPE(long) + ASSIGN_WITH_CTYPE(unsigned long) + ASSIGN_WITH_CTYPE(ap_slong) + ASSIGN_WITH_CTYPE(ap_ulong) + +#undef ASSIGN_WITH_CTYPE + +#define ASSIGN_WITH_CTYPE_FP(_Tp) \ + INLINE ap_bit_ref& operator=(_Tp val) { \ + bool tmp_val = val; \ + d_bv.V = _AP_ROOT_op_set_bit(d_bv.V, d_index,tmp_val); \ + return *this; \ + } + +#if _AP_ENABLE_HALF_ == 1 + ASSIGN_WITH_CTYPE_FP(half) +#endif + ASSIGN_WITH_CTYPE_FP(float) + ASSIGN_WITH_CTYPE_FP(double) + +#undef ASSIGN_WITH_CTYPE_FP + + + template + INLINE ap_bit_ref& operator=(const ap_int_base<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(val.V != 0)); + } + + template + INLINE ap_bit_ref& operator=(const ap_range_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_int_base<_AP_W2, false>)val); + } + + // Be explicit to prevent it from being deleted, as field d_bv + // is of reference type. + INLINE ap_bit_ref& operator=(const ap_bit_ref& val) { + return operator=((ap_ulong)(bool)val); + } + + template + INLINE ap_bit_ref& operator=(const ap_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((ap_ulong)(bool)val); + } + + template + INLINE ap_bit_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((const ap_int_base<_AP_W2, false>)val); + } + + template + INLINE ap_bit_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((ap_ulong)(bool)val); + } + + template + INLINE ap_bit_ref& operator=( + const ap_concat_ref<_AP_W2, _AP_T3, _AP_W3, _AP_T3>& val) { + return operator=((const ap_int_base<_AP_W2 + _AP_W3, false>)val); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, a2); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const ap_int_base<_AP_W2, _AP_S2> &a2) { + ap_int_base<_AP_W2, _AP_S2> op(a2); + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> > + operator,(const volatile ap_int_base<_AP_W2, _AP_S2> &a2) { + ap_int_base<_AP_W2, _AP_S2> op(a2); + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_int_base<_AP_W2, _AP_S2> >( + *this, const_cast&>(op)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> > + operator,(const ap_range_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2, ap_range_ref<_AP_W2, _AP_S2> >( + *this, 
const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> > operator,( + const ap_bit_ref<_AP_W2, _AP_S2> &a2) { + return ap_concat_ref<1, ap_bit_ref, 1, ap_bit_ref<_AP_W2, _AP_S2> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > + operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { + return ap_concat_ref<1, ap_bit_ref, _AP_W2 + _AP_W3, + ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( + *this, const_cast&>(a2)); + } + + template + INLINE ap_concat_ref< + 1, ap_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref< + 1, ap_bit_ref, _AP_W2, + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast< + af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); + } + + template + INLINE ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> > + operator,( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { + return ap_concat_ref<1, ap_bit_ref, 1, af_bit_ref<_AP_W2, _AP_I2, _AP_S2, + _AP_Q2, _AP_O2, _AP_N2> >( + *this, + const_cast&>( + a2)); + } + + template + INLINE bool operator==(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + return get() == op.get(); + } + + template + INLINE bool operator!=(const ap_bit_ref<_AP_W2, _AP_S2>& op) { + return get() != op.get(); + } + + INLINE bool get() const { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + INLINE bool get() { return _AP_ROOT_op_get_bit(d_bv.V, d_index); } + + template + INLINE void set(const ap_int_base<_AP_W3, false>& val) { + operator=(val); + } + + INLINE bool operator~() const { + bool bit = _AP_ROOT_op_get_bit(d_bv.V, d_index); + return bit ? false : true; + } + + INLINE int length() const { return 1; } + +#ifndef __SYNTHESIS__ + std::string to_string() const { return get() ? "1" : "0"; } +#else + // XXX HLS will delete this in synthesis + INLINE char* to_string() const { return 0; } +#endif +}; // struct ap_bit_ref + +/* ap_range_ref with int. + * ------------------------------------------------------------ + */ +// equality and relational operators. 
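+// [editor's note] Illustrative sketch — an editorial addition, not part of
+// the original header. The macros below generate comparisons such as:
+//
+//   ap_uint<8> v = 0x5A;
+//   bool a = (v.range(7, 4) == 5);   // ap_range_ref REL_OP C_TYPE
+//   bool b = (v[1] != 0);            // ap_bit_ref REL_OP C_TYPE
+//
+// For the range-reference overloads both sides are first widened to
+// ap_int_base, so no bits are silently truncated; bit references compare
+// through bool.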
+#define REF_REL_OP_WITH_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(const ap_range_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return ap_int_base<_AP_W, false>(op) \ + REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_bit_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return bool(op) REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const ap_bit_ref<_AP_W, _AP_S>& op) { \ + return op2 REL_OP bool(op); \ + } \ + template \ + INLINE bool operator REL_OP( \ + const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1>& op, C_TYPE op2) { \ + return ap_int_base<_AP_W + _AP_W1, false>(op) \ + REL_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } + +// Make the line shorter than 5000 chars +#define REF_REL_WITH_INT_1(C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(>, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(<, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(>=, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(<=, C_TYPE, _AP_WI, _AP_SI) + +REF_REL_WITH_INT_1(bool, 1, false) +REF_REL_WITH_INT_1(char, 8, CHAR_IS_SIGNED) +REF_REL_WITH_INT_1(signed char, 8, true) +REF_REL_WITH_INT_1(unsigned char, 8, false) +REF_REL_WITH_INT_1(short, _AP_SIZE_short, true) +REF_REL_WITH_INT_1(unsigned short, _AP_SIZE_short, false) +REF_REL_WITH_INT_1(int, _AP_SIZE_int, true) +REF_REL_WITH_INT_1(unsigned int, _AP_SIZE_int, false) +REF_REL_WITH_INT_1(long, _AP_SIZE_long, true) +REF_REL_WITH_INT_1(unsigned long, _AP_SIZE_long, false) +REF_REL_WITH_INT_1(ap_slong, _AP_SIZE_ap_slong, true) +REF_REL_WITH_INT_1(ap_ulong, _AP_SIZE_ap_slong, false) + +// Make the line shorter than 5000 chars +#define REF_REL_WITH_INT_2(C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(==, C_TYPE, _AP_WI, _AP_SI) \ + REF_REL_OP_WITH_INT(!=, C_TYPE, _AP_WI, _AP_SI) + +REF_REL_WITH_INT_2(bool, 1, false) +REF_REL_WITH_INT_2(char, 8, CHAR_IS_SIGNED) +REF_REL_WITH_INT_2(signed char, 8, true) +REF_REL_WITH_INT_2(unsigned char, 8, false) +REF_REL_WITH_INT_2(short, _AP_SIZE_short, true) +REF_REL_WITH_INT_2(unsigned short, _AP_SIZE_short, false) +REF_REL_WITH_INT_2(int, _AP_SIZE_int, true) +REF_REL_WITH_INT_2(unsigned int, _AP_SIZE_int, false) +REF_REL_WITH_INT_2(long, _AP_SIZE_long, true) +REF_REL_WITH_INT_2(unsigned long, _AP_SIZE_long, false) +REF_REL_WITH_INT_2(ap_slong, _AP_SIZE_ap_slong, true) +REF_REL_WITH_INT_2(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef REF_REL_OP_WITH_INT +#undef REF_REL_WITH_INT_1 +#undef REF_REL_WITH_INT_2 + +#define REF_BIN_OP_WITH_INT(BIN_OP, RTYPE, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE typename ap_int_base<_AP_W, false>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_range_ref<_AP_W, _AP_S>& op, C_TYPE op2) { \ + return ap_int_base<_AP_W, false>(op) \ + BIN_OP ap_int_base<_AP_W2, _AP_S2>(op2); \ + } \ + template \ + INLINE typename ap_int_base<_AP_W2, _AP_S2>::template RType<_AP_W, \ + false>::RTYPE \ + operator BIN_OP(C_TYPE op2, const ap_range_ref<_AP_W, _AP_S>& op) { \ + return ap_int_base<_AP_W2, _AP_S2>(op2) \ + BIN_OP ap_int_base<_AP_W, false>(op); \ + } + +// arithmetic operators. 
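+// [editor's note] Illustrative sketch — an editorial addition, not part of
+// the original header. These overloads let a slice take part in arithmetic
+// directly, with the result width chosen by ap_int_base's RType rules:
+//
+//   ap_uint<8> v = 0x34;
+//   int sum = v.range(7, 4) + 10;    // 0x3 + 10 == 13
+//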
+#define REF_BIN_OP_WITH_INT_ARITH(C_TYPE, _AP_W2, _AP_S2) \ + REF_BIN_OP_WITH_INT(+, plus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(-, minus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(*, mult, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(/, div, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(%, mod, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_BIN_OP_WITH_INT_ARITH(bool, 1, false) +REF_BIN_OP_WITH_INT_ARITH(char, 8, CHAR_IS_SIGNED) +REF_BIN_OP_WITH_INT_ARITH(signed char, 8, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned char, 8, false) +REF_BIN_OP_WITH_INT_ARITH(short, _AP_SIZE_short, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned short, _AP_SIZE_short, false) +REF_BIN_OP_WITH_INT_ARITH(int, _AP_SIZE_int, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned int, _AP_SIZE_int, false) +REF_BIN_OP_WITH_INT_ARITH(long, _AP_SIZE_long, true) +REF_BIN_OP_WITH_INT_ARITH(unsigned long, _AP_SIZE_long, false) +REF_BIN_OP_WITH_INT_ARITH(ap_slong, _AP_SIZE_ap_slong, true) +REF_BIN_OP_WITH_INT_ARITH(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef REF_BIN_OP_WITH_INT_ARITH + +// bitwise and shift operators +#define REF_BIN_OP_WITH_INT_BITS(C_TYPE, _AP_W2, _AP_S2) \ + REF_BIN_OP_WITH_INT(&, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(|, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(^, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(>>, arg1, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_WITH_INT(<<, arg1, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_BIN_OP_WITH_INT_BITS(bool, 1, false) +REF_BIN_OP_WITH_INT_BITS(char, 8, CHAR_IS_SIGNED) +REF_BIN_OP_WITH_INT_BITS(signed char, 8, true) +REF_BIN_OP_WITH_INT_BITS(unsigned char, 8, false) +REF_BIN_OP_WITH_INT_BITS(short, _AP_SIZE_short, true) +REF_BIN_OP_WITH_INT_BITS(unsigned short, _AP_SIZE_short, false) +REF_BIN_OP_WITH_INT_BITS(int, _AP_SIZE_int, true) +REF_BIN_OP_WITH_INT_BITS(unsigned int, _AP_SIZE_int, false) +REF_BIN_OP_WITH_INT_BITS(long, _AP_SIZE_long, true) +REF_BIN_OP_WITH_INT_BITS(unsigned long, _AP_SIZE_long, false) +REF_BIN_OP_WITH_INT_BITS(ap_slong, _AP_SIZE_ap_slong, true) +REF_BIN_OP_WITH_INT_BITS(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef REF_BIN_OP_WITH_INT_BITS + +/* ap_range_ref with ap_range_ref + * ------------------------------------------------------------ + */ +#define REF_BIN_OP(BIN_OP, RTYPE) \ + template \ + INLINE \ + typename ap_int_base<_AP_W, false>::template RType<_AP_W2, false>::RTYPE \ + operator BIN_OP(const ap_range_ref<_AP_W, _AP_S>& lhs, \ + const ap_range_ref<_AP_W2, _AP_S2>& rhs) { \ + return (lhs.operator ap_int_base<_AP_W, false>())BIN_OP( \ + rhs.operator ap_int_base<_AP_W2, false>()); \ + } + +REF_BIN_OP(+, plus) +REF_BIN_OP(-, minus) +REF_BIN_OP(*, mult) +REF_BIN_OP(/, div) +REF_BIN_OP(%, mod) +REF_BIN_OP(&, logic) +REF_BIN_OP(|, logic) +REF_BIN_OP(^, logic) +REF_BIN_OP(>>, arg1) +REF_BIN_OP(<<, arg1) + +/* ap_concat_ref with ap_concat_ref. + * ------------------------------------------------------------ + */ + +//************************************************************************ +// Implement +// ap_int_base = ap_concat_ref OP ap_concat_ref +// for operators +, -, *, /, %, >>, <<, &, |, ^ +// Without these operators the operands are converted to int64 and +// larger results lose informations (higher order bits). 
+// +// operand OP +// / | +// left-concat right-concat +// / | / | +// +// +// _AP_LW1, _AP_LT1 (width and type of left-concat's left side) +// _AP_LW2, _AP_LT2 (width and type of left-concat's right side) +// Similarly for RHS of operand OP: _AP_RW1, AP_RW2, _AP_RT1, _AP_RT2 +// +// In Verilog 2001 result of concatenation is always unsigned even +// when both sides are signed. +//************************************************************************ + +#undef SYN_CONCAT_REF_BIN_OP + +#define SYN_CONCAT_REF_BIN_OP(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_int_base<_AP_LW1 + _AP_LW2, false>::template RType< \ + _AP_RW1 + _AP_RW2, false>::RTYPE \ + operator BIN_OP( \ + const ap_concat_ref<_AP_LW1, _AP_LT1, _AP_LW2, _AP_LT2>& lhs, \ + const ap_concat_ref<_AP_RW1, _AP_RT1, _AP_RW2, _AP_RT2>& rhs) { \ + return lhs.get() BIN_OP rhs.get(); \ + } + +SYN_CONCAT_REF_BIN_OP(+, plus) +SYN_CONCAT_REF_BIN_OP(-, minus) +SYN_CONCAT_REF_BIN_OP(*, mult) +SYN_CONCAT_REF_BIN_OP(/, div) +SYN_CONCAT_REF_BIN_OP(%, mod) +SYN_CONCAT_REF_BIN_OP(&, logic) +SYN_CONCAT_REF_BIN_OP(|, logic) +SYN_CONCAT_REF_BIN_OP(^, logic) +SYN_CONCAT_REF_BIN_OP(>>, arg1) +SYN_CONCAT_REF_BIN_OP(<<, arg1) + +#undef SYN_CONCAT_REF_BIN_OP + +#define CONCAT_OP_WITH_INT(C_TYPE, _AP_WI, _AP_SI) \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + const ap_int_base<_AP_W, _AP_S> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op2); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ + ret <<= _AP_WI; \ + if (_AP_SI) { \ + val <<= _AP_W; \ + val >>= _AP_W; \ + } \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + C_TYPE op1, const ap_int_base<_AP_W, _AP_S> &op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op1); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ + if (_AP_S) { \ + ret <<= _AP_WI; \ + ret >>= _AP_WI; \ + } \ + ret |= val << _AP_W; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + const ap_range_ref<_AP_W, _AP_S> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op2); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ + ret <<= _AP_WI; \ + if (_AP_SI) { \ + val <<= _AP_W; \ + val >>= _AP_W; \ + } \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + C_TYPE op1, const ap_range_ref<_AP_W, _AP_S> &op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op1); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ + int len = op2.length(); \ + val <<= len; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_WI + 1, false> operator,( \ + const ap_bit_ref<_AP_W, _AP_S> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + 1, false> val(op2); \ + val[_AP_WI] = op1; \ + return val; \ + } \ + template \ + INLINE ap_int_base<_AP_WI + 1, false> operator,( \ + C_TYPE op1, const ap_bit_ref<_AP_W, _AP_S> &op2) { \ + ap_int_base<_AP_WI + 1, false> val(op1); \ + val <<= 1; \ + val[0] = op2; \ + return val; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_W2 + _AP_WI, false> operator,( \ + const ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op1, C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> val(op2); \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> ret(op1); \ + if (_AP_SI) { \ + val <<= _AP_W + _AP_W2; \ + val >>= _AP_W + _AP_W2; \ + } \ + ret <<= _AP_WI; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_W2 + _AP_WI, false> operator,( \ + C_TYPE op1, const 
ap_concat_ref<_AP_W, _AP_T, _AP_W2, _AP_T2> &op2) { \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> val(op1); \ + ap_int_base<_AP_WI + _AP_W + _AP_W2, _AP_SI> ret(op2); \ + int len = op2.length(); \ + val <<= len; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, \ + C_TYPE op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op2); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op1); \ + if (_AP_SI) { \ + val <<= _AP_W; \ + val >>= _AP_W; \ + } \ + ret <<= _AP_WI; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<_AP_W + _AP_WI, false> operator,( \ + C_TYPE op1, \ + const af_range_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { \ + ap_int_base<_AP_WI + _AP_W, false> val(op1); \ + ap_int_base<_AP_WI + _AP_W, false> ret(op2); \ + int len = op2.length(); \ + val <<= len; \ + ret |= val; \ + return ret; \ + } \ + template \ + INLINE ap_int_base<1 + _AP_WI, false> operator,( \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op1, \ + C_TYPE op2) { \ + ap_int_base<_AP_WI + 1, _AP_SI> val(op2); \ + val[_AP_WI] = op1; \ + return val; \ + } \ + template \ + INLINE ap_int_base<1 + _AP_WI, false> operator,( \ + C_TYPE op1, \ + const af_bit_ref<_AP_W, _AP_I, _AP_S, _AP_Q, _AP_O, _AP_N> &op2) { \ + ap_int_base<_AP_WI + 1, _AP_SI> val(op1); \ + val <<= 1; \ + val[0] = op2; \ + return val; \ + } + +CONCAT_OP_WITH_INT(bool, 1, false) +CONCAT_OP_WITH_INT(char, 8, CHAR_IS_SIGNED) +CONCAT_OP_WITH_INT(signed char, 8, true) +CONCAT_OP_WITH_INT(unsigned char, 8, false) +CONCAT_OP_WITH_INT(short, _AP_SIZE_short, true) +CONCAT_OP_WITH_INT(unsigned short, _AP_SIZE_short, false) +CONCAT_OP_WITH_INT(int, _AP_SIZE_int, true) +CONCAT_OP_WITH_INT(unsigned int, _AP_SIZE_int, false) +CONCAT_OP_WITH_INT(long, _AP_SIZE_long, true) +CONCAT_OP_WITH_INT(unsigned long, _AP_SIZE_long, false) +CONCAT_OP_WITH_INT(ap_slong, _AP_SIZE_ap_slong, true) +CONCAT_OP_WITH_INT(ap_ulong, _AP_SIZE_ap_slong, false) + +#undef CONCAT_OP_WITH_INT + +#define CONCAT_SHIFT_WITH_INT(C_TYPE, OP) \ + template \ + INLINE ap_uint<_AP_W + _AP_W1> operator OP( \ + const ap_concat_ref<_AP_W, _AP_T, _AP_W1, _AP_T1> lhs, C_TYPE rhs) { \ + return ap_uint<_AP_W + _AP_W1>(lhs).get() OP int(rhs); \ + } + +// FIXME int(rhs) may loose precision. + +CONCAT_SHIFT_WITH_INT(int, <<) +CONCAT_SHIFT_WITH_INT(unsigned int, <<) +CONCAT_SHIFT_WITH_INT(long, <<) +CONCAT_SHIFT_WITH_INT(unsigned long, <<) +CONCAT_SHIFT_WITH_INT(ap_slong, <<) +CONCAT_SHIFT_WITH_INT(ap_ulong, <<) + +CONCAT_SHIFT_WITH_INT(int, >>) +CONCAT_SHIFT_WITH_INT(unsigned int, >>) +CONCAT_SHIFT_WITH_INT(long, >>) +CONCAT_SHIFT_WITH_INT(unsigned long, >>) +CONCAT_SHIFT_WITH_INT(ap_slong, >>) +CONCAT_SHIFT_WITH_INT(ap_ulong, >>) + +#endif // ifndef __cplusplus +#endif // ifndef __AP_INT_REF_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/ap_int_special.h b/hls4ml/templates/vivado/ap_types/ap_int_special.h index a80a851854..3afc6192ba 100644 --- a/hls4ml/templates/vivado/ap_types/ap_int_special.h +++ b/hls4ml/templates/vivado/ap_types/ap_int_special.h @@ -1,223 +1,223 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_INT_SPECIAL_H__ -#define __AP_INT_SPECIAL_H__ - -#ifndef __AP_INT_H__ -#error "Only ap_fixed.h and ap_int.h can be included directly in user code." -#endif - -#ifndef __SYNTHESIS__ -#include -#include -#endif -// FIXME AP_AUTOCC cannot handle many standard headers, so declare instead of -// include. -// #include -namespace std { -template class complex; -} - -/* - TODO: Modernize the code using C++11/C++14 - 1. constexpr http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0415r0.html - 2. move constructor -*/ - -namespace std { -/* - Specialize std::complex to zero initialization ap_int. - - To reduce the area cost, ap_int is not zero initialized, just like basic - types float or double. However, libstdc++ provides specialization for float, - double and long double, initializing image part to 0 when not specified. - - This has become a difficulty in switching legacy code from these C types to - ap_int. To ease the tranform of legacy code, we have to implement - specialization of std::complex<> for our type. - - As ap_int is a template, it is impossible to specialize only the methods - that causes default initialization of value type in std::complex<>. An - explicit full specialization of the template class has to be done, covering - all the member functions and operators of std::complex<> as specified - in standard 26.2.4 and 26.2.5. -*/ -template -class complex > { - public: - typedef ap_int<_AP_W> _Tp; - typedef _Tp value_type; - - // 26.2.4/1 - // Constructor without argument - // Default initialize, so that in dataflow, the variable is only written once. - complex() : _M_real(_Tp()), _M_imag(_Tp()) {} - // Constructor with ap_int. - // Zero initialize image part when not specified, so that `C(1) == C(1,0)` - complex(const _Tp &__r, const _Tp &__i = _Tp(0)) - : _M_real(__r), _M_imag(__i) {} - - // Constructor with another complex number - template - complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {} - -#if __cplusplus >= 201103L - const _Tp& real() const { return _M_real; } - const _Tp& imag() const { return _M_imag; } -#else - _Tp& real() { return _M_real; } - const _Tp& real() const { return _M_real; } - _Tp& imag() { return _M_imag; } - const _Tp& imag() const { return _M_imag; } -#endif - - void real(_Tp __val) { _M_real = __val; } - - void imag(_Tp __val) { _M_imag = __val; } - - // Assign this complex number with ap_int. - // Zero initialize image poarrt, so that `C c; c = 1; c == C(1,0);` - complex<_Tp> &operator=(const _Tp __t) { - _M_real = __t; - _M_imag = _Tp(0); - return *this; - } - - // 26.2.5/1 - // Add ap_int to this complex number. - complex<_Tp> &operator+=(const _Tp &__t) { - _M_real += __t; - return *this; - } - - // 26.2.5/3 - // Subtract ap_int from this complex number. - complex<_Tp> &operator-=(const _Tp &__t) { - _M_real -= __t; - return *this; - } - - // 26.2.5/5 - // Multiply this complex number by ap_int. - complex<_Tp> &operator*=(const _Tp &__t) { - _M_real *= __t; - _M_imag *= __t; - return *this; - } - - // 26.2.5/7 - // Divide this complex number by ap_int. 
- complex<_Tp> &operator/=(const _Tp &__t) { - _M_real /= __t; - _M_imag /= __t; - return *this; - } - - // Assign complex number to this complex number. - template - complex<_Tp> &operator=(const complex<_Up> &__z) { - _M_real = __z.real(); - _M_imag = __z.imag(); - return *this; - } - - // 26.2.5/9 - // Add complex number to this. - template - complex<_Tp> &operator+=(const complex<_Up> &__z) { - _M_real += __z.real(); - _M_imag += __z.imag(); - return *this; - } - - // 26.2.5/11 - // Subtract complex number from this. - template - complex<_Tp> &operator-=(const complex<_Up> &__z) { - _M_real -= __z.real(); - _M_imag -= __z.imag(); - return *this; - } - - // 26.2.5/13 - // Multiply this by complex number. - template - complex<_Tp> &operator*=(const complex<_Up> &__z) { - const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag(); - _M_imag = _M_real * __z.imag() + _M_imag * __z.real(); - _M_real = __r; - return *this; - } - - // 26.2.5/15 - // Divide this by complex number. - template - complex<_Tp> &operator/=(const complex<_Up> &__z) { - complex<_Tp> cj (__z.real(), -__z.imag()); - complex<_Tp> a = (*this) * cj; - complex<_Tp> b = cj * __z; - _M_real = a.real() / b.real(); - _M_imag = a.imag() / b.real(); - return *this; - } - - private: - _Tp _M_real; - _Tp _M_imag; - -}; // class complex > - - -/* - Non-member operations - These operations are not required by standard in 26.2.6, but libstdc++ - defines them for - float, double or long double's specialization. -*/ -// Compare complex number with ap_int. -template -inline bool operator==(const complex > &__x, const ap_int<_AP_W> &__y) { - return __x.real() == __y && - __x.imag() == 0; -} - -// Compare ap_int with complex number. -template -inline bool operator==(const ap_int<_AP_W> &__x, const complex > &__y) { - return __x == __y.real() && - 0 == __y.imag(); -} - -// Compare complex number with ap_int. -template -inline bool operator!=(const complex > &__x, const ap_int<_AP_W> &__y) { - return __x.real() != __y || - __x.imag() != 0; -} - -// Compare ap_int with complex number. -template -inline bool operator!=(const ap_int<_AP_W> &__x, const complex > &__y) { - return __x != __y.real() || - 0 != __y.imag(); -} - -} // namespace std - -#endif // ifndef __AP_INT_SPECIAL_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_INT_SPECIAL_H__ +#define __AP_INT_SPECIAL_H__ + +#ifndef __AP_INT_H__ +#error "Only ap_fixed.h and ap_int.h can be included directly in user code." +#endif + +#ifndef __SYNTHESIS__ +#include +#include +#endif +// FIXME AP_AUTOCC cannot handle many standard headers, so declare instead of +// include. +// #include +namespace std { +template class complex; +} + +/* + TODO: Modernize the code using C++11/C++14 + 1. constexpr http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2016/p0415r0.html + 2. move constructor +*/ + +namespace std { +/* + Specialize std::complex to zero initialization ap_int. 
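  (An editor-added illustration, not in the original header: with this
  specialization,
      std::complex<ap_int<8> > c;
      c = 1;
  zero-initializes the imaginary part, so c == C(1,0) holds just as it
  would for std::complex<float>, instead of leaving the imaginary part
  indeterminate.)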
+ + To reduce the area cost, ap_int is not zero initialized, just like basic + types float or double. However, libstdc++ provides specialization for float, + double and long double, initializing image part to 0 when not specified. + + This has become a difficulty in switching legacy code from these C types to + ap_int. To ease the tranform of legacy code, we have to implement + specialization of std::complex<> for our type. + + As ap_int is a template, it is impossible to specialize only the methods + that causes default initialization of value type in std::complex<>. An + explicit full specialization of the template class has to be done, covering + all the member functions and operators of std::complex<> as specified + in standard 26.2.4 and 26.2.5. +*/ +template +class complex > { + public: + typedef ap_int<_AP_W> _Tp; + typedef _Tp value_type; + + // 26.2.4/1 + // Constructor without argument + // Default initialize, so that in dataflow, the variable is only written once. + complex() : _M_real(_Tp()), _M_imag(_Tp()) {} + // Constructor with ap_int. + // Zero initialize image part when not specified, so that `C(1) == C(1,0)` + complex(const _Tp &__r, const _Tp &__i = _Tp(0)) + : _M_real(__r), _M_imag(__i) {} + + // Constructor with another complex number + template + complex(const complex<_Up> &__z) : _M_real(__z.real()), _M_imag(__z.imag()) {} + +#if __cplusplus >= 201103L + const _Tp& real() const { return _M_real; } + const _Tp& imag() const { return _M_imag; } +#else + _Tp& real() { return _M_real; } + const _Tp& real() const { return _M_real; } + _Tp& imag() { return _M_imag; } + const _Tp& imag() const { return _M_imag; } +#endif + + void real(_Tp __val) { _M_real = __val; } + + void imag(_Tp __val) { _M_imag = __val; } + + // Assign this complex number with ap_int. + // Zero initialize image poarrt, so that `C c; c = 1; c == C(1,0);` + complex<_Tp> &operator=(const _Tp __t) { + _M_real = __t; + _M_imag = _Tp(0); + return *this; + } + + // 26.2.5/1 + // Add ap_int to this complex number. + complex<_Tp> &operator+=(const _Tp &__t) { + _M_real += __t; + return *this; + } + + // 26.2.5/3 + // Subtract ap_int from this complex number. + complex<_Tp> &operator-=(const _Tp &__t) { + _M_real -= __t; + return *this; + } + + // 26.2.5/5 + // Multiply this complex number by ap_int. + complex<_Tp> &operator*=(const _Tp &__t) { + _M_real *= __t; + _M_imag *= __t; + return *this; + } + + // 26.2.5/7 + // Divide this complex number by ap_int. + complex<_Tp> &operator/=(const _Tp &__t) { + _M_real /= __t; + _M_imag /= __t; + return *this; + } + + // Assign complex number to this complex number. + template + complex<_Tp> &operator=(const complex<_Up> &__z) { + _M_real = __z.real(); + _M_imag = __z.imag(); + return *this; + } + + // 26.2.5/9 + // Add complex number to this. + template + complex<_Tp> &operator+=(const complex<_Up> &__z) { + _M_real += __z.real(); + _M_imag += __z.imag(); + return *this; + } + + // 26.2.5/11 + // Subtract complex number from this. + template + complex<_Tp> &operator-=(const complex<_Up> &__z) { + _M_real -= __z.real(); + _M_imag -= __z.imag(); + return *this; + } + + // 26.2.5/13 + // Multiply this by complex number. + template + complex<_Tp> &operator*=(const complex<_Up> &__z) { + const _Tp __r = _M_real * __z.real() - _M_imag * __z.imag(); + _M_imag = _M_real * __z.imag() + _M_imag * __z.real(); + _M_real = __r; + return *this; + } + + // 26.2.5/15 + // Divide this by complex number. 
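  // Editor's note (not in the original header): the division below uses
  // the conjugate trick, a/z = (a * conj(z)) / (z * conj(z)); since
  // z * conj(z) = re(z)^2 + im(z)^2 is purely real, a single real-valued
  // divide per component suffices.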
+ template + complex<_Tp> &operator/=(const complex<_Up> &__z) { + complex<_Tp> cj (__z.real(), -__z.imag()); + complex<_Tp> a = (*this) * cj; + complex<_Tp> b = cj * __z; + _M_real = a.real() / b.real(); + _M_imag = a.imag() / b.real(); + return *this; + } + + private: + _Tp _M_real; + _Tp _M_imag; + +}; // class complex > + + +/* + Non-member operations + These operations are not required by standard in 26.2.6, but libstdc++ + defines them for + float, double or long double's specialization. +*/ +// Compare complex number with ap_int. +template +inline bool operator==(const complex > &__x, const ap_int<_AP_W> &__y) { + return __x.real() == __y && + __x.imag() == 0; +} + +// Compare ap_int with complex number. +template +inline bool operator==(const ap_int<_AP_W> &__x, const complex > &__y) { + return __x == __y.real() && + 0 == __y.imag(); +} + +// Compare complex number with ap_int. +template +inline bool operator!=(const complex > &__x, const ap_int<_AP_W> &__y) { + return __x.real() != __y || + __x.imag() != 0; +} + +// Compare ap_int with complex number. +template +inline bool operator!=(const ap_int<_AP_W> &__x, const complex > &__y) { + return __x != __y.real() || + 0 != __y.imag(); +} + +} // namespace std + +#endif // ifndef __AP_INT_SPECIAL_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/etc/ap_private.h b/hls4ml/templates/vivado/ap_types/etc/ap_private.h index 7af898d2c5..0c29a0ac1a 100644 --- a/hls4ml/templates/vivado/ap_types/etc/ap_private.h +++ b/hls4ml/templates/vivado/ap_types/etc/ap_private.h @@ -1,7199 +1,7199 @@ -/* - * Copyright 2011-2019 Xilinx, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef __AP_PRIVATE_H__ -#define __AP_PRIVATE_H__ - -// common macros and type declarations are now defined in ap_common.h, and -// ap_private becomes part of it. -#ifndef __AP_COMMON_H__ -#error "etc/ap_private.h cannot be included directly." -#endif - -// forward declarations -//template -//class ap_private; // moved to ap_common.h -template -struct _private_range_ref; -template -struct _private_bit_ref; - -// TODO clean up this part. -#ifndef LLVM_SUPPORT_MATHEXTRAS_H -#define LLVM_SUPPORT_MATHEXTRAS_H - -#ifdef _MSC_VER -#if _MSC_VER <= 1500 -typedef __int8 int8_t; -typedef unsigned __int8 uint8_t; -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -#else -#include -#endif -#else -#include -#endif - -#ifndef INLINE -#define INLINE inline -// Enable to debug ap_int/ap_fixed -// #define INLINE __attribute__((weak)) -#endif - -// NOTE: The following support functions use the _32/_64 extensions instead of -// type overloading so that signed and unsigned integers can be used without -// ambiguity. -namespace AESL_std { -template -DataType INLINE min(DataType a, DataType b) { - return (a >= b) ? b : a; -} - -template -DataType INLINE max(DataType a, DataType b) { - return (a >= b) ? 
a : b; -} -} // namespace AESL_std - -// TODO clean up included headers. -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ap_private_ops { -/// Hi_32 - This function returns the high 32 bits of a 64 bit value. -static INLINE uint32_t Hi_32(uint64_t Value) { - return static_cast(Value >> 32); -} - -/// Lo_32 - This function returns the low 32 bits of a 64 bit value. -static INLINE uint32_t Lo_32(uint64_t Value) { - return static_cast(Value); -} - -template -INLINE bool isNegative(const ap_private<_AP_W, false>& a) { - return false; -} - -template -INLINE bool isNegative(const ap_private<_AP_W, true>& a) { - enum { - APINT_BITS_PER_WORD = 64, - _AP_N = (_AP_W + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD - }; - static const uint64_t sign_mask = 1ULL << ((_AP_W - 1) % APINT_BITS_PER_WORD); - return (sign_mask & a.get_pVal(_AP_N - 1)) != 0; -} - -/// CountLeadingZeros_32 - this function performs the platform optimal form of -/// counting the number of zeros from the most significant bit to the first one -/// bit. Ex. CountLeadingZeros_32(0x00F000FF) == 8. -/// Returns 32 if the word is zero. -static INLINE unsigned CountLeadingZeros_32(uint32_t Value) { - unsigned Count; // result -#if __GNUC__ >= 4 -// PowerPC is defined for __builtin_clz(0) -#if !defined(__ppc__) && !defined(__ppc64__) - if (Value == 0) return 32; -#endif - Count = __builtin_clz(Value); -#else - if (Value == 0) return 32; - Count = 0; - // bisecton method for count leading zeros - for (unsigned Shift = 32 >> 1; Shift; Shift >>= 1) { - uint32_t Tmp = (Value) >> (Shift); - if (Tmp) { - Value = Tmp; - } else { - Count |= Shift; - } - } -#endif - return Count; -} - -/// CountLeadingZeros_64 - This function performs the platform optimal form -/// of counting the number of zeros from the most significant bit to the first -/// one bit (64 bit edition.) -/// Returns 64 if the word is zero. -static INLINE unsigned CountLeadingZeros_64(uint64_t Value) { - unsigned Count; // result -#if __GNUC__ >= 4 -// PowerPC is defined for __builtin_clzll(0) -#if !defined(__ppc__) && !defined(__ppc64__) - if (!Value) return 64; -#endif - Count = __builtin_clzll(Value); -#else - if (sizeof(long) == sizeof(int64_t)) { - if (!Value) return 64; - Count = 0; - // bisecton method for count leading zeros - for (unsigned Shift = 64 >> 1; Shift; Shift >>= 1) { - uint64_t Tmp = (Value) >> (Shift); - if (Tmp) { - Value = Tmp; - } else { - Count |= Shift; - } - } - } else { - // get hi portion - uint32_t Hi = Hi_32(Value); - - // if some bits in hi portion - if (Hi) { - // leading zeros in hi portion plus all bits in lo portion - Count = CountLeadingZeros_32(Hi); - } else { - // get lo portion - uint32_t Lo = Lo_32(Value); - // same as 32 bit value - Count = CountLeadingZeros_32(Lo) + 32; - } - } -#endif - return Count; -} - -/// CountTrailingZeros_64 - This function performs the platform optimal form -/// of counting the number of zeros from the least significant bit to the first -/// one bit (64 bit edition.) -/// Returns 64 if the word is zero. -static INLINE unsigned CountTrailingZeros_64(uint64_t Value) { -#if __GNUC__ >= 4 - return (Value != 0) ? 
__builtin_ctzll(Value) : 64; -#else - static const unsigned Mod67Position[] = { - 64, 0, 1, 39, 2, 15, 40, 23, 3, 12, 16, 59, 41, 19, 24, 54, 4, - 64, 13, 10, 17, 62, 60, 28, 42, 30, 20, 51, 25, 44, 55, 47, 5, 32, - 65, 38, 14, 22, 11, 58, 18, 53, 63, 9, 61, 27, 29, 50, 43, 46, 31, - 37, 21, 57, 52, 8, 26, 49, 45, 36, 56, 7, 48, 35, 6, 34, 33, 0}; - return Mod67Position[(uint64_t)(-(int64_t)Value & (int64_t)Value) % 67]; -#endif -} - -/// CountPopulation_64 - this function counts the number of set bits in a value, -/// (64 bit edition.) -static INLINE unsigned CountPopulation_64(uint64_t Value) { -#if __GNUC__ >= 4 - return __builtin_popcountll(Value); -#else - uint64_t v = Value - (((Value) >> 1) & 0x5555555555555555ULL); - v = (v & 0x3333333333333333ULL) + (((v) >> 2) & 0x3333333333333333ULL); - v = (v + ((v) >> 4)) & 0x0F0F0F0F0F0F0F0FULL; - return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); -#endif -} - -static INLINE uint32_t countLeadingOnes_64(uint64_t __V, uint32_t skip) { - uint32_t Count = 0; - if (skip) (__V) <<= (skip); - while (__V && (__V & (1ULL << 63))) { - Count++; - (__V) <<= 1; - } - return Count; -} - -static INLINE std::string oct2Bin(char oct) { - switch (oct) { - case '\0': { - return ""; - } - case '.': { - return "."; - } - case '0': { - return "000"; - } - case '1': { - return "001"; - } - case '2': { - return "010"; - } - case '3': { - return "011"; - } - case '4': { - return "100"; - } - case '5': { - return "101"; - } - case '6': { - return "110"; - } - case '7': { - return "111"; - } - } - assert(0 && "Invalid character in digit string"); - return ""; -} - -static INLINE std::string hex2Bin(char hex) { - switch (hex) { - case '\0': { - return ""; - } - case '.': { - return "."; - } - case '0': { - return "0000"; - } - case '1': { - return "0001"; - } - case '2': { - return "0010"; - } - case '3': { - return "0011"; - } - case '4': { - return "0100"; - } - case '5': { - return "0101"; - } - case '6': { - return "0110"; - } - case '7': { - return "0111"; - } - case '8': { - return "1000"; - } - case '9': { - return "1001"; - } - case 'A': - case 'a': { - return "1010"; - } - case 'B': - case 'b': { - return "1011"; - } - case 'C': - case 'c': { - return "1100"; - } - case 'D': - case 'd': { - return "1101"; - } - case 'E': - case 'e': { - return "1110"; - } - case 'F': - case 'f': { - return "1111"; - } - } - assert(0 && "Invalid character in digit string"); - return ""; -} - -static INLINE uint32_t decode_digit(char cdigit, int radix) { - uint32_t digit = 0; - if (radix == 16) { -#define isxdigit(c) \ - (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') || \ - ((c) >= 'A' && (c) <= 'F')) -#define isdigit(c) ((c) >= '0' && (c) <= '9') - if (!isxdigit(cdigit)) assert(0 && "Invalid hex digit in string"); - if (isdigit(cdigit)) - digit = cdigit - '0'; - else if (cdigit >= 'a') - digit = cdigit - 'a' + 10; - else if (cdigit >= 'A') - digit = cdigit - 'A' + 10; - else - assert(0 && "huh? we shouldn't get here"); - } else if (isdigit(cdigit)) { - digit = cdigit - '0'; - } else { - assert(0 && "Invalid character in digit string"); - } -#undef isxdigit -#undef isdigit - return digit; -} - -// Determine the radix of "val". 
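The parseString helper that follows begins by detecting a radix prefix on the literal. A standalone, editor-added sketch of those prefix rules (simplified: sign handling and exponent parsing are omitted, and the names here are illustrative):

    #include <cassert>
    #include <cstddef>
    #include <string>

    // "0x"/"0X" -> 16, "0b"/"0B" -> 2, "0o"/"0O" -> 8, otherwise 10;
    // pos is advanced past any prefix that was consumed.
    static unsigned detect_radix(const std::string& s, std::size_t& pos) {
        if (s.compare(pos, 2, "0x") == 0 || s.compare(pos, 2, "0X") == 0) { pos += 2; return 16; }
        if (s.compare(pos, 2, "0b") == 0 || s.compare(pos, 2, "0B") == 0) { pos += 2; return 2; }
        if (s.compare(pos, 2, "0o") == 0 || s.compare(pos, 2, "0O") == 0) { pos += 2; return 8; }
        return 10;
    }

    int main() {
        std::size_t p = 0;
        assert(detect_radix("0x1F", p) == 16 && p == 2);
        p = 0;
        assert(detect_radix("42", p) == 10 && p == 0);
        return 0;
    }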
-static INLINE std::string parseString(const std::string& input, unsigned char& radix) { - size_t len = input.length(); - if (len == 0) { - if (radix == 0) radix = 10; - return input; - } - - size_t startPos = 0; - // Trim whitespace - while (input[startPos] == ' ' && startPos < len) startPos++; - while (input[len - 1] == ' ' && startPos < len) len--; - - std::string val = input.substr(startPos, len - startPos); - // std::cout << "val = " << val << "\n"; - len = val.length(); - startPos = 0; - - // If the length of the string is less than 2, then radix - // is decimal and there is no exponent. - if (len < 2) { - if (radix == 0) radix = 10; - return val; - } - - bool isNegative = false; - std::string ans; - - // First check to see if we start with a sign indicator - if (val[0] == '-') { - ans = "-"; - ++startPos; - isNegative = true; - } else if (val[0] == '+') - ++startPos; - - if (len - startPos < 2) { - if (radix == 0) radix = 10; - return val; - } - - if (val.substr(startPos, 2) == "0x" || val.substr(startPos, 2) == "0X") { - // If we start with "0x", then the radix is hex. - radix = 16; - startPos += 2; - } else if (val.substr(startPos, 2) == "0b" || - val.substr(startPos, 2) == "0B") { - // If we start with "0b", then the radix is binary. - radix = 2; - startPos += 2; - } else if (val.substr(startPos, 2) == "0o" || - val.substr(startPos, 2) == "0O") { - // If we start with "0o", then the radix is octal. - radix = 8; - startPos += 2; - } else if (radix == 0) { - radix = 10; - } - - int exp = 0; - if (radix == 10) { - // If radix is decimal, then see if there is an - // exponent indicator. - size_t expPos = val.find('e'); - bool has_exponent = true; - if (expPos == std::string::npos) expPos = val.find('E'); - if (expPos == std::string::npos) { - // No exponent indicator, so the mantissa goes to the end. - expPos = len; - has_exponent = false; - } - // std::cout << "startPos = " << startPos << " " << expPos << "\n"; - - ans += val.substr(startPos, expPos - startPos); - if (has_exponent) { - // Parse the exponent. - std::istringstream iss(val.substr(expPos + 1, len - expPos - 1)); - iss >> exp; - } - } else { - // Check for a binary exponent indicator. - size_t expPos = val.find('p'); - bool has_exponent = true; - if (expPos == std::string::npos) expPos = val.find('P'); - if (expPos == std::string::npos) { - // No exponent indicator, so the mantissa goes to the end. - expPos = len; - has_exponent = false; - } - - // std::cout << "startPos = " << startPos << " " << expPos << "\n"; - - assert(startPos <= expPos); - // Convert to binary as we go. - for (size_t i = startPos; i < expPos; ++i) { - if (radix == 16) { - ans += hex2Bin(val[i]); - } else if (radix == 8) { - ans += oct2Bin(val[i]); - } else { // radix == 2 - ans += val[i]; - } - } - // End in binary - radix = 2; - if (has_exponent) { - // Parse the exponent. 
- std::istringstream iss(val.substr(expPos + 1, len - expPos - 1)); - iss >> exp; - } - } - if (exp == 0) return ans; - - size_t decPos = ans.find('.'); - if (decPos == std::string::npos) decPos = ans.length(); - if ((int)decPos + exp >= (int)ans.length()) { - int i = decPos; - for (; i < (int)ans.length() - 1; ++i) ans[i] = ans[i + 1]; - for (; i < (int)ans.length(); ++i) ans[i] = '0'; - for (; i < (int)decPos + exp; ++i) ans += '0'; - return ans; - } else if ((int)decPos + exp < (int)isNegative) { - std::string dupAns = "0."; - if (ans[0] == '-') dupAns = "-0."; - for (int i = 0; i < isNegative - (int)decPos - exp; ++i) dupAns += '0'; - for (size_t i = isNegative; i < ans.length(); ++i) - if (ans[i] != '.') dupAns += ans[i]; - return dupAns; - } - - if (exp > 0) - for (size_t i = decPos; i < decPos + exp; ++i) ans[i] = ans[i + 1]; - else { - if (decPos == ans.length()) ans += ' '; - for (int i = decPos; i > (int)decPos + exp; --i) ans[i] = ans[i - 1]; - } - ans[decPos + exp] = '.'; - return ans; -} - -/// sub_1 - This function subtracts a single "digit" (64-bit word), y, from -/// the multi-digit integer array, x[], propagating the borrowed 1 value until -/// no further borrowing is neeeded or it runs out of "digits" in x. The result -/// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted. -/// In other words, if y > x then this function returns 1, otherwise 0. -/// @returns the borrow out of the subtraction -static INLINE bool sub_1(uint64_t x[], uint32_t len, uint64_t y) { - for (uint32_t i = 0; i < len; ++i) { - uint64_t __X = x[i]; - x[i] -= y; - if (y > __X) - y = 1; // We have to "borrow 1" from next "digit" - else { - y = 0; // No need to borrow - break; // Remaining digits are unchanged so exit early - } - } - return (y != 0); -} - -/// add_1 - This function adds a single "digit" integer, y, to the multiple -/// "digit" integer array, x[]. x[] is modified to reflect the addition and -/// 1 is returned if there is a carry out, otherwise 0 is returned. -/// @returns the carry of the addition. -static INLINE bool add_1(uint64_t dest[], uint64_t x[], uint32_t len, - uint64_t y) { - for (uint32_t i = 0; i < len; ++i) { - dest[i] = y + x[i]; - if (dest[i] < y) - y = 1; // Carry one to next digit. - else { - y = 0; // No need to carry so exit early - break; - } - } - return (y != 0); -} - -/// add - This function adds the integer array x to the integer array Y and -/// places the result in dest. -/// @returns the carry out from the addition -/// @brief General addition of 64-bit integer arrays -static INLINE bool add(uint64_t* dest, const uint64_t* x, const uint64_t* y, - uint32_t destlen, uint32_t xlen, uint32_t ylen, - bool xsigned, bool ysigned) { - bool carry = false; - uint32_t len = AESL_std::min(xlen, ylen); - uint32_t i; - for (i = 0; i < len && i < destlen; ++i) { - uint64_t limit = - AESL_std::min(x[i], y[i]); // must come first in case dest == x - dest[i] = x[i] + y[i] + carry; - carry = dest[i] < limit || (carry && dest[i] == limit); - } - if (xlen > ylen) { - const uint64_t yext = ysigned && int64_t(y[ylen - 1]) < 0 ? -1 : 0; - for (i = ylen; i < xlen && i < destlen; i++) { - uint64_t limit = AESL_std::min(x[i], yext); - dest[i] = x[i] + yext + carry; - carry = (dest[i] < limit) || (carry && dest[i] == limit); - } - } else if (ylen > xlen) { - const uint64_t xext = xsigned && int64_t(x[xlen - 1]) < 0 ? 
-1 : 0; - for (i = xlen; i < ylen && i < destlen; i++) { - uint64_t limit = AESL_std::min(xext, y[i]); - dest[i] = xext + y[i] + carry; - carry = (dest[i] < limit) || (carry && dest[i] == limit); - } - } - return carry; -} - -/// @returns returns the borrow out. -/// @brief Generalized subtraction of 64-bit integer arrays. -static INLINE bool sub(uint64_t* dest, const uint64_t* x, const uint64_t* y, - uint32_t destlen, uint32_t xlen, uint32_t ylen, - bool xsigned, bool ysigned) { - bool borrow = false; - uint32_t i; - uint32_t len = AESL_std::min(xlen, ylen); - for (i = 0; i < len && i < destlen; ++i) { - uint64_t x_tmp = borrow ? x[i] - 1 : x[i]; - borrow = y[i] > x_tmp || (borrow && x[i] == 0); - dest[i] = x_tmp - y[i]; - } - if (xlen > ylen) { - const uint64_t yext = ysigned && int64_t(y[ylen - 1]) < 0 ? -1 : 0; - for (i = ylen; i < xlen && i < destlen; i++) { - uint64_t x_tmp = borrow ? x[i] - 1 : x[i]; - borrow = yext > x_tmp || (borrow && x[i] == 0); - dest[i] = x_tmp - yext; - } - } else if (ylen > xlen) { - const uint64_t xext = xsigned && int64_t(x[xlen - 1]) < 0 ? -1 : 0; - for (i = xlen; i < ylen && i < destlen; i++) { - uint64_t x_tmp = borrow ? xext - 1 : xext; - borrow = y[i] > x_tmp || (borrow && xext == 0); - dest[i] = x_tmp - y[i]; - } - } - return borrow; -} - -/// Subtracts the RHS ap_private from this ap_private -/// @returns this, after subtraction -/// @brief Subtraction assignment operator. - -/// Multiplies an integer array, x by a a uint64_t integer and places the result -/// into dest. -/// @returns the carry out of the multiplication. -/// @brief Multiply a multi-digit ap_private by a single digit (64-bit) integer. -static INLINE uint64_t mul_1(uint64_t dest[], const uint64_t x[], uint32_t len, - uint64_t y) { - // Split y into high 32-bit part (hy) and low 32-bit part (ly) - uint64_t ly = y & 0xffffffffULL, hy = (y) >> 32; - uint64_t carry = 0; - static const uint64_t two_power_32 = 1ULL << 32; - // For each digit of x. - for (uint32_t i = 0; i < len; ++i) { - // Split x into high and low words - uint64_t lx = x[i] & 0xffffffffULL; - uint64_t hx = (x[i]) >> 32; - // hasCarry - A flag to indicate if there is a carry to the next digit. - // hasCarry == 0, no carry - // hasCarry == 1, has carry - // hasCarry == 2, no carry and the calculation result == 0. - uint8_t hasCarry = 0; - dest[i] = carry + lx * ly; - // Determine if the add above introduces carry. - hasCarry = (dest[i] < carry) ? 1 : 0; - carry = hx * ly + ((dest[i]) >> 32) + (hasCarry ? two_power_32 : 0); - // The upper limit of carry can be (2^32 - 1)(2^32 - 1) + - // (2^32 - 1) + 2^32 = 2^64. - hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0); - - carry += (lx * hy) & 0xffffffffULL; - dest[i] = ((carry) << 32) | (dest[i] & 0xffffffffULL); - carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? two_power_32 : 0) + - ((carry) >> 32) + ((lx * hy) >> 32) + hx * hy; - } - return carry; -} - -/// Multiplies integer array x by integer array y and stores the result into -/// the integer array dest. Note that dest's size must be >= xlen + ylen in -/// order to -/// do a full precision computation. If it is not, then only the low-order words -/// are returned. -/// @brief Generalized multiplicate of integer arrays. 
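The doc comment above summarizes mul_1's strategy; before the generalized mul that follows, here is a self-contained, editor-added sketch of the same 32-bit splitting technique, computing a full 64x64 -> 128-bit product from four 32x32 -> 64-bit partials (names are illustrative):

    #include <cassert>
    #include <cstdint>

    static void mul_64x64(uint64_t x, uint64_t y, uint64_t& hi, uint64_t& lo) {
        uint64_t lx = x & 0xffffffffULL, hx = x >> 32;
        uint64_t ly = y & 0xffffffffULL, hy = y >> 32;
        uint64_t ll = lx * ly;   // bits 0..63 of the partial sums
        uint64_t lh = lx * hy;   // contributes at bit 32
        uint64_t hl = hx * ly;   // contributes at bit 32
        uint64_t hh = hx * hy;   // contributes at bit 64
        // Middle column: cannot overflow 64 bits (max < 2^34).
        uint64_t mid = (ll >> 32) + (lh & 0xffffffffULL) + (hl & 0xffffffffULL);
        lo = (mid << 32) | (ll & 0xffffffffULL);
        hi = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
    }

    int main() {
        uint64_t hi, lo;
        mul_64x64(0xffffffffffffffffULL, 2, hi, lo);  // (2^64 - 1) * 2
        assert(hi == 1 && lo == 0xfffffffffffffffeULL);
        return 0;
    }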
-static INLINE void mul(uint64_t dest[], const uint64_t x[], uint32_t xlen, - const uint64_t y[], uint32_t ylen, uint32_t destlen) { - assert(xlen > 0); - assert(ylen > 0); - assert(destlen >= xlen + ylen); - if (xlen < destlen) dest[xlen] = mul_1(dest, x, xlen, y[0]); - for (uint32_t i = 1; i < ylen; ++i) { - uint64_t ly = y[i] & 0xffffffffULL, hy = (y[i]) >> 32; - uint64_t carry = 0, lx = 0, hx = 0; - for (uint32_t j = 0; j < xlen; ++j) { - lx = x[j] & 0xffffffffULL; - hx = (x[j]) >> 32; - // hasCarry - A flag to indicate if has carry. - // hasCarry == 0, no carry - // hasCarry == 1, has carry - // hasCarry == 2, no carry and the calculation result == 0. - uint8_t hasCarry = 0; - uint64_t resul = carry + lx * ly; - hasCarry = (resul < carry) ? 1 : 0; - carry = (hasCarry ? (1ULL << 32) : 0) + hx * ly + ((resul) >> 32); - hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0); - carry += (lx * hy) & 0xffffffffULL; - resul = ((carry) << 32) | (resul & 0xffffffffULL); - if (i + j < destlen) dest[i + j] += resul; - carry = - (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0) + - ((carry) >> 32) + (dest[i + j] < resul ? 1 : 0) + ((lx * hy) >> 32) + - hx * hy; - } - if (i + xlen < destlen) dest[i + xlen] = carry; - } -} - -/// Implementation of Knuth's Algorithm D (Division of nonnegative integers) -/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The -/// variables here have the same names as in the algorithm. Comments explain -/// the algorithm and any deviation from it. -static INLINE void KnuthDiv(uint32_t* u, uint32_t* v, uint32_t* q, uint32_t* r, - uint32_t m, uint32_t n) { - assert(u && "Must provide dividend"); - assert(v && "Must provide divisor"); - assert(q && "Must provide quotient"); - assert(u != v && u != q && v != q && "Must us different memory"); - assert(n > 1 && "n must be > 1"); - - // Knuth uses the value b as the base of the number system. In our case b - // is 2^31 so we just set it to -1u. - uint64_t b = uint64_t(1) << 32; - - // DEBUG(cerr << "KnuthDiv: m=" << m << " n=" << n << '\n'); - // DEBUG(cerr << "KnuthDiv: original:"); - // DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << - // u[i]); - // DEBUG(cerr << " by"); - // DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << - // v[i-1]); - // DEBUG(cerr << '\n'); - // D1. [Normalize.] Set d = b / (v[n-1] + 1) and multiply all the digits of - // u and v by d. Note that we have taken Knuth's advice here to use a power - // of 2 value for d such that d * v[n-1] >= b/2 (b is the base). A power of - // 2 allows us to shift instead of multiply and it is easy to determine the - // shift amount from the leading zeros. We are basically normalizing the u - // and v so that its high bits are shifted to the top of v's range without - // overflow. Note that this can require an extra word in u so that u must - // be of length m+n+1. 
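// Editor's illustration (not in the original source): if v[n-1] = 0x0000ffff,
// CountLeadingZeros_32 gives shift = 16; scaling u and v by 2^16 makes the
// top divisor digit 0xffff0000 >= b/2. The quotient is unchanged by this
// scaling, and step D8 shifts the remainder back down by the same amount.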
- uint32_t shift = CountLeadingZeros_32(v[n - 1]); - uint32_t v_carry = 0; - uint32_t u_carry = 0; - if (shift) { - for (uint32_t i = 0; i < m + n; ++i) { - uint32_t u_tmp = (u[i]) >> (32 - shift); - u[i] = ((u[i]) << (shift)) | u_carry; - u_carry = u_tmp; - } - for (uint32_t i = 0; i < n; ++i) { - uint32_t v_tmp = (v[i]) >> (32 - shift); - v[i] = ((v[i]) << (shift)) | v_carry; - v_carry = v_tmp; - } - } - u[m + n] = u_carry; - // DEBUG(cerr << "KnuthDiv: normal:"); - // DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << - // u[i]); - // DEBUG(cerr << " by"); - // DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << - // v[i-1]); - // DEBUG(cerr << '\n'); - - // D2. [Initialize j.] Set j to m. This is the loop counter over the places. - int j = m; - do { - // DEBUG(cerr << "KnuthDiv: quotient digit #" << j << '\n'); - // D3. [Calculate q'.]. - // Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q') - // Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r') - // Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease - // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test - // on v[n-2] determines at high speed most of the cases in which the trial - // value qp is one too large, and it eliminates all cases where qp is two - // too large. - uint64_t dividend = ((uint64_t(u[j + n]) << 32) + u[j + n - 1]); - // DEBUG(cerr << "KnuthDiv: dividend == " << dividend << '\n'); - uint64_t qp = dividend / v[n - 1]; - uint64_t rp = dividend % v[n - 1]; - if (qp == b || qp * v[n - 2] > b * rp + u[j + n - 2]) { - qp--; - rp += v[n - 1]; - if (rp < b && (qp == b || qp * v[n - 2] > b * rp + u[j + n - 2])) qp--; - } - // DEBUG(cerr << "KnuthDiv: qp == " << qp << ", rp == " << rp << '\n'); - - // D4. [Multiply and subtract.] Replace (u[j+n]u[j+n-1]...u[j]) with - // (u[j+n]u[j+n-1]..u[j]) - qp * (v[n-1]...v[1]v[0]). This computation - // consists of a simple multiplication by a one-place number, combined with - // a subtraction. - bool isNeg = false; - for (uint32_t i = 0; i < n; ++i) { - uint64_t u_tmp = uint64_t(u[j + i]) | ((uint64_t(u[j + i + 1])) << 32); - uint64_t subtrahend = uint64_t(qp) * uint64_t(v[i]); - bool borrow = subtrahend > u_tmp; - /*DEBUG(cerr << "KnuthDiv: u_tmp == " << u_tmp - << ", subtrahend == " << subtrahend - << ", borrow = " << borrow << '\n');*/ - - uint64_t result = u_tmp - subtrahend; - uint32_t k = j + i; - u[k++] = (uint32_t)(result & (b - 1)); // subtract low word - u[k++] = (uint32_t)((result) >> 32); // subtract high word - while (borrow && k <= m + n) { // deal with borrow to the left - borrow = u[k] == 0; - u[k]--; - k++; - } - isNeg |= borrow; - /*DEBUG(cerr << "KnuthDiv: u[j+i] == " << u[j+i] << ", u[j+i+1] == " << - u[j+i+1] << '\n');*/ - } - /*DEBUG(cerr << "KnuthDiv: after subtraction:"); - DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]); - DEBUG(cerr << '\n');*/ - // The digits (u[j+n]...u[j]) should be kept positive; if the result of - // this step is actually negative, (u[j+n]...u[j]) should be left as the - // true value plus b**(n+1), namely as the b's complement of - // the true value, and a "borrow" to the left should be remembered. 
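// Editor's illustration (not in the original source): in base b = 2^32,
// if the true result of step D4 were -1, the digits u[j..j+n] would hold
// b^(n+1) - 1, i.e. all 0xffffffff words, with the remembered borrow
// signalling that step D6 must add v back and decrement the quotient digit.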
- // - if (isNeg) { - bool carry = true; // true because b's complement is "complement + 1" - for (uint32_t i = 0; i <= m + n; ++i) { - u[i] = ~u[i] + carry; // b's complement - carry = carry && u[i] == 0; - } - } - /*DEBUG(cerr << "KnuthDiv: after complement:"); - DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]); - DEBUG(cerr << '\n');*/ - - // D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was - // negative, go to step D6; otherwise go on to step D7. - q[j] = (uint32_t)qp; - if (isNeg) { - // D6. [Add back]. The probability that this step is necessary is very - // small, on the order of only 2/b. Make sure that test data accounts for - // this possibility. Decrease q[j] by 1 - q[j]--; - // and add (0v[n-1]...v[1]v[0]) to (u[j+n]u[j+n-1]...u[j+1]u[j]). - // A carry will occur to the left of u[j+n], and it should be ignored - // since it cancels with the borrow that occurred in D4. - bool carry = false; - for (uint32_t i = 0; i < n; i++) { - uint32_t limit = AESL_std::min(u[j + i], v[i]); - u[j + i] += v[i] + carry; - carry = u[j + i] < limit || (carry && u[j + i] == limit); - } - u[j + n] += carry; - } - /*DEBUG(cerr << "KnuthDiv: after correction:"); - DEBUG(for (int i = m+n; i >=0; i--) cerr <<" " << u[i]); - DEBUG(cerr << "\nKnuthDiv: digit result = " << q[j] << '\n');*/ - - // D7. [Loop on j.] Decrease j by one. Now if j >= 0, go back to D3. - } while (--j >= 0); - - /*DEBUG(cerr << "KnuthDiv: quotient:"); - DEBUG(for (int i = m; i >=0; i--) cerr <<" " << q[i]); - DEBUG(cerr << '\n');*/ - - // D8. [Unnormalize]. Now q[...] is the desired quotient, and the desired - // remainder may be obtained by dividing u[...] by d. If r is non-null we - // compute the remainder (urem uses this). - if (r) { - // The value d is expressed by the "shift" value above since we avoided - // multiplication by d by using a shift left. So, all we have to do is - // shift right here. In order to mak - if (shift) { - uint32_t carry = 0; - // DEBUG(cerr << "KnuthDiv: remainder:"); - for (int i = n - 1; i >= 0; i--) { - r[i] = ((u[i]) >> (shift)) | carry; - carry = (u[i]) << (32 - shift); - // DEBUG(cerr << " " << r[i]); - } - } else { - for (int i = n - 1; i >= 0; i--) { - r[i] = u[i]; - // DEBUG(cerr << " " << r[i]); - } - } - // DEBUG(cerr << '\n'); - } - // DEBUG(cerr << std::setbase(10) << '\n'); -} - -template -void divide(const ap_private<_AP_W, _AP_S>& LHS, uint32_t lhsWords, - const ap_private<_AP_W, _AP_S>& RHS, uint32_t rhsWords, - ap_private<_AP_W, _AP_S>* Quotient, - ap_private<_AP_W, _AP_S>* Remainder) { - assert(lhsWords >= rhsWords && "Fractional result"); - enum { APINT_BITS_PER_WORD = 64 }; - // First, compose the values into an array of 32-bit words instead of - // 64-bit words. This is a necessity of both the "short division" algorithm - // and the the Knuth "classical algorithm" which requires there to be native - // operations for +, -, and * on an m bit value with an m*2 bit result. We - // can't use 64-bit operands here because we don't have native results of - // 128-bits. Furthremore, casting the 64-bit values to 32-bit values won't - // work on large-endian machines. - uint64_t mask = ~0ull >> (sizeof(uint32_t) * 8); - uint32_t n = rhsWords * 2; - uint32_t m = (lhsWords * 2) - n; - - // Allocate space for the temporary values we need either on the stack, if - // it will fit, or on the heap if it won't. - uint32_t SPACE[128]; - uint32_t* __U = 0; - uint32_t* __V = 0; - uint32_t* __Q = 0; - uint32_t* __R = 0; - if ((Remainder ? 
4 : 3) * n + 2 * m + 1 <= 128) { - __U = &SPACE[0]; - __V = &SPACE[m + n + 1]; - __Q = &SPACE[(m + n + 1) + n]; - if (Remainder) __R = &SPACE[(m + n + 1) + n + (m + n)]; - } else { - __U = new uint32_t[m + n + 1]; - __V = new uint32_t[n]; - __Q = new uint32_t[m + n]; - if (Remainder) __R = new uint32_t[n]; - } - - // Initialize the dividend - memset(__U, 0, (m + n + 1) * sizeof(uint32_t)); - for (unsigned i = 0; i < lhsWords; ++i) { - uint64_t tmp = LHS.get_pVal(i); - __U[i * 2] = (uint32_t)(tmp & mask); - __U[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8); - } - __U[m + n] = 0; // this extra word is for "spill" in the Knuth algorithm. - - // Initialize the divisor - memset(__V, 0, (n) * sizeof(uint32_t)); - for (unsigned i = 0; i < rhsWords; ++i) { - uint64_t tmp = RHS.get_pVal(i); - __V[i * 2] = (uint32_t)(tmp & mask); - __V[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8); - } - - // initialize the quotient and remainder - memset(__Q, 0, (m + n) * sizeof(uint32_t)); - if (Remainder) memset(__R, 0, n * sizeof(uint32_t)); - - // Now, adjust m and n for the Knuth division. n is the number of words in - // the divisor. m is the number of words by which the dividend exceeds the - // divisor (i.e. m+n is the length of the dividend). These sizes must not - // contain any zero words or the Knuth algorithm fails. - for (unsigned i = n; i > 0 && __V[i - 1] == 0; i--) { - n--; - m++; - } - for (unsigned i = m + n; i > 0 && __U[i - 1] == 0; i--) m--; - - // If we're left with only a single word for the divisor, Knuth doesn't work - // so we implement the short division algorithm here. This is much simpler - // and faster because we are certain that we can divide a 64-bit quantity - // by a 32-bit quantity at hardware speed and short division is simply a - // series of such operations. This is just like doing short division but we - // are using base 2^32 instead of base 10. - assert(n != 0 && "Divide by zero?"); - if (n == 1) { - uint32_t divisor = __V[0]; - uint32_t remainder = 0; - for (int i = m + n - 1; i >= 0; i--) { - uint64_t partial_dividend = (uint64_t(remainder)) << 32 | __U[i]; - if (partial_dividend == 0) { - __Q[i] = 0; - remainder = 0; - } else if (partial_dividend < divisor) { - __Q[i] = 0; - remainder = (uint32_t)partial_dividend; - } else if (partial_dividend == divisor) { - __Q[i] = 1; - remainder = 0; - } else { - __Q[i] = (uint32_t)(partial_dividend / divisor); - remainder = (uint32_t)(partial_dividend - (__Q[i] * divisor)); - } - } - if (__R) __R[0] = remainder; - } else { - // Now we're ready to invoke the Knuth classical divide algorithm. In this - // case n > 1. - KnuthDiv(__U, __V, __Q, __R, m, n); - } - - // If the caller wants the quotient - if (Quotient) { - // Set up the Quotient value's memory. - if (Quotient->BitWidth != LHS.BitWidth) { - if (Quotient->isSingleWord()) Quotient->set_VAL(0); - } else - Quotient->clear(); - - // The quotient is in Q. Reconstitute the quotient into Quotient's low - // order words. - if (lhsWords == 1) { - uint64_t tmp = - uint64_t(__Q[0]) | ((uint64_t(__Q[1])) << (APINT_BITS_PER_WORD / 2)); - Quotient->set_VAL(tmp); - } else { - assert(!Quotient->isSingleWord() && - "Quotient ap_private not large enough"); - for (unsigned i = 0; i < lhsWords; ++i) - Quotient->set_pVal( - i, uint64_t(__Q[i * 2]) | - ((uint64_t(__Q[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); - } - Quotient->clearUnusedBits(); - } - - // If the caller wants the remainder - if (Remainder) { - // Set up the Remainder value's memory. 
- if (Remainder->BitWidth != RHS.BitWidth) { - if (Remainder->isSingleWord()) Remainder->set_VAL(0); - } else - Remainder->clear(); - - // The remainder is in R. Reconstitute the remainder into Remainder's low - // order words. - if (rhsWords == 1) { - uint64_t tmp = - uint64_t(__R[0]) | ((uint64_t(__R[1])) << (APINT_BITS_PER_WORD / 2)); - Remainder->set_VAL(tmp); - } else { - assert(!Remainder->isSingleWord() && - "Remainder ap_private not large enough"); - for (unsigned i = 0; i < rhsWords; ++i) - Remainder->set_pVal( - i, uint64_t(__R[i * 2]) | - ((uint64_t(__R[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); - } - Remainder->clearUnusedBits(); - } - - // Clean up the memory we allocated. - if (__U != &SPACE[0]) { - delete[] __U; - delete[] __V; - delete[] __Q; - delete[] __R; - } -} - -template -void divide(const ap_private<_AP_W, _AP_S>& LHS, uint32_t lhsWords, - uint64_t RHS, ap_private<_AP_W, _AP_S>* Quotient, - ap_private<_AP_W, _AP_S>* Remainder) { - uint32_t rhsWords = 1; - assert(lhsWords >= rhsWords && "Fractional result"); - enum { APINT_BITS_PER_WORD = 64 }; - // First, compose the values into an array of 32-bit words instead of - // 64-bit words. This is a necessity of both the "short division" algorithm - // and the the Knuth "classical algorithm" which requires there to be native - // operations for +, -, and * on an m bit value with an m*2 bit result. We - // can't use 64-bit operands here because we don't have native results of - // 128-bits. Furthremore, casting the 64-bit values to 32-bit values won't - // work on large-endian machines. - uint64_t mask = ~0ull >> (sizeof(uint32_t) * 8); - uint32_t n = 2; - uint32_t m = (lhsWords * 2) - n; - - // Allocate space for the temporary values we need either on the stack, if - // it will fit, or on the heap if it won't. - uint32_t SPACE[128]; - uint32_t* __U = 0; - uint32_t* __V = 0; - uint32_t* __Q = 0; - uint32_t* __R = 0; - if ((Remainder ? 4 : 3) * n + 2 * m + 1 <= 128) { - __U = &SPACE[0]; - __V = &SPACE[m + n + 1]; - __Q = &SPACE[(m + n + 1) + n]; - if (Remainder) __R = &SPACE[(m + n + 1) + n + (m + n)]; - } else { - __U = new uint32_t[m + n + 1]; - __V = new uint32_t[n]; - __Q = new uint32_t[m + n]; - if (Remainder) __R = new uint32_t[n]; - } - - // Initialize the dividend - memset(__U, 0, (m + n + 1) * sizeof(uint32_t)); - for (unsigned i = 0; i < lhsWords; ++i) { - uint64_t tmp = LHS.get_pVal(i); - __U[i * 2] = tmp & mask; - __U[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8); - } - __U[m + n] = 0; // this extra word is for "spill" in the Knuth algorithm. - - // Initialize the divisor - memset(__V, 0, (n) * sizeof(uint32_t)); - __V[0] = RHS & mask; - __V[1] = (RHS) >> (sizeof(uint32_t) * 8); - - // initialize the quotient and remainder - memset(__Q, 0, (m + n) * sizeof(uint32_t)); - if (Remainder) memset(__R, 0, n * sizeof(uint32_t)); - - // Now, adjust m and n for the Knuth division. n is the number of words in - // the divisor. m is the number of words by which the dividend exceeds the - // divisor (i.e. m+n is the length of the dividend). These sizes must not - // contain any zero words or the Knuth algorithm fails. - for (unsigned i = n; i > 0 && __V[i - 1] == 0; i--) { - n--; - m++; - } - for (unsigned i = m + n; i > 0 && __U[i - 1] == 0; i--) m--; - - // If we're left with only a single word for the divisor, Knuth doesn't work - // so we implement the short division algorithm here. 
This is much simpler - // and faster because we are certain that we can divide a 64-bit quantity - // by a 32-bit quantity at hardware speed and short division is simply a - // series of such operations. This is just like doing short division but we - // are using base 2^32 instead of base 10. - assert(n != 0 && "Divide by zero?"); - if (n == 1) { - uint32_t divisor = __V[0]; - uint32_t remainder = 0; - for (int i = m + n - 1; i >= 0; i--) { - uint64_t partial_dividend = (uint64_t(remainder)) << 32 | __U[i]; - if (partial_dividend == 0) { - __Q[i] = 0; - remainder = 0; - } else if (partial_dividend < divisor) { - __Q[i] = 0; - remainder = partial_dividend; - } else if (partial_dividend == divisor) { - __Q[i] = 1; - remainder = 0; - } else { - __Q[i] = partial_dividend / divisor; - remainder = partial_dividend - (__Q[i] * divisor); - } - } - if (__R) __R[0] = remainder; - } else { - // Now we're ready to invoke the Knuth classical divide algorithm. In this - // case n > 1. - KnuthDiv(__U, __V, __Q, __R, m, n); - } - - // If the caller wants the quotient - if (Quotient) { - // Set up the Quotient value's memory. - if (Quotient->BitWidth != LHS.BitWidth) { - if (Quotient->isSingleWord()) Quotient->set_VAL(0); - } else - Quotient->clear(); - - // The quotient is in Q. Reconstitute the quotient into Quotient's low - // order words. - if (lhsWords == 1) { - uint64_t tmp = - uint64_t(__Q[0]) | ((uint64_t(__Q[1])) << (APINT_BITS_PER_WORD / 2)); - Quotient->set_VAL(tmp); - } else { - assert(!Quotient->isSingleWord() && - "Quotient ap_private not large enough"); - for (unsigned i = 0; i < lhsWords; ++i) - Quotient->set_pVal( - i, uint64_t(__Q[i * 2]) | - ((uint64_t(__Q[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); - } - Quotient->clearUnusedBits(); - } - - // If the caller wants the remainder - if (Remainder) { - // Set up the Remainder value's memory. - if (Remainder->BitWidth != 64 /* RHS.BitWidth */) { - if (Remainder->isSingleWord()) Remainder->set_VAL(0); - } else - Remainder->clear(); - - // The remainder is in __R. Reconstitute the remainder into Remainder's low - // order words. - if (rhsWords == 1) { - uint64_t tmp = - uint64_t(__R[0]) | ((uint64_t(__R[1])) << (APINT_BITS_PER_WORD / 2)); - Remainder->set_VAL(tmp); - } else { - assert(!Remainder->isSingleWord() && - "Remainder ap_private not large enough"); - for (unsigned i = 0; i < rhsWords; ++i) - Remainder->set_pVal( - i, uint64_t(__R[i * 2]) | - ((uint64_t(__R[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); - } - Remainder->clearUnusedBits(); - } - - // Clean up the memory we allocated. - if (__U != &SPACE[0]) { - delete[] __U; - delete[] __V; - delete[] __Q; - delete[] __R; - } -} - -/// @brief Logical right-shift function. -template -INLINE ap_private<_AP_W, _AP_S, _AP_C> lshr( - const ap_private<_AP_W, _AP_S, _AP_C>& LHS, uint32_t shiftAmt) { - return LHS.lshr(shiftAmt); -} - -/// Left-shift the ap_private by shiftAmt. -/// @brief Left-shift function. -template -INLINE ap_private<_AP_W, _AP_S, _AP_C> shl( - const ap_private<_AP_W, _AP_S, _AP_C>& LHS, uint32_t shiftAmt) { - return LHS.shl(shiftAmt); -} - -} // namespace ap_private_ops - -#endif // LLVM_SUPPORT_MATHEXTRAS_H - -/// This enumeration just provides for internal constants used in this -/// translation unit. 
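Before the bit-width constants that the comment above introduces, here is a self-contained, editor-added sketch of the short-division fallback that divide() uses once normalization collapses the divisor to a single 32-bit digit; it is the schoolbook algorithm in base 2^32:

    #include <cassert>
    #include <cstdint>

    // Divide the little-endian base-2^32 digits u[0..len) in place and
    // return the remainder; mirrors the n == 1 branch of divide() above.
    static uint32_t short_div(uint32_t* u, int len, uint32_t divisor) {
        uint32_t rem = 0;
        for (int i = len - 1; i >= 0; --i) {
            uint64_t cur = (uint64_t(rem) << 32) | u[i];
            u[i] = uint32_t(cur / divisor);
            rem  = uint32_t(cur % divisor);
        }
        return rem;
    }

    int main() {
        uint32_t u[2] = {1u, 1u};        // value 2^32 + 1
        uint32_t r = short_div(u, 2, 3); // (2^32 + 1) / 3
        assert(u[1] == 0 && u[0] == 0x55555555u && r == 2);
        return 0;
    }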
-enum { - MIN_INT_BITS = 1, ///< Minimum number of bits that can be specified - ///< Note that this must remain synchronized with IntegerType::MIN_INT_BITS - MAX_INT_BITS = (1 << 23) - 1 ///< Maximum number of bits that can be specified - ///< Note that this must remain synchronized with IntegerType::MAX_INT_BITS -}; - -//===----------------------------------------------------------------------===// -// ap_private Class -//===----------------------------------------------------------------------===// - -/// ap_private - This class represents arbitrary precision constant integral -/// values. -/// It is a functional replacement for common case unsigned integer type like -/// "unsigned", "unsigned long" or "uint64_t", but also allows non-byte-width -/// integer sizes and large integer value types such as 3-bits, 15-bits, or more -/// than 64-bits of precision. ap_private provides a variety of arithmetic -/// operators -/// and methods to manipulate integer values of any bit-width. It supports both -/// the typical integer arithmetic and comparison operations as well as bitwise -/// manipulation. -/// -/// The class has several invariants worth noting: -/// * All bit, byte, and word positions are zero-based. -/// * Once the bit width is set, it doesn't change except by the Truncate, -/// SignExtend, or ZeroExtend operations. -/// * All binary operators must be on ap_private instances of the same bit -/// width. -/// Attempting to use these operators on instances with different bit -/// widths will yield an assertion. -/// * The value is stored canonically as an unsigned value. For operations -/// where it makes a difference, there are both signed and unsigned variants -/// of the operation. For example, sdiv and udiv. However, because the bit -/// widths must be the same, operations such as Mul and Add produce the same -/// results regardless of whether the values are interpreted as signed or -/// not. -/// * In general, the class tries to follow the style of computation that LLVM -/// uses in its IR. This simplifies its use for LLVM. -/// -/// @brief Class for arbitrary precision integers. - -#if defined(_MSC_VER) -#if _MSC_VER < 1400 && !defined(for) -#define for if (0); else for -#endif -typedef unsigned __int64 ap_ulong; -typedef signed __int64 ap_slong; -#else -typedef unsigned long long ap_ulong; -typedef signed long long ap_slong; -#endif -template -struct valtype; - -template -struct valtype<_AP_N8, false> { - typedef uint64_t Type; -}; - -template -struct valtype<_AP_N8, true> { - typedef int64_t Type; -}; - -template <> -struct valtype<1, false> { - typedef unsigned char Type; -}; -template <> -struct valtype<2, false> { - typedef unsigned short Type; -}; -template <> -struct valtype<3, false> { - typedef unsigned int Type; -}; -template <> -struct valtype<4, false> { - typedef unsigned int Type; -}; -template <> -struct valtype<1, true> { - typedef signed char Type; -}; -template <> -struct valtype<2, true> { - typedef short Type; -}; -template <> -struct valtype<3, true> { - typedef int Type; -}; -template <> -struct valtype<4, true> { - typedef int Type; -}; - -template -struct ap_private_enable_if {}; -template <> -struct ap_private_enable_if { - static const bool isValid = true; -}; - -// When bitwidth < 64 -template -class ap_private<_AP_W, _AP_S, true> { - // SFINAE pattern. 
Only consider this class when _AP_W <= 64 - const static bool valid = ap_private_enable_if<_AP_W <= 64>::isValid; - -#ifdef _MSC_VER -#pragma warning(disable : 4521 4522) -#endif - public: - typedef typename valtype<(_AP_W + 7) / 8, _AP_S>::Type ValType; - typedef ap_private<_AP_W, _AP_S> Type; - template - struct RType { - enum { - mult_w = _AP_W + _AP_W2, - mult_s = _AP_S || _AP_S2, - plus_w = - AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, - plus_s = _AP_S || _AP_S2, - minus_w = - AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, - minus_s = true, - div_w = _AP_W + _AP_S2, - div_s = _AP_S || _AP_S2, - mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)), - mod_s = _AP_S, - logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)), - logic_s = _AP_S || _AP_S2 - }; - typedef ap_private mult; - typedef ap_private plus; - typedef ap_private minus; - typedef ap_private logic; - typedef ap_private div; - typedef ap_private mod; - typedef ap_private<_AP_W, _AP_S> arg1; - typedef bool reduce; - }; - enum { APINT_BITS_PER_WORD = sizeof(uint64_t) * 8 }; - enum { - excess_bits = (_AP_W % APINT_BITS_PER_WORD) - ? APINT_BITS_PER_WORD - (_AP_W % APINT_BITS_PER_WORD) - : 0 - }; - static const uint64_t mask = ((uint64_t)~0ULL >> (excess_bits)); - static const uint64_t not_mask = ~mask; - static const uint64_t sign_bit_mask = 1ULL << (APINT_BITS_PER_WORD - 1); - template - struct sign_ext_mask { - static const uint64_t mask = ~0ULL << _AP_W1; - }; - static const int width = _AP_W; - - enum { - BitWidth = _AP_W, - _AP_N = 1, - }; - ValType VAL; ///< Used to store the <= 64 bits integer value. -#ifdef AP_CANARY - ValType CANARY; - void check_canary() { assert(CANARY == (ValType)0xDEADBEEFDEADBEEF); } - void set_canary() { CANARY = (ValType)0xDEADBEEFDEADBEEF; } -#else - void check_canary() {} - void set_canary() {} -#endif - - INLINE ValType& get_VAL(void) { return VAL; } - INLINE ValType get_VAL(void) const { return VAL; } - INLINE ValType get_VAL(void) const volatile { return VAL; } - INLINE void set_VAL(uint64_t value) { VAL = (ValType)value; } - INLINE ValType& get_pVal(int i) { return VAL; } - INLINE ValType get_pVal(int i) const { return VAL; } - INLINE const uint64_t* get_pVal() const { - assert(0 && "invalid usage"); - return 0; - } - INLINE ValType get_pVal(int i) const volatile { return VAL; } - INLINE uint64_t* get_pVal() const volatile { - assert(0 && "invalid usage"); - return 0; - } - INLINE void set_pVal(int i, uint64_t value) { VAL = (ValType)value; } - - INLINE uint32_t getBitWidth() const { return BitWidth; } - - template - ap_private<_AP_W, _AP_S>& operator=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - ap_private<_AP_W, _AP_S>& operator=( - const volatile ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(RHS.get_VAL()); // TODO check here about ap_private - clearUnusedBits(); - return *this; - } - - void operator=(const ap_private& RHS) volatile { - // Don't do anything for X = X - VAL = RHS.get_VAL(); // No need to check because no harm done by copying. - clearUnusedBits(); - } - - ap_private& operator=(const ap_private& RHS) { - // Don't do anything for X = X - VAL = RHS.get_VAL(); // No need to check because no harm done by copying. - clearUnusedBits(); - return *this; - } - - void operator=(const volatile ap_private& RHS) volatile { - // Don't do anything for X = X - VAL = RHS.get_VAL(); // No need to check because no harm done by copying. 
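-    // Worked example (illustrative, not in the original header): every
-    // assignment ends by canonicalizing the stored value. For
-    // ap_private<4, true>, excess_bits is 60, so clearUnusedBits() shifts
-    // VAL up 60 bits and arithmetically back down, sign-extending from
-    // bit 3:
-    //
-    //   ap_private<4, true> x = 9; // bit pattern 0b1001
-    //   int v = x.to_int();        // v == -7 after canonicalization
-    //
-    // For ap_private<4, false>, the same store simply masks back to 9.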
- clearUnusedBits(); - } - - ap_private& operator=(const volatile ap_private& RHS) { - // Don't do anything for X = X - VAL = RHS.get_VAL(); // No need to check because no harm done by copying. - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - *this = ap_private<_AP_W2, false>(op2); - return *this; - } - -#define ASSIGN_OP_FROM_INT(C_TYPE) \ - INLINE ap_private& operator=(const C_TYPE v) { \ - set_canary(); \ - this->VAL = (ValType)v; \ - clearUnusedBits(); \ - check_canary(); \ - return *this; \ - } - -ASSIGN_OP_FROM_INT(bool) -ASSIGN_OP_FROM_INT(char) -ASSIGN_OP_FROM_INT(signed char) -ASSIGN_OP_FROM_INT(unsigned char) -ASSIGN_OP_FROM_INT(short) -ASSIGN_OP_FROM_INT(unsigned short) -ASSIGN_OP_FROM_INT(int) -ASSIGN_OP_FROM_INT(unsigned int) -ASSIGN_OP_FROM_INT(long) -ASSIGN_OP_FROM_INT(unsigned long) -ASSIGN_OP_FROM_INT(ap_slong) -ASSIGN_OP_FROM_INT(ap_ulong) -#if 0 -ASSIGN_OP_FROM_INT(half) -ASSIGN_OP_FROM_INT(float) -ASSIGN_OP_FROM_INT(double) -#endif -#undef ASSIGN_OP_FROM_INT - - // XXX This is a must to prevent pointer being converted to bool. - INLINE ap_private& operator=(const char* s) { - ap_private tmp(s); // XXX direct-initialization, as ctor is explicit. - operator=(tmp); - return *this; - } - - private: - explicit INLINE ap_private(uint64_t* val) : VAL(val[0]) { - set_canary(); - clearUnusedBits(); - check_canary(); - } - - INLINE bool isSingleWord() const { return true; } - - public: - INLINE void fromString(const char* strStart, uint32_t slen, uint8_t radix) { - bool isNeg = strStart[0] == '-'; - if (isNeg) { - strStart++; - slen--; - } - - if (strStart[0] == '0' && (strStart[1] == 'b' || strStart[1] == 'B')) { - //if(radix == 0) radix = 2; - _AP_WARNING(radix != 2, "%s seems to have base %d, but %d given.", strStart, 2, radix); - strStart += 2; - slen -=2; - } else if (strStart[0] == '0' && (strStart[1] == 'o' || strStart[1] == 'O')) { - //if (radix == 0) radix = 8; - _AP_WARNING(radix != 8, "%s seems to have base %d, but %d given.", strStart, 8, radix); - strStart += 2; - slen -=2; - } else if (strStart[0] == '0' && (strStart[1] == 'x' || strStart[1] == 'X')) { - //if (radix == 0) radix = 16; - _AP_WARNING(radix != 16, "%s seems to have base %d, but %d given.", strStart, 16, radix); - strStart += 2; - slen -=2; - } else if (strStart[0] == '0' && (strStart[1] == 'd' || strStart[1] == 'D')) { - //if (radix == 0) radix = 10; - _AP_WARNING(radix != 10, "%s seems to have base %d, but %d given.", strStart, 10, radix); - strStart += 2; - slen -=2; - } else if (radix == 0) { - //radix = 2; // XXX default value - } - - // Check our assumptions here - assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) && - "Radix should be 2, 8, 10, or 16!"); - assert(strStart && "String is null?"); - - // Clear bits. - uint64_t tmpVAL = VAL = 0; - - switch (radix) { - case 2: - // sscanf(strStart,"%b",&VAL); - // tmpVAL = *strStart =='1' ? 
~0ULL : 0;
-      for (; *strStart; ++strStart) {
-        assert((*strStart == '0' || *strStart == '1') &&
-               ("Wrong binary number"));
-        tmpVAL <<= 1;
-        tmpVAL |= (*strStart - '0');
-      }
-      break;
-    case 8:
-#ifdef _MSC_VER
-      sscanf_s(strStart, "%llo", &tmpVAL, slen + 1);
-#else
-#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__)
-      sscanf(strStart, "%lo", &tmpVAL);
-#else
-      sscanf(strStart, "%llo", &tmpVAL);
-#endif //__x86_64__
-#endif //_MSC_VER
-      break;
-    case 10:
-#ifdef _MSC_VER
-      sscanf_s(strStart, "%llu", &tmpVAL, slen + 1);
-#else
-#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__)
-      sscanf(strStart, "%lu", &tmpVAL);
-#else
-      sscanf(strStart, "%llu", &tmpVAL);
-#endif //__x86_64__
-#endif //_MSC_VER
-      break;
-    case 16:
-#ifdef _MSC_VER
-      sscanf_s(strStart, "%llx", &tmpVAL, slen + 1);
-#else
-#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__)
-      sscanf(strStart, "%lx", &tmpVAL);
-#else
-      sscanf(strStart, "%llx", &tmpVAL);
-#endif //__x86_64__
-#endif //_MSC_VER
-      break;
-    default:
-      assert(0 && "Unknown radix");
-      // error
-    }
-    VAL = isNeg ? (ValType)(-tmpVAL) : (ValType)(tmpVAL);
-
-    clearUnusedBits();
-  }
-
- private:
-  INLINE ap_private(const std::string& val, uint8_t radix = 2) : VAL(0) {
-    assert(!val.empty() && "String empty?");
-    set_canary();
-    fromString(val.c_str(), val.size(), radix);
-    check_canary();
-  }
-
-  INLINE ap_private(const char strStart[], uint32_t slen, uint8_t radix)
-      : VAL(0) {
-    set_canary();
-    fromString(strStart, slen, radix);
-    check_canary();
-  }
-
-  INLINE ap_private(uint32_t numWords, const uint64_t bigVal[])
-      : VAL(bigVal[0]) {
-    set_canary();
-    clearUnusedBits();
-    check_canary();
-  }
-
- public:
-  INLINE ap_private() {
-    set_canary();
-    clearUnusedBits();
-    check_canary();
-  }
-
-#define CTOR(TYPE)                              \
-  INLINE ap_private(TYPE v) : VAL((ValType)v) { \
-    set_canary();                               \
-    clearUnusedBits();                          \
-    check_canary();                             \
-  }
-  CTOR(bool)
-  CTOR(char)
-  CTOR(signed char)
-  CTOR(unsigned char)
-  CTOR(short)
-  CTOR(unsigned short)
-  CTOR(int)
-  CTOR(unsigned int)
-  CTOR(long)
-  CTOR(unsigned long)
-  CTOR(ap_slong)
-  CTOR(ap_ulong)
-#if 0
-  CTOR(half)
-  CTOR(float)
-  CTOR(double)
-#endif
-#undef CTOR
-
-  template <int _AP_W1, bool _AP_S1, bool _AP_OPT>
-  INLINE ap_private(const ap_private<_AP_W1, _AP_S1, _AP_OPT>& that)
-      : VAL((ValType)that.get_VAL()) {
-    set_canary();
-    clearUnusedBits();
-    check_canary();
-  }
-
-  template <int _AP_W1, bool _AP_S1, bool _AP_OPT>
-  INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, _AP_OPT>& that)
-      : VAL((ValType)that.get_VAL()) {
-    set_canary();
-    clearUnusedBits();
-    check_canary();
-  }
-
-  explicit INLINE ap_private(const char* val) {
-    set_canary();
-    unsigned char radix = 10;
-    std::string str = ap_private_ops::parseString(val, radix); // will set radix.
-    std::string::size_type pos = str.find('.');
-    // trunc all fraction part
-    if (pos != std::string::npos) str = str.substr(pos);
-
-    ap_private<_AP_W, _AP_S> ap_private_val(str, radix);
-    operator=(ap_private_val);
-    check_canary();
-  }
-
-  INLINE ap_private(const char* val, signed char rd) {
-    set_canary();
-    unsigned char radix = rd;
-    std::string str = ap_private_ops::parseString(val, radix); // will set radix.
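-    // Illustrative usage (hypothetical values, not from the original
-    // header): parseString() normalizes the literal and can infer the
-    // radix from a "0b"/"0o"/"0x"/"0d" prefix, so these two constructions
-    // should yield the same 8-bit value:
-    //
-    //   ap_private<8, false> a("0x2a", 16);
-    //   ap_private<8, false> b("42", 10);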
- std::string::size_type pos = str.find('.'); - // trunc all fraction part - if (pos != std::string::npos) str = str.substr(pos); - - ap_private<_AP_W, _AP_S> ap_private_val(str, radix); - operator=(ap_private_val); - check_canary(); - } - - INLINE ~ap_private() { check_canary(); } - - INLINE bool isNegative() const { - static const uint64_t sign_mask = 1ULL << (_AP_W - 1); - return _AP_S && (sign_mask & VAL); - } - - INLINE bool isPositive() const { return !isNegative(); } - - INLINE bool isStrictlyPositive() const { return !isNegative() && VAL != 0; } - - INLINE bool isAllOnesValue() const { return (mask & VAL) == mask; } - - INLINE bool operator==(const ap_private<_AP_W, _AP_S>& RHS) const { - return VAL == RHS.get_VAL(); - } - INLINE bool operator==(const ap_private<_AP_W, !_AP_S>& RHS) const { - return (uint64_t)VAL == (uint64_t)RHS.get_VAL(); - } - - INLINE bool operator==(uint64_t Val) const { return ((uint64_t)VAL == Val); } - INLINE bool operator!=(uint64_t Val) const { return ((uint64_t)VAL != Val); } - INLINE bool operator!=(const ap_private<_AP_W, _AP_S>& RHS) const { - return VAL != RHS.get_VAL(); - } - INLINE bool operator!=(const ap_private<_AP_W, !_AP_S>& RHS) const { - return (uint64_t)VAL != (uint64_t)RHS.get_VAL(); - } - - /// postfix increment. - const ap_private operator++(int) { - ap_private orig(*this); - VAL++; - clearUnusedBits(); - return orig; - } - - /// prefix increment. - const ap_private operator++() { - ++VAL; - clearUnusedBits(); - return *this; - } - - /// postfix decrement. - const ap_private operator--(int) { - ap_private orig(*this); - --VAL; - clearUnusedBits(); - return orig; - } - - /// prefix decrement. - const ap_private operator--() { - --VAL; - clearUnusedBits(); - return *this; - } - - /// one's complement. - INLINE ap_private<_AP_W + !_AP_S, true> operator~() const { - ap_private<_AP_W + !_AP_S, true> Result(*this); - Result.flip(); - return Result; - } - - /// two's complement. - INLINE typename RType<1, false>::minus operator-() const { - return ap_private<1, false>(0) - (*this); - } - - /// logic negation. - INLINE bool operator!() const { return !VAL; } - - INLINE std::string toString(uint8_t radix, bool wantSigned) const; - INLINE std::string toStringUnsigned(uint8_t radix = 10) const { - return toString(radix, false); - } - INLINE std::string toStringSigned(uint8_t radix = 10) const { - return toString(radix, true); - } - INLINE void clear() { VAL = 0; } - INLINE ap_private& clear(uint32_t bitPosition) { - VAL &= ~(1ULL << (bitPosition)); - clearUnusedBits(); - return *this; - } - - INLINE ap_private ashr(uint32_t shiftAmt) const { - if (_AP_S) - return ap_private((shiftAmt == BitWidth) ? 0 - : ((int64_t)VAL) >> (shiftAmt)); - else - return ap_private((shiftAmt == BitWidth) ? 0 - : ((uint64_t)VAL) >> (shiftAmt)); - } - - INLINE ap_private lshr(uint32_t shiftAmt) const { - return ap_private((shiftAmt == BitWidth) - ? ap_private(0) - : ap_private((VAL & mask) >> (shiftAmt))); - } - - INLINE ap_private shl(uint32_t shiftAmt) const -// just for clang compiler -#if defined(__clang__) && !defined(__CLANG_3_1__) - __attribute__((no_sanitize("undefined"))) -#endif - { - if (shiftAmt > BitWidth) { - if (!isNegative()) - return ap_private(0); - else - return ap_private(-1); - } - if (shiftAmt == BitWidth) - return ap_private(0); - else - return ap_private((VAL) << (shiftAmt)); - // return ap_private((shiftAmt == BitWidth) ? 
ap_private(0ULL) : - // ap_private(VAL << shiftAmt)); - } - - INLINE int64_t getSExtValue() const { return VAL; } - - // XXX XXX this function is used in CBE - INLINE uint64_t getZExtValue() const { return VAL & mask; } - - template - INLINE ap_private(const _private_range_ref<_AP_W2, _AP_S2>& ref) { - set_canary(); - *this = ref.get(); - check_canary(); - } - - template - INLINE ap_private(const _private_bit_ref<_AP_W2, _AP_S2>& ref) { - set_canary(); - *this = ((uint64_t)(bool)ref); - check_canary(); - } - -// template -// INLINE ap_private(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { -// set_canary(); -// *this = ref.get(); -// check_canary(); -// } -// -// template -// INLINE ap_private( -// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { -// set_canary(); -// *this = ((val.operator ap_private<_AP_W2, false>())); -// check_canary(); -// } -// -// template -// INLINE ap_private( -// const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { -// set_canary(); -// *this = (uint64_t)(bool)val; -// check_canary(); -// } - - INLINE void write(const ap_private<_AP_W, _AP_S>& op2) volatile { - *this = (op2); - } - - // Explicit conversions to C interger types - //----------------------------------------------------------- - INLINE operator ValType() const { return get_VAL(); } - - INLINE int to_uchar() const { return (unsigned char)get_VAL(); } - - INLINE int to_char() const { return (signed char)get_VAL(); } - - INLINE int to_ushort() const { return (unsigned short)get_VAL(); } - - INLINE int to_short() const { return (short)get_VAL(); } - - INLINE int to_int() const { - // ap_private<64 /* _AP_W */, _AP_S> res(V); - return (int)get_VAL(); - } - - INLINE unsigned to_uint() const { return (unsigned)get_VAL(); } - - INLINE long to_long() const { return (long)get_VAL(); } - - INLINE unsigned long to_ulong() const { return (unsigned long)get_VAL(); } - - INLINE ap_slong to_int64() const { return (ap_slong)get_VAL(); } - - INLINE ap_ulong to_uint64() const { return (ap_ulong)get_VAL(); } - - INLINE double to_double() const { - if (isNegative()) - return roundToDouble(true); - else - return roundToDouble(false); - } - - INLINE unsigned length() const { return _AP_W; } - - INLINE bool isMinValue() const { return VAL == 0; } - template - INLINE ap_private& operator&=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(((uint64_t)VAL) & RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator|=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(((uint64_t)VAL) | RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator^=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(((uint64_t)VAL) ^ RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator*=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(((uint64_t)VAL) * RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator+=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(((uint64_t)VAL) + RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - INLINE ap_private& operator-=(const ap_private<_AP_W1, _AP_S1>& RHS) { - VAL = (ValType)(((uint64_t)VAL) - RHS.get_VAL()); - clearUnusedBits(); - return *this; - } - - template - INLINE typename RType<_AP_W1, _AP_S1>::logic operator&( - const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { - 
typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) & - RHS.get_VAL()); - return Ret; - } else { - typename RType<_AP_W1, _AP_S1>::logic Ret = *this; - return Ret & RHS; - } - } - - template - INLINE typename RType<_AP_W1, _AP_S1>::logic operator^( - const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { - typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) ^ - RHS.get_VAL()); - return Ret; - } else { - typename RType<_AP_W1, _AP_S1>::logic Ret = *this; - return Ret ^ RHS; - } - } - - template - INLINE typename RType<_AP_W1, _AP_S1>::logic operator|( - const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { - typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) | - RHS.get_VAL()); - return Ret; - } else { - typename RType<_AP_W1, _AP_S1>::logic Ret = *this; - return Ret | RHS; - } - } - - INLINE ap_private And(const ap_private& RHS) const { - return ap_private(VAL & RHS.get_VAL()); - } - - INLINE ap_private Or(const ap_private& RHS) const { - return ap_private(VAL | RHS.get_VAL()); - } - - INLINE ap_private Xor(const ap_private& RHS) const { - return ap_private(VAL ^ RHS.get_VAL()); - } -#if 1 - template - INLINE typename RType<_AP_W1, _AP_S1>::mult operator*( - const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (RType<_AP_W1, _AP_S1>::mult_w <= 64) { - typename RType<_AP_W1, _AP_S1>::mult Result(((uint64_t)VAL) * - RHS.get_VAL()); - return Result; - } else { - typename RType<_AP_W1, _AP_S1>::mult Result(*this); - Result *= RHS; - return Result; - } - } -#endif - INLINE ap_private Mul(const ap_private& RHS) const { - return ap_private(VAL * RHS.get_VAL()); - } - - INLINE ap_private Add(const ap_private& RHS) const { - return ap_private(VAL + RHS.get_VAL()); - } - - INLINE ap_private Sub(const ap_private& RHS) const { - return ap_private(VAL - RHS.get_VAL()); - } - - INLINE ap_private& operator&=(uint64_t RHS) { - VAL &= (ValType)RHS; - clearUnusedBits(); - return *this; - } - INLINE ap_private& operator|=(uint64_t RHS) { - VAL |= (ValType)RHS; - clearUnusedBits(); - return *this; - } - INLINE ap_private& operator^=(uint64_t RHS) { - VAL ^= (ValType)RHS; - clearUnusedBits(); - return *this; - } - INLINE ap_private& operator*=(uint64_t RHS) { - VAL *= (ValType)RHS; - clearUnusedBits(); - return *this; - } - INLINE ap_private& operator+=(uint64_t RHS) { - VAL += (ValType)RHS; - clearUnusedBits(); - return *this; - } - INLINE ap_private& operator-=(uint64_t RHS) { - VAL -= (ValType)RHS; - clearUnusedBits(); - return *this; - } - - INLINE bool isMinSignedValue() const { - static const uint64_t min_mask = ~(~0ULL << (_AP_W - 1)); - return BitWidth == 1 ? VAL == 1 - : (ap_private_ops::isNegative<_AP_W>(*this) && - ((min_mask & VAL) == 0)); - } - - template - INLINE typename RType<_AP_W1, _AP_S1>::plus operator+( - const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (RType<_AP_W1, _AP_S1>::plus_w <= 64) - return typename RType<_AP_W1, _AP_S1>::plus( - RType<_AP_W1, _AP_S1>::plus_s - ? 
int64_t(((uint64_t)VAL) + RHS.get_VAL()) - : uint64_t(((uint64_t)VAL) + RHS.get_VAL())); - typename RType<_AP_W1, _AP_S1>::plus Result = RHS; - Result += VAL; - return Result; - } - - template - INLINE typename RType<_AP_W1, _AP_S1>::minus operator-( - const ap_private<_AP_W1, _AP_S1>& RHS) const { - if (RType<_AP_W1, _AP_S1>::minus_w <= 64) - return typename RType<_AP_W1, _AP_S1>::minus( - int64_t(((uint64_t)VAL) - RHS.get_VAL())); - typename RType<_AP_W1, _AP_S1>::minus Result = *this; - Result -= RHS; - return Result; - } - - INLINE uint32_t countPopulation() const { - return ap_private_ops::CountPopulation_64(VAL); - } - INLINE uint32_t countLeadingZeros() const { - int remainder = BitWidth % 64; - int excessBits = (64 - remainder) % 64; - uint32_t Count = ap_private_ops::CountLeadingZeros_64(VAL); - if (Count) Count -= excessBits; - return AESL_std::min(Count, (uint32_t)_AP_W); - } - - /// HiBits - This function returns the high "numBits" bits of this ap_private. - INLINE ap_private<_AP_W, _AP_S> getHiBits(uint32_t numBits) const { - ap_private<_AP_W, _AP_S> ret(*this); - ret = (ret) >> (BitWidth - numBits); - return ret; - } - - /// LoBits - This function returns the low "numBits" bits of this ap_private. - INLINE ap_private<_AP_W, _AP_S> getLoBits(uint32_t numBits) const { - ap_private<_AP_W, _AP_S> ret(((uint64_t)VAL) << (BitWidth - numBits)); - ret = (ret) >> (BitWidth - numBits); - return ret; - // return ap_private(numBits, (VAL << (BitWidth - numBits))>> (BitWidth - - // numBits)); - } - - INLINE ap_private<_AP_W, _AP_S>& set(uint32_t bitPosition) { - VAL |= (1ULL << (bitPosition)); - clearUnusedBits(); - return *this; // clearUnusedBits(); - } - - INLINE void set() { - VAL = (ValType)~0ULL; - clearUnusedBits(); - } - - template - INLINE void set(const ap_private<_AP_W3, false>& val) { - operator=(ap_private<_AP_W3, _AP_S>(val)); - } - - INLINE void set(const ap_private& val) { operator=(val); } - - INLINE void clearUnusedBits(void) volatile -// just for clang compiler -#if defined(__clang__) && !defined(__CLANG_3_1__) - __attribute__((no_sanitize("undefined"))) -#endif - { - enum { excess_bits = (_AP_W % 64) ? 64 - _AP_W % 64 : 0 }; - VAL = (ValType)( - _AP_S - ? ((((int64_t)VAL) << (excess_bits)) >> (excess_bits)) - : (excess_bits ? (((uint64_t)VAL) << (excess_bits)) >> (excess_bits) - : (uint64_t)VAL)); - } - - INLINE void clearUnusedBitsToZero(void) { - enum { excess_bits = (_AP_W % 64) ? 64 - _AP_W % 64 : 0 }; - static uint64_t mask = ~0ULL >> (excess_bits); - VAL &= mask; - } - - INLINE ap_private udiv(const ap_private& RHS) const { - return ap_private((uint64_t)VAL / RHS.get_VAL()); - } - - /// Signed divide this ap_private by ap_private RHS. - /// @brief Signed division function for ap_private. - INLINE ap_private sdiv(const ap_private& RHS) const { - if (isNegative()) - if (RHS.isNegative()) - return ((uint64_t)(0 - (*this))) / (uint64_t)(0 - RHS); - else - return 0 - ((uint64_t)(0 - (*this)) / (uint64_t)(RHS)); - else if (RHS.isNegative()) - return 0 - (this->udiv((ap_private)(0 - RHS))); - return this->udiv(RHS); - } - - template - INLINE ap_private urem(const ap_private<_AP_W, _AP_S2>& RHS) const { - assert(RHS.get_VAL() != 0 && "Divide by 0"); - return ap_private(((uint64_t)VAL) % ((uint64_t)RHS.get_VAL())); - } - - /// Signed remainder operation on ap_private. - /// @brief Function for signed remainder operation. 
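-  /// The result follows C's truncated-division convention: a nonzero
-  /// remainder takes the sign of the dividend. Worked example (editorial
-  /// illustration, not from the original documentation):
-  ///   (-7) srem 3  == -1, since -7 == (-2)*3 + (-1)
-  ///    7 srem (-3) ==  1, since  7 == (-2)*(-3) + 1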
-  template <bool _AP_S2>
-  INLINE ap_private srem(const ap_private<_AP_W, _AP_S2>& RHS) const {
-    if (isNegative()) {
-      ap_private lhs = 0 - (*this);
-      if (RHS.isNegative()) {
-        ap_private rhs = 0 - RHS;
-        return 0 - (lhs.urem(rhs));
-      } else
-        return 0 - (lhs.urem(RHS));
-    } else if (RHS.isNegative()) {
-      ap_private rhs = 0 - RHS;
-      return this->urem(rhs);
-    }
-    return this->urem(RHS);
-  }
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE bool eq(const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    return (*this) == RHS;
-  }
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE bool ne(const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    return !((*this) == RHS);
-  }
-
-  /// Regards both *this and RHS as unsigned quantities and compares them for
-  /// the validity of the less-than relationship.
-  /// @returns true if *this < RHS when both are considered unsigned.
-  /// @brief Unsigned less than comparison
-  template <int _AP_W1, bool _AP_S1>
-  INLINE bool ult(const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    if (_AP_W1 <= 64) {
-      uint64_t lhsZext = ((uint64_t(VAL)) << (64 - _AP_W)) >> (64 - _AP_W);
-      uint64_t rhsZext =
-          ((uint64_t(RHS.get_VAL())) << (64 - _AP_W1)) >> (64 - _AP_W1);
-      return lhsZext < rhsZext;
-    } else
-      return RHS.uge(*this);
-  }
-
-  /// Regards both *this and RHS as signed quantities and compares them for
-  /// validity of the less-than relationship.
-  /// @returns true if *this < RHS when both are considered signed.
-  /// @brief Signed less than comparison
-  template <int _AP_W1, bool _AP_S1>
-  INLINE bool slt(const ap_private<_AP_W1, _AP_S1>& RHS) const
-// just for clang compiler
-#if defined(__clang__) && !defined(__CLANG_3_1__)
-      __attribute__((no_sanitize("undefined")))
-#endif
-  {
-    if (_AP_W1 <= 64) {
-      int64_t lhsSext = ((int64_t(VAL)) << (64 - _AP_W)) >> (64 - _AP_W);
-      int64_t rhsSext =
-          ((int64_t(RHS.get_VAL())) << (64 - _AP_W1)) >> (64 - _AP_W1);
-      return lhsSext < rhsSext;
-    } else
-      return RHS.sge(*this);
-  }
-
-  /// Regards both *this and RHS as unsigned quantities and compares them for
-  /// validity of the less-or-equal relationship.
-  /// @returns true if *this <= RHS when both are considered unsigned.
-  /// @brief Unsigned less or equal comparison
-  template <int _AP_W1, bool _AP_S1>
-  INLINE bool ule(const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    return ult(RHS) || eq(RHS);
-  }
-
-  /// Regards both *this and RHS as signed quantities and compares them for
-  /// validity of the less-or-equal relationship.
-  /// @returns true if *this <= RHS when both are considered signed.
-  /// @brief Signed less or equal comparison
-  template <int _AP_W1, bool _AP_S1>
-  INLINE bool sle(const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    return slt(RHS) || eq(RHS);
-  }
-
-  /// Regards both *this and RHS as unsigned quantities and compares them for
-  /// the validity of the greater-than relationship.
-  /// @returns true if *this > RHS when both are considered unsigned.
-  /// @brief Unsigned greater than comparison
-  template <int _AP_W1, bool _AP_S1>
-  INLINE bool ugt(const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    return !ult(RHS) && !eq(RHS);
-  }
-
-  /// Regards both *this and RHS as signed quantities and compares them for
-  /// the validity of the greater-than relationship.
-  /// @returns true if *this > RHS when both are considered signed.
-  /// @brief Signed greater than comparison
-  template <int _AP_W1, bool _AP_S1>
-  INLINE bool sgt(const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    return !slt(RHS) && !eq(RHS);
-  }
-
-  /// Regards both *this and RHS as unsigned quantities and compares them for
-  /// validity of the greater-or-equal relationship.
-  /// @returns true if *this >= RHS when both are considered unsigned.
-  /// @brief Unsigned greater or equal comparison
-  template <int _AP_W1, bool _AP_S1>
-  INLINE bool uge(const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    return !ult(RHS);
-  }
-
-  /// Regards both *this and RHS as signed quantities and compares them for
-  /// validity of the greater-or-equal relationship.
-  /// @returns true if *this >= RHS when both are considered signed.
-  /// @brief Signed greater or equal comparison
-  template <int _AP_W1, bool _AP_S1>
-  INLINE bool sge(const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    return !slt(RHS);
-  }
-
-  INLINE ap_private abs() const {
-    if (isNegative()) return -(*this);
-    return *this;
-  }
-
-  INLINE ap_private<_AP_W, false> get() const {
-    ap_private<_AP_W, false> ret(*this);
-    return ret;
-  }
-
-  INLINE static uint32_t getBitsNeeded(const char* str, uint32_t slen,
-                                       uint8_t radix) {
-    return _AP_W;
-  }
-
-  INLINE uint32_t getActiveBits() const {
-    uint32_t bits = _AP_W - countLeadingZeros();
-    return bits ? bits : 1;
-  }
-
-  INLINE double roundToDouble(bool isSigned = false) const {
-    return isSigned ? double((int64_t)VAL) : double((uint64_t)VAL);
-  }
-
-  /* Reverse the contents of the ap_private instance, i.e. LSB becomes MSB
-   * and vice versa. */
-  INLINE ap_private& reverse() {
-    for (int i = 0; i < _AP_W / 2; ++i) {
-      bool tmp = operator[](i);
-      if (operator[](_AP_W - 1 - i))
-        set(i);
-      else
-        clear(i);
-      if (tmp)
-        set(_AP_W - 1 - i);
-      else
-        clear(_AP_W - 1 - i);
-    }
-    clearUnusedBits();
-    return *this;
-  }
-
-  /* Return true if the value of the ap_private instance is zero. */
-  INLINE bool iszero() const { return isMinValue(); }
-
-  INLINE bool to_bool() const { return !iszero(); }
-
-  /* x < 0 */
-  INLINE bool sign() const {
-    if (isNegative()) return true;
-    return false;
-  }
-
-  /* x[i] = !x[i] */
-  INLINE void invert(int i) {
-    assert(i >= 0 && "Attempting to read bit with negative index");
-    assert(i < _AP_W && "Attempting to read bit beyond MSB");
-    flip(i);
-  }
-
-  /* x[i] */
-  INLINE bool test(int i) const {
-    assert(i >= 0 && "Attempting to read bit with negative index");
-    assert(i < _AP_W && "Attempting to read bit beyond MSB");
-    return operator[](i);
-  }
-
-  // This is used for sc_lv and sc_bv, which are implemented by sc_uint.
-  // Rotate an ap_private object n places to the left.
-  INLINE void lrotate(int n) {
-    assert(n >= 0 && "Attempting to shift negative index");
-    assert(n < _AP_W && "Shift value larger than bit width");
-    operator=(shl(n) | lshr(_AP_W - n));
-  }
-
-  // This is used for sc_lv and sc_bv, which are implemented by sc_uint.
-  // Rotate an ap_private object n places to the right.
-  INLINE void rrotate(int n) {
-    assert(n >= 0 && "Attempting to shift negative index");
-    assert(n < _AP_W && "Shift value larger than bit width");
-    operator=(lshr(n) | shl(_AP_W - n));
-  }
-
-  // Set the ith bit to v.
-  INLINE void set(int i, bool v) {
-    assert(i >= 0 && "Attempting to write bit with negative index");
-    assert(i < _AP_W && "Attempting to write bit beyond MSB");
-    v ? set(i) : clear(i);
-  }
-
-  // Set the ith bit to v.
-  INLINE void set_bit(int i, bool v) {
-    assert(i >= 0 && "Attempting to write bit with negative index");
-    assert(i < _AP_W && "Attempting to write bit beyond MSB");
-    v ? set(i) : clear(i);
-  }
-
-  // Get the value of the ith bit.
-  INLINE bool get_bit(int i) const {
-    assert(i >= 0 && "Attempting to read bit with negative index");
-    assert(i < _AP_W && "Attempting to read bit beyond MSB");
-    return (((1ULL << i) & VAL) != 0);
-  }
-
-  /// Toggle all bits.
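-  /// Example (illustrative, not from the original documentation): for
-  /// ap_private<4, false> holding 0b1010, flip() yields 0b0101, and a
-  /// second flip() restores 0b1010.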
- INLINE ap_private& flip() { - VAL = (ValType)((~0ULL ^ VAL) & mask); - clearUnusedBits(); - return *this; - } - - /// Toggles a given bit to its opposite value. - INLINE ap_private& flip(uint32_t bitPosition) { - assert(bitPosition < BitWidth && "Out of the bit-width range!"); - set_bit(bitPosition, !get_bit(bitPosition)); - return *this; - } - - // complements every bit - INLINE void b_not() { flip(); } - -// Binary Arithmetic -//----------------------------------------------------------- -#define OP_BIN_AP(Sym, Rty, Fun) \ - template \ - INLINE typename RType<_AP_W2, _AP_S2>::Rty operator Sym( \ - const ap_private<_AP_W2, _AP_S2>& op) const { \ - typename RType<_AP_W2, _AP_S2>::Rty lhs(*this); \ - typename RType<_AP_W2, _AP_S2>::Rty rhs(op); \ - return lhs.Fun(rhs); \ - } - -/// Bitwise and, or, xor -// OP_BIN_AP(&,logic, And) -// OP_BIN_AP(|,logic, Or) -// OP_BIN_AP(^,logic, Xor) -#undef OP_BIN_AP - - template - INLINE typename RType<_AP_W2, _AP_S2>::div operator/( - const ap_private<_AP_W2, _AP_S2>& op) const { - ap_private _AP_W2 ? _AP_S - : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> - lhs = *this; - ap_private _AP_W2 ? _AP_S - : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> - rhs = op; - return typename RType<_AP_W2, _AP_S2>::div( - (_AP_S || _AP_S2) ? lhs.sdiv(rhs) : lhs.udiv(rhs)); - } - - template - INLINE typename RType<_AP_W2, _AP_S2>::mod operator%( - const ap_private<_AP_W2, _AP_S2>& op) const { - ap_private _AP_W2 ? _AP_S - : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> - lhs = *this; - ap_private _AP_W2 ? _AP_S - : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))> - rhs = op; - typename RType<_AP_W2, _AP_S2>::mod res = - typename RType<_AP_W2, _AP_S2>::mod(_AP_S ? lhs.srem(rhs) - : lhs.urem(rhs)); - return res; - } - -#define OP_ASSIGN_AP_2(Sym) \ - template \ - INLINE ap_private<_AP_W, _AP_S>& operator Sym##=( \ - const ap_private<_AP_W2, _AP_S2>& op) { \ - *this = operator Sym(op); \ - return *this; \ - } - - OP_ASSIGN_AP_2(/) - OP_ASSIGN_AP_2(%) -#undef OP_ASSIGN_AP_2 - -/// Bitwise assign: and, or, xor -//------------------------------------------------------------- -// OP_ASSIGN_AP(&) -// OP_ASSIGN_AP(^) -// OP_ASSIGN_AP(|) - -#define OP_LEFT_SHIFT_CTYPE(TYPE, SIGNED) \ - INLINE ap_private operator<<(const TYPE op) const { \ - if (op >= _AP_W) return ap_private(0); \ - if (SIGNED && op < 0) return *this >> (0 - op); \ - return shl(op); \ - } - - // OP_LEFT_SHIFT_CTYPE(bool, false) - OP_LEFT_SHIFT_CTYPE(char, CHAR_IS_SIGNED) - OP_LEFT_SHIFT_CTYPE(signed char, true) - OP_LEFT_SHIFT_CTYPE(unsigned char, false) - OP_LEFT_SHIFT_CTYPE(short, true) - OP_LEFT_SHIFT_CTYPE(unsigned short, false) - OP_LEFT_SHIFT_CTYPE(int, true) - OP_LEFT_SHIFT_CTYPE(unsigned int, false) - OP_LEFT_SHIFT_CTYPE(long, true) - OP_LEFT_SHIFT_CTYPE(unsigned long, false) - OP_LEFT_SHIFT_CTYPE(long long, true) - OP_LEFT_SHIFT_CTYPE(unsigned long long, false) -#if 0 - OP_LEFT_SHIFT_CTYPE(half, false) - OP_LEFT_SHIFT_CTYPE(float, false) - OP_LEFT_SHIFT_CTYPE(double, false) -#endif - -#undef OP_LEFT_SHIFT_CTYPE - - template - INLINE ap_private operator<<(const ap_private<_AP_W2, _AP_S2>& op2) const { - if (_AP_S2 == false) { - uint32_t sh = op2.to_uint(); - return *this << sh; - } else { - int sh = op2.to_int(); - return *this << sh; - } - } - -#define OP_RIGHT_SHIFT_CTYPE(TYPE, SIGNED) \ - INLINE ap_private operator>>(const TYPE op) const { \ - if (op >= _AP_W) { \ - if (isNegative()) \ - return ap_private(-1); \ - else \ - return ap_private(0); \ - } \ - if ((SIGNED) && op < 0) return *this << (0 - op); 
\ - if (_AP_S) \ - return ashr(op); \ - else \ - return lshr(op); \ - } - - // OP_RIGHT_SHIFT_CTYPE(bool, false) - OP_RIGHT_SHIFT_CTYPE(char, CHAR_IS_SIGNED) - OP_RIGHT_SHIFT_CTYPE(signed char, true) - OP_RIGHT_SHIFT_CTYPE(unsigned char, false) - OP_RIGHT_SHIFT_CTYPE(short, true) - OP_RIGHT_SHIFT_CTYPE(unsigned short, false) - OP_RIGHT_SHIFT_CTYPE(int, true) - OP_RIGHT_SHIFT_CTYPE(unsigned int, false) - OP_RIGHT_SHIFT_CTYPE(long, true) - OP_RIGHT_SHIFT_CTYPE(unsigned long, false) - OP_RIGHT_SHIFT_CTYPE(unsigned long long, false) - OP_RIGHT_SHIFT_CTYPE(long long, true) -#if 0 - OP_RIGHT_SHIFT_CTYPE(half, false) - OP_RIGHT_SHIFT_CTYPE(float, false) - OP_RIGHT_SHIFT_CTYPE(double, false) -#endif - -#undef OP_RIGHT_SHIFT_CTYPE - - template - INLINE ap_private operator>>(const ap_private<_AP_W2, _AP_S2>& op2) const { - if (_AP_S2 == false) { - uint32_t sh = op2.to_uint(); - return *this >> sh; - } else { - int sh = op2.to_int(); - return *this >> sh; - } - } - - /// Shift assign - //----------------------------------------------------------------- - - //INLINE const ap_private& operator<<=(uint32_t shiftAmt) { - // VAL <<= shiftAmt; - // clearUnusedBits(); - // return *this; - //} - -#define OP_ASSIGN_AP(Sym) \ - template \ - INLINE ap_private& operator Sym##=(int op) { \ - *this = operator Sym(op); \ - clearUnusedBits(); \ - return *this; \ - } \ - INLINE ap_private& operator Sym##=(unsigned int op) { \ - *this = operator Sym(op); \ - clearUnusedBits(); \ - return *this; \ - } \ - template \ - INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \ - *this = operator Sym(op); \ - clearUnusedBits(); \ - return *this; \ - } - - OP_ASSIGN_AP(>>) - OP_ASSIGN_AP(<<) -#undef OP_ASSIGN_AP - - /// Comparisons - //----------------------------------------------------------------- - template - INLINE bool operator==(const ap_private<_AP_W1, _AP_S1>& op) const { - enum { _AP_MAX_W = AP_MAX(AP_MAX(_AP_W, _AP_W1), 32) }; - ap_private<_AP_MAX_W, false> lhs(*this); - ap_private<_AP_MAX_W, false> rhs(op); - if (_AP_MAX_W <= 64) { - return (uint64_t)lhs.get_VAL() == (uint64_t)rhs.get_VAL(); - } else - return lhs == rhs; - } - - template - INLINE bool operator!=(const ap_private<_AP_W2, _AP_S2>& op) const { - return !(*this == op); - } - - template - INLINE bool operator>(const ap_private<_AP_W2, _AP_S2>& op) const { - enum { - _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) - }; - ap_private<_AP_MAX_W, _AP_S> lhs(*this); - ap_private<_AP_MAX_W, _AP_S2> rhs(op); - // this will follow gcc rule for comparison - // between different bitwidth and signness - if (_AP_S == _AP_S2) - return _AP_S ? lhs.sgt(rhs) : lhs.ugt(rhs); - else if (_AP_W < 32 && _AP_W2 < 32) - // different signness but both bitwidth is less than 32 - return lhs.sgt(rhs); - else - // different signness but bigger bitwidth - // is greater or equal to 32 - if (_AP_S) - if (_AP_W2 >= _AP_W) - return lhs.ugt(rhs); - else - return lhs.sgt(rhs); - else if (_AP_W >= _AP_W2) - return lhs.ugt(rhs); - else - return lhs.sgt(rhs); - } - - template - INLINE bool operator<=(const ap_private<_AP_W2, _AP_S2>& op) const { - return !(*this > op); - } - - template - INLINE bool operator<(const ap_private<_AP_W2, _AP_S2>& op) const { - enum { - _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) - }; - ap_private<_AP_MAX_W, _AP_S> lhs(*this); - ap_private<_AP_MAX_W, _AP_S2> rhs(op); - if (_AP_S == _AP_S2) - return _AP_S ? 
lhs.slt(rhs) : lhs.ult(rhs); - else if (_AP_W < 32 && _AP_W2 < 32) - return lhs.slt(rhs); - else if (_AP_S) - if (_AP_W2 >= _AP_W) - return lhs.ult(rhs); - else - return lhs.slt(rhs); - else if (_AP_W >= _AP_W2) - return lhs.ult(rhs); - else - return lhs.slt(rhs); - } - - template - INLINE bool operator>=(const ap_private<_AP_W2, _AP_S2>& op) const { - return !(*this < op); - } - - /// Bit and Part Select - //-------------------------------------------------------------- - // FIXME now _private_range_ref refs to _AP_ROOT_TYPE(struct ssdm_int). - INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { - return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); - } - - INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { - return _private_range_ref<_AP_W, _AP_S>( - const_cast*>(this), Hi, Lo); - } - - INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { - return _private_range_ref<_AP_W, _AP_S>( - (const_cast*>(this)), Hi, Lo); - } - - INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { - return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); - } - - INLINE _private_bit_ref<_AP_W, _AP_S> operator[](int index) { - return _private_bit_ref<_AP_W, _AP_S>(*this, index); - } - - template - INLINE _private_bit_ref<_AP_W, _AP_S> operator[]( - const ap_private<_AP_W2, _AP_S2>& index) { - return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); - } - - INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](int index) const { - return _private_bit_ref<_AP_W, _AP_S>( - const_cast&>(*this), index); - } - - template - INLINE const _private_bit_ref<_AP_W, _AP_S> operator[]( - const ap_private<_AP_W2, _AP_S2>& index) const { - return _private_bit_ref<_AP_W, _AP_S>( - const_cast&>(*this), index.to_int()); - } - - INLINE _private_bit_ref<_AP_W, _AP_S> bit(int index) { - return _private_bit_ref<_AP_W, _AP_S>(*this, index); - } - - template - INLINE _private_bit_ref<_AP_W, _AP_S> bit(const ap_private<_AP_W2, _AP_S2>& index) { - return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); - } - - INLINE const _private_bit_ref<_AP_W, _AP_S> bit(int index) const { - return _private_bit_ref<_AP_W, _AP_S>( - const_cast&>(*this), index); - } - - template - INLINE const _private_bit_ref<_AP_W, _AP_S> bit( - const ap_private<_AP_W2, _AP_S2>& index) const { - return _private_bit_ref<_AP_W, _AP_S>( - const_cast&>(*this), index.to_int()); - } - -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// ap_private<_AP_W2, _AP_S2> > -// concat(const ap_private<_AP_W2, _AP_S2>& a2) const { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >( -// const_cast&>(*this), -// const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// ap_private<_AP_W2, _AP_S2> > -// concat(ap_private<_AP_W2, _AP_S2>& a2) { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > -// operator,(const ap_private<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<_AP_W, ap_private, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >( -// const_cast&>(*this), -// const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > -// operator,(const ap_private<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, ap_private, _AP_W2, -// ap_private<_AP_W2, _AP_S2> 
>( -// *this, const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > -// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<_AP_W, ap_private, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >( -// const_cast&>(*this), a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > -// operator,(ap_private<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, ap_private, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> > -// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> >( -// const_cast&>(*this), -// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> > -// operator,(_private_range_ref<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, -// _private_bit_ref<_AP_W2, _AP_S2> > -// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, -// _private_bit_ref<_AP_W2, _AP_S2> >( -// const_cast&>(*this), -// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, -// _private_bit_ref<_AP_W2, _AP_S2> > -// operator,(_private_bit_ref<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, -// _private_bit_ref<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > -// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( -// const_cast&>(*this), -// const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > -// operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { -// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, -// a2); -// } -// -// template -// INLINE ap_concat_ref< -// _AP_W, ap_private, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> -// &a2) const { -// return ap_concat_ref< -// _AP_W, ap_private, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( -// const_cast&>(*this), -// const_cast< -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref< -// _AP_W, ap_private, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { -// return ap_concat_ref< -// _AP_W, ap_private, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, -// a2); -// } -// -// template -// 
INLINE
-//  ap_concat_ref<_AP_W, ap_private, 1,
-//                af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
-//  operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>
-//                &a2) const {
-//    return ap_concat_ref<
-//        _AP_W, ap_private, 1,
-//        af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(
-//        const_cast<ap_private<_AP_W, _AP_S>&>(*this),
-//        const_cast<af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(
-//            a2));
-//  }
-//
-//  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
-//            ap_o_mode _AP_O2, int _AP_N2>
-//  INLINE
-//  ap_concat_ref<_AP_W, ap_private, 1,
-//                af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
-//  operator,(
-//      af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) {
-//    return ap_concat_ref<
-//        _AP_W, ap_private, 1,
-//        af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2);
-//  }
-//
-//  template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
-//  INLINE ap_private operator&(
-//      const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
-//    return *this & a2.get();
-//  }
-//
-//  template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
-//  INLINE ap_private operator|(
-//      const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
-//    return *this | a2.get();
-//  }
-//
-//  template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
-//  INLINE ap_private operator^(
-//      const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
-//    return *this ^ a2.get();
-//  }
-
-  // Reduce operation
-  //-----------------------------------------------------------
-  INLINE bool and_reduce() const { return (VAL & mask) == mask; }
-
-  INLINE bool nand_reduce() const { return (VAL & mask) != mask; }
-
-  INLINE bool or_reduce() const { return (bool)VAL; }
-
-  INLINE bool nor_reduce() const { return VAL == 0; }
-
-  INLINE bool xor_reduce() const {
-    unsigned int i = countPopulation();
-    return (i % 2) ? true : false;
-  }
-
-  INLINE bool xnor_reduce() const {
-    unsigned int i = countPopulation();
-    return (i % 2) ? false : true;
-  }
-
-  INLINE std::string to_string(uint8_t radix = 2, bool sign = false) const {
-    return toString(radix, radix == 10 ? _AP_S : sign);
-  }
-}; // End of class ap_private<_AP_W, _AP_S, true>
-
-template <int _AP_W, bool _AP_S>
-std::string ap_private<_AP_W, _AP_S, true>::toString(uint8_t radix,
-                                                     bool wantSigned) const {
-  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
-         "Radix should be 2, 8, 10, or 16!");
-  static const char* digits[] = {"0", "1", "2", "3", "4", "5", "6", "7",
-                                 "8", "9", "a", "b", "c", "d", "e", "f"};
-  std::string result;
-  if (radix != 10) {
-    // For the radix 2, 8 and 16 cases, we can just shift instead of divide
-    // because the number of bits per digit (1, 3 and 4 respectively) divides
-    // equally. We just shift until the value is zero.
-
-    // First, check for a zero value and just short circuit the logic below.
-    if (*this == (uint64_t)(0)) {
-      // Always generate a radix indicator because fixed-point
-      // formats require it.
-      switch (radix) {
-        case 2:
-          result = "0b0";
-          break;
-        case 8:
-          result = "0o0";
-          break;
-        case 16:
-          result = "0x0";
-          break;
-        default:
-          assert("invalid radix" && 0);
-      }
-    } else {
-      ap_private<_AP_W, false, true> tmp(*this);
-      size_t insert_at = 0;
-      bool leading_zero = true;
-      if (wantSigned && isNegative()) {
-        // They want to print the signed version and it is a negative value.
-        // Flip the bits and add one to turn it into the equivalent positive
-        // value and put a '-' in the result.
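-        // Worked example (illustrative, not in the original comments):
-        // for _AP_W = 8 and value -5, tmp starts as 0xfb; flip() gives
-        // 0x04, the increment gives 0x05, and the digit loop below then
-        // emits "101", so the radix-2 result is "-0b101".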
-        tmp.flip();
-        tmp++;
-        result = "-";
-        insert_at = 1;
-        leading_zero = false;
-      }
-      switch (radix) {
-        case 2:
-          result += "0b";
-          break;
-        case 8:
-          result += "0o";
-          break;
-        case 16:
-          result += "0x";
-          break;
-        default:
-          assert("invalid radix" && 0);
-      }
-      insert_at += 2;
-
-      // Just shift tmp right by one digit's width at a time until it
-      // becomes zero.
-      uint32_t shift = (radix == 16 ? 4 : (radix == 8 ? 3 : 1));
-      uint64_t mask = radix - 1;
-      ap_private<_AP_W, false, true> zero(0);
-      unsigned bits = 0;
-      bool msb = false;
-      while (tmp.ne(zero)) {
-        unsigned digit = (unsigned)(tmp.get_VAL() & mask);
-        result.insert(insert_at, digits[digit]);
-        tmp = tmp.lshr(shift);
-        bits++;
-        msb = (digit >> (shift - 1)) == 1;
-      }
-      bits *= shift;
-      if (bits < _AP_W && leading_zero && msb)
-        result.insert(insert_at, digits[0]);
-    }
-    return result;
-  }
-
-  ap_private<_AP_W, false, true> tmp(*this);
-  ap_private<6, false, true> divisor(radix);
-  ap_private<_AP_W, _AP_S, true> zero(0);
-  size_t insert_at = 0;
-  if (wantSigned && isNegative()) {
-    // They want to print the signed version and it is a negative value.
-    // Flip the bits and add one to turn it into the equivalent positive
-    // value and put a '-' in the result.
-    tmp.flip();
-    tmp++;
-    result = "-";
-    insert_at = 1;
-  }
-  if (tmp == ap_private<_AP_W, false, true>(0ULL))
-    result = "0";
-  else
-    while (tmp.ne(zero)) {
-      ap_private<_AP_W, false, true> APdigit = tmp % divisor;
-      ap_private<_AP_W, false, true> tmp2 = tmp / divisor;
-      uint32_t digit = (uint32_t)(APdigit.getZExtValue());
-      assert(digit < radix && "divide failed");
-      result.insert(insert_at, digits[digit]);
-      tmp = tmp2;
-    }
-  return result;
-
-} // End of ap_private<_AP_W, _AP_S, true>::toString()
-
-// bitwidth > 64
-template <int _AP_W, bool _AP_S>
-class ap_private<_AP_W, _AP_S, false> {
-  // SFINAE pattern. Only consider this class when _AP_W > 64.
-  const static bool valid = ap_private_enable_if<(_AP_W > 64)>::isValid;
-
-#ifdef _MSC_VER
-#pragma warning(disable : 4521 4522)
-#endif
- public:
-  enum { BitWidth = _AP_W, _AP_N = (_AP_W + 63) / 64 };
-  static const int width = _AP_W;
-
- private:
-  /// This constructor is used only internally for speed of construction of
-  /// temporaries. It is unsafe for general use so it is not public.
-
-  /* Constructors */
-  /// Note that numWords can be smaller or larger than the corresponding bit
-  /// width, but any extraneous bits will be dropped.
-  /// @param numWords the number of words in bigVal
-  /// @param bigVal a sequence of words to form the initial value of the
-  /// ap_private
-  /// @brief Construct an ap_private, initialized as bigVal[].
-  INLINE ap_private(uint32_t numWords, const uint64_t bigVal[]) {
-    set_canary();
-    assert(bigVal && "Null pointer detected!");
-    {
-      // Get memory, cleared to 0
-      memset(pVal, 0, _AP_N * sizeof(uint64_t));
-
-      // Calculate the number of words to copy
-      uint32_t words = AESL_std::min(numWords, _AP_N);
-      // Copy the words from bigVal to pVal
-      memcpy(pVal, bigVal, words * APINT_WORD_SIZE);
-      if (words >= _AP_W) clearUnusedBits();
-      // Make sure unused high bits are cleared
-    }
-    check_canary();
-  }
-
-  /// This constructor interprets Val as a string in the given radix. The
-  /// interpretation stops when the first character that is not suitable for
-  /// the radix is encountered. Acceptable radix values are 2, 8, 10 and 16.
-  /// It is an error for the value implied by the string to require more bits
-  /// than numBits.
- /// @param val the string to be interpreted - /// @param radix the radix of Val to use for the intepretation - /// @brief Construct an ap_private from a string representation. - INLINE ap_private(const std::string& val, uint8_t radix = 2) { - set_canary(); - assert(!val.empty() && "The input string is empty."); - const char* c_str = val.c_str(); - fromString(c_str, val.size(), radix); - check_canary(); - } - - /// This constructor interprets the slen characters starting at StrStart as - /// a string in the given radix. The interpretation stops when the first - /// character that is not suitable for the radix is encountered. Acceptable - /// radix values are 2, 8, 10 and 16. It is an error for the value implied by - /// the string to require more bits than numBits. - /// @param strStart the start of the string to be interpreted - /// @param slen the maximum number of characters to interpret - /// @param radix the radix to use for the conversion - /// @brief Construct an ap_private from a string representation. - /// This method does not consider whether it is negative or not. - INLINE ap_private(const char strStart[], uint32_t slen, uint8_t radix) { - set_canary(); - fromString(strStart, slen, radix); - check_canary(); - } - - INLINE void report() { - _AP_ERROR(_AP_W > MAX_MODE(AP_INT_MAX_W) * 1024, - "ap_%sint<%d>: Bitwidth exceeds the " - "default max value %d. Please use macro " - "AP_INT_MAX_W to set a larger max value.", - _AP_S ? "" : "u", _AP_W, MAX_MODE(AP_INT_MAX_W) * 1024); - } - /// This union is used to store the integer value. When the - /// integer bit-width <= 64, it uses VAL, otherwise it uses pVal. - - /// This enum is used to hold the constants we needed for ap_private. - // uint64_t VAL; ///< Used to store the <= 64 bits integer value. - uint64_t pVal[_AP_N]; ///< Used to store the >64 bits integer value. -#ifdef AP_CANARY - uint64_t CANARY; - INLINE void check_canary() { assert(CANARY == (uint64_t)0xDEADBEEFDEADBEEF); } - INLINE void set_canary() { CANARY = (uint64_t)0xDEADBEEFDEADBEEF; } -#else - INLINE void check_canary() {} - INLINE void set_canary() {} -#endif - - public: - typedef typename valtype<8, _AP_S>::Type ValType; - typedef ap_private<_AP_W, _AP_S> Type; - // FIXME remove friend type? 
- template - friend struct ap_fixed_base; - /// return type of variety of operations - //---------------------------------------------------------- - template - struct RType { - enum { - mult_w = _AP_W + _AP_W2, - mult_s = _AP_S || _AP_S2, - plus_w = - AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, - plus_s = _AP_S || _AP_S2, - minus_w = - AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, - minus_s = true, - div_w = _AP_W + _AP_S2, - div_s = _AP_S || _AP_S2, - mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)), - mod_s = _AP_S, - logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)), - logic_s = _AP_S || _AP_S2 - }; - typedef ap_private mult; - typedef ap_private plus; - typedef ap_private minus; - typedef ap_private logic; - typedef ap_private div; - typedef ap_private mod; - typedef ap_private<_AP_W, _AP_S> arg1; - typedef bool reduce; - }; - - INLINE uint64_t& get_VAL(void) { return pVal[0]; } - INLINE uint64_t get_VAL(void) const { return pVal[0]; } - INLINE uint64_t get_VAL(void) const volatile { return pVal[0]; } - INLINE void set_VAL(uint64_t value) { pVal[0] = value; } - INLINE uint64_t& get_pVal(int index) { return pVal[index]; } - INLINE uint64_t* get_pVal() { return pVal; } - INLINE const uint64_t* get_pVal() const { return pVal; } - INLINE uint64_t get_pVal(int index) const { return pVal[index]; } - INLINE uint64_t* get_pVal() const volatile { return pVal; } - INLINE uint64_t get_pVal(int index) const volatile { return pVal[index]; } - INLINE void set_pVal(int i, uint64_t value) { pVal[i] = value; } - - /// This enum is used to hold the constants we needed for ap_private. - enum { - APINT_BITS_PER_WORD = sizeof(uint64_t) * 8, ///< Bits in a word - APINT_WORD_SIZE = sizeof(uint64_t) ///< Byte size of a word - }; - - enum { - excess_bits = (_AP_W % APINT_BITS_PER_WORD) - ? APINT_BITS_PER_WORD - (_AP_W % APINT_BITS_PER_WORD) - : 0 - }; - static const uint64_t mask = ((uint64_t)~0ULL >> (excess_bits)); - - public: - // NOTE changed to explicit to be consistent with ap_private - explicit INLINE ap_private(const char* val) { - set_canary(); - unsigned char radix = 10; - std::string str = ap_private_ops::parseString(val, radix); // determine radix. - std::string::size_type pos = str.find('.'); - if (pos != std::string::npos) str = str.substr(pos); - ap_private ap_private_val(str, radix); - operator=(ap_private_val); - report(); - check_canary(); - } - - INLINE ap_private(const char* val, unsigned char rd) { - set_canary(); - unsigned char radix = rd; - std::string str = ap_private_ops::parseString(val, radix); // determine radix. 
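-    // Illustrative usage (hypothetical value, not from the original
-    // header): for widths above 64 bits the digits land in pVal[] as
-    // base-2^64 words, so a literal too wide for any built-in integer
-    // type is still accepted:
-    //
-    //   ap_private<128, false> big("0123456789abcdef0123456789abcdef", 16);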
- std::string::size_type pos = str.find('.'); - if (pos != std::string::npos) str = str.substr(pos); - ap_private ap_private_val(str, radix); - operator=(ap_private_val); - report(); - - report(); - check_canary(); - } - - template - INLINE ap_private(const _private_range_ref<_AP_W2, _AP_S2>& ref) { - set_canary(); - *this = ref.get(); - report(); - check_canary(); - } - - template - INLINE ap_private(const _private_bit_ref<_AP_W2, _AP_S2>& ref) { - set_canary(); - *this = ((uint64_t)(bool)ref); - report(); - check_canary(); - } - -// template -// INLINE ap_private(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) { -// set_canary(); -// *this = ref.get(); -// report(); -// check_canary(); -// } -// -// template -// INLINE ap_private( -// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { -// set_canary(); -// *this = ((val.operator ap_private<_AP_W2, false>())); -// report(); -// check_canary(); -// } -// -// template -// INLINE ap_private( -// const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { -// set_canary(); -// *this = (uint64_t)(bool)val; -// report(); -// check_canary(); -// } - - /// Simply makes *this a copy of that. - /// @brief Copy Constructor. - INLINE ap_private(const ap_private& that) { - set_canary(); - memcpy(pVal, that.get_pVal(), _AP_N * APINT_WORD_SIZE); - clearUnusedBits(); - check_canary(); - } - - template - INLINE ap_private(const ap_private<_AP_W1, _AP_S1, false>& that) { - set_canary(); - operator=(that); - check_canary(); - } - - template - INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, false>& that) { - set_canary(); - operator=(const_cast&>(that)); - check_canary(); - } - - template - INLINE ap_private(const ap_private<_AP_W1, _AP_S1, true>& that) { - set_canary(); - static const uint64_t that_sign_ext_mask = - (_AP_W1 == APINT_BITS_PER_WORD) - ? 0 - : ~0ULL >> (_AP_W1 % APINT_BITS_PER_WORD) - << (_AP_W1 % APINT_BITS_PER_WORD); - if (that.isNegative()) { - pVal[0] = that.get_VAL() | that_sign_ext_mask; - memset(pVal + 1, ~0, sizeof(uint64_t) * (_AP_N - 1)); - } else { - pVal[0] = that.get_VAL(); - memset(pVal + 1, 0, sizeof(uint64_t) * (_AP_N - 1)); - } - clearUnusedBits(); - check_canary(); - } - - template - INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, true>& that) { - set_canary(); - operator=(const_cast&>(that)); - check_canary(); - } - - /// @brief Destructor. - // virtual ~ap_private() {} - INLINE ~ap_private() { check_canary(); } - - /// @name Constructors - /// @{ - - /// Default constructor that creates an uninitialized ap_private. This is - /// useful - /// for object deserialization (pair this with the static method Read). - INLINE ap_private() { - set_canary(); - clearUnusedBits(); - check_canary(); - } - - INLINE ap_private(uint64_t* val, uint32_t bits = _AP_W) { assert(0); } - INLINE ap_private(const uint64_t* const val, uint32_t bits) { assert(0); } - -/// If isSigned is true then val is treated as if it were a signed value -/// (i.e. as an int64_t) and the appropriate sign extension to the bit width -/// will be done. Otherwise, no sign extension occurs (high order bits beyond -/// the range of val are zero filled). -/// @param numBits the bit width of the constructed ap_private -/// @param val the initial value of the ap_private -/// @param isSigned how to treat signedness of val -/// @brief Create a new ap_private of numBits width, initialized as val. 
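-/// A short example of the difference (illustrative, not from the original
-/// documentation), with _AP_W = 128:
-///
-///   ap_private<128, true> s(-1);            // int is signed: pVal[0] and
-///                                           // pVal[1] both become ~0ULL
-///   ap_private<128, false> u(0xffffffffU);  // unsigned int: pVal[0] is
-///                                           // 0xffffffff, pVal[1] is 0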
-#define CTOR(TYPE, SIGNED)                                  \
-  INLINE ap_private(TYPE val, bool isSigned = SIGNED) {     \
-    set_canary();                                           \
-    pVal[0] = (ValType)val;                                 \
-    if (isSigned && int64_t(pVal[0]) < 0) {                 \
-      memset(pVal + 1, ~0, sizeof(uint64_t) * (_AP_N - 1)); \
-    } else {                                                \
-      memset(pVal + 1, 0, sizeof(uint64_t) * (_AP_N - 1));  \
-    }                                                       \
-    clearUnusedBits();                                      \
-    check_canary();                                         \
-  }
-
-  CTOR(bool, false)
-  CTOR(char, CHAR_IS_SIGNED)
-  CTOR(signed char, true)
-  CTOR(unsigned char, false)
-  CTOR(short, true)
-  CTOR(unsigned short, false)
-  CTOR(int, true)
-  CTOR(unsigned int, false)
-  CTOR(long, true)
-  CTOR(unsigned long, false)
-  CTOR(ap_slong, true)
-  CTOR(ap_ulong, false)
-#if 0
-  CTOR(half, false)
-  CTOR(float, false)
-  CTOR(double, false)
-#endif
-#undef CTOR
-
-  /// @returns true if the number of bits <= 64, false otherwise.
-  /// @brief Determine if this ap_private just has one word to store value.
-  INLINE bool isSingleWord() const { return false; }
-
-  /// @returns the word position for the specified bit position.
-  /// @brief Determine which word a bit is in.
-  static INLINE uint32_t whichWord(uint32_t bitPosition) {
-    // return bitPosition / APINT_BITS_PER_WORD;
-    return (bitPosition) >> 6;
-  }
-
-  /// @returns the bit position in a word for the specified bit position
-  /// in the ap_private.
-  /// @brief Determine which bit in a word a bit is in.
-  static INLINE uint32_t whichBit(uint32_t bitPosition) {
-    // return bitPosition % APINT_BITS_PER_WORD;
-    return bitPosition & 0x3f;
-  }
-
-  /// bit at a specific bit position. This is used to mask the bit in the
-  /// corresponding word.
-  /// @returns a uint64_t with only bit at "whichBit(bitPosition)" set
-  /// @brief Get a single bit mask.
-  static INLINE uint64_t maskBit(uint32_t bitPosition) {
-    return 1ULL << (whichBit(bitPosition));
-  }
-
-  /// @returns the corresponding word for the specified bit position.
-  /// @brief Get the word corresponding to a bit position
-  INLINE uint64_t getWord(uint32_t bitPosition) const {
-    return pVal[whichWord(bitPosition)];
-  }
-
-  /// This method is used internally to clear the to "N" bits in the high order
-  /// word that are not used by the ap_private. This is needed after the most
-  /// significant word is assigned a value to ensure that those bits are
-  /// zero'd out.
-  /// @brief Clear unused high order bits
-  INLINE void clearUnusedBits(void) volatile
-// just for clang compiler
-#if defined(__clang__) && !defined(__CLANG_3_1__)
-      __attribute__((no_sanitize("undefined")))
-#endif
-  {
-    pVal[_AP_N - 1] =
-        _AP_S ? ((((int64_t)pVal[_AP_N - 1]) << (excess_bits)) >> excess_bits)
-              : (excess_bits
-                     ? ((pVal[_AP_N - 1]) << (excess_bits)) >> (excess_bits)
-                     : pVal[_AP_N - 1]);
-  }
-
-  INLINE void clearUnusedBitsToZero(void) { pVal[_AP_N - 1] &= mask; }
-
-  INLINE void clearUnusedBitsToOne(void) { pVal[_AP_N - 1] |= mask; }
-
-  /// This is used by the constructors that take string arguments.
-  /// @brief Convert a char array into an ap_private
-  INLINE void fromString(const char* str, uint32_t slen, uint8_t radix) {
-    enum { numbits = _AP_W };
-    bool isNeg = str[0] == '-';
-    if (isNeg) {
-      str++;
-      slen--;
-    }
-
-    if (str[0] == '0' && (str[1] == 'b' || str[1] == 'B')) {
-      //if(radix == 0) radix = 2;
-      _AP_WARNING(radix != 2, "%s seems to have base %d, but %d given.", str, 2, radix);
-      str += 2;
-      slen -=2;
-    } else if (str[0] == '0' && (str[1] == 'o' || str[1] == 'O')) {
-      //if (radix == 0) radix = 8;
-      _AP_WARNING(radix != 8, "%s seems to have base %d, but %d given.", str, 8, radix);
-      str += 2;
-      slen -=2;
-    } else if (str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) {
-      //if (radix == 0) radix = 16;
-      _AP_WARNING(radix != 16, "%s seems to have base %d, but %d given.", str, 16, radix);
-      str += 2;
-      slen -=2;
-    } else if (str[0] == '0' && (str[1] == 'd' || str[1] == 'D')) {
-      //if (radix == 0) radix = 10;
-      _AP_WARNING(radix != 10, "%s seems to have base %d, but %d given.", str, 10, radix);
-      str += 2;
-      slen -=2;
-    } else if (radix == 0) {
-      //radix = 2; // XXX default value
-    }
-
-    // Check our assumptions here
-    assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
-           "Radix should be 2, 8, 10, or 16!");
-    assert(str && "String is null?");
-
-    // skip any leading zero
-    while (*str == '0' && *(str + 1) != '\0') {
-      str++;
-      slen--;
-    }
-    assert((slen <= numbits || radix != 2) && "Insufficient bit width");
-    assert(((slen - 1) * 3 <= numbits || radix != 8) &&
-           "Insufficient bit width");
-    assert(((slen - 1) * 4 <= numbits || radix != 16) &&
-           "Insufficient bit width");
-    assert((((slen - 1) * 64) / 22 <= numbits || radix != 10) &&
-           "Insufficient bit width");
-
-    // clear bits
-    memset(pVal, 0, _AP_N * sizeof(uint64_t));
-
-    // Figure out if we can shift instead of multiply
-    uint32_t shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0);
-
-    // Set up an ap_private for the digit to add outside the loop so we don't
-    // constantly construct/destruct it.
-    uint64_t bigVal[_AP_N];
-    memset(bigVal, 0, _AP_N * sizeof(uint64_t));
-    ap_private<_AP_W, _AP_S> apdigit(getBitWidth(), bigVal);
-    ap_private<_AP_W, _AP_S> apradix(radix);
-
-    // Enter digit traversal loop
-    for (unsigned i = 0; i < slen; i++) {
-      // Get a digit
-      uint32_t digit = 0;
-      char cdigit = str[i];
-      if (radix == 16) {
-#define isxdigit(c)                                          \
-  (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') || \
-   ((c) >= 'A' && (c) <= 'F'))
-#define isdigit(c) ((c) >= '0' && (c) <= '9')
-        if (!isxdigit(cdigit)) assert(0 && "Invalid hex digit in string");
-        if (isdigit(cdigit))
-          digit = cdigit - '0';
-        else if (cdigit >= 'a')
-          digit = cdigit - 'a' + 10;
-        else if (cdigit >= 'A')
-          digit = cdigit - 'A' + 10;
-        else
-          assert(0 && "huh? we shouldn't get here");
-      } else if (isdigit(cdigit)) {
-        digit = cdigit - '0';
-      } else if (cdigit != '\0') {
-        assert(0 && "Invalid character in digit string");
-      }
-#undef isxdigit
-#undef isdigit
-      // Shift or multiply the value by the radix
-      if (shift)
-        *this <<= shift;
-      else
-        *this *= apradix;
-
-      // Add in the digit we just interpreted
-      apdigit.set_VAL(digit);
-      *this += apdigit;
-    }
-    // If its negative, put it in two's complement form
-    if (isNeg) {
-      (*this)--;
-      this->flip();
-    }
-    clearUnusedBits();
-  }
-
-  INLINE ap_private read() volatile { return *this; }
-
-  INLINE void write(const ap_private& op2) volatile { *this = (op2); }
-
-  INLINE operator ValType() const { return get_VAL(); }
-
-  INLINE int to_uchar() const { return (unsigned char)get_VAL(); }
-
-  INLINE int to_char() const { return (signed char)get_VAL(); }
-
-  INLINE int to_ushort() const { return (unsigned short)get_VAL(); }
-
-  INLINE int to_short() const { return (short)get_VAL(); }
-
-  INLINE int to_int() const { return (int)get_VAL(); }
-
-  INLINE unsigned to_uint() const { return (unsigned)get_VAL(); }
-
-  INLINE long to_long() const { return (long)get_VAL(); }
-
-  INLINE unsigned long to_ulong() const { return (unsigned long)get_VAL(); }
-
-  INLINE ap_slong to_int64() const { return (ap_slong)get_VAL(); }
-
-  INLINE ap_ulong to_uint64() const { return (ap_ulong)get_VAL(); }
-
-  INLINE double to_double() const {
-    if (isNegative())
-      return roundToDouble(true);
-    else
-      return roundToDouble(false);
-  }
-
-  INLINE unsigned length() const { return _AP_W; }
-
-  /*Reverse the contents of ap_private instance. I.e. LSB becomes MSB and vise
-   * versa*/
-  INLINE ap_private& reverse() {
-    for (int i = 0; i < _AP_W / 2; ++i) {
-      bool tmp = operator[](i);
-      if (operator[](_AP_W - 1 - i))
-        set(i);
-      else
-        clear(i);
-      if (tmp)
-        set(_AP_W - 1 - i);
-      else
-        clear(_AP_W - 1 - i);
-    }
-    clearUnusedBits();
-    return *this;
-  }
-
-  /*Return true if the value of ap_private instance is zero*/
-  INLINE bool iszero() const { return isMinValue(); }
-
-  INLINE bool to_bool() const { return !iszero(); }
-
-  /* x < 0 */
-  INLINE bool sign() const {
-    if (isNegative()) return true;
-    return false;
-  }
-
-  /* x[i] = !x[i] */
-  INLINE void invert(int i) {
-    assert(i >= 0 && "Attempting to read bit with negative index");
-    assert(i < _AP_W && "Attempting to read bit beyond MSB");
-    flip(i);
-  }
-
-  /* x[i] */
-  INLINE bool test(int i) const {
-    assert(i >= 0 && "Attempting to read bit with negative index");
-    assert(i < _AP_W && "Attempting to read bit beyond MSB");
-    return operator[](i);
-  }
-
-  // Set the ith bit into v
-  INLINE void set(int i, bool v) {
-    assert(i >= 0 && "Attempting to write bit with negative index");
-    assert(i < _AP_W && "Attempting to write bit beyond MSB");
-    v ? set(i) : clear(i);
-  }
-
-  // Set the ith bit into v
-  INLINE void set_bit(int i, bool v) {
-    assert(i >= 0 && "Attempting to write bit with negative index");
-    assert(i < _AP_W && "Attempting to write bit beyond MSB");
-    v ? set(i) : clear(i);
-  }
-
-  // FIXME different argument for different action?
-  INLINE ap_private& set(uint32_t bitPosition) {
-    pVal[whichWord(bitPosition)] |= maskBit(bitPosition);
-    clearUnusedBits();
-    return *this;
-  }
-
-  INLINE void set() {
-    for (int i = 0; i < _AP_N; ++i) pVal[i] = ~0ULL;
-    clearUnusedBits();
-  }
-
-  // Get the value of ith bit
-  INLINE bool get(int i) const {
-    assert(i >= 0 && "Attempting to read bit with negative index");
-    assert(i < _AP_W && "Attempting to read bit beyond MSB");
-    return ((maskBit(i) & (pVal[whichWord(i)])) != 0);
-  }
-
-  // Get the value of ith bit
-  INLINE bool get_bit(int i) const {
-    assert(i >= 0 && "Attempting to read bit with negative index");
-    assert(i < _AP_W && "Attempting to read bit beyond MSB");
-    return ((maskBit(i) & (pVal[whichWord(i)])) != 0);
-  }
-
-  // This is used for sc_lv and sc_bv, which is implemented by sc_uint
-  // Rotate an ap_private object n places to the left
-  INLINE void lrotate(int n) {
-    assert(n >= 0 && "Attempting to shift negative index");
-    assert(n < _AP_W && "Shift value larger than bit width");
-    operator=(shl(n) | lshr(_AP_W - n));
-  }
-
-  // This is used for sc_lv and sc_bv, which is implemented by sc_uint
-  // Rotate an ap_private object n places to the right
-  INLINE void rrotate(int n) {
-    assert(n >= 0 && "Attempting to shift negative index");
-    assert(n < _AP_W && "Shift value larger than bit width");
-    operator=(lshr(n) | shl(_AP_W - n));
-  }
-
-  /// Set the given bit to 0 whose position is given as "bitPosition".
-  /// @brief Set a given bit to 0.
-  INLINE ap_private& clear(uint32_t bitPosition) {
-    pVal[whichWord(bitPosition)] &= ~maskBit(bitPosition);
-    clearUnusedBits();
-    return *this;
-  }
-
-  /// @brief Set every bit to 0.
-  INLINE void clear() { memset(pVal, 0, _AP_N * APINT_WORD_SIZE); }
-
-  /// @brief Toggle every bit to its opposite value.
-  ap_private& flip() {
-    for (int i = 0; i < _AP_N; ++i) pVal[i] ^= ~0ULL;
-    clearUnusedBits();
-    return *this;
-  }
-
-  /// @brief Toggles a given bit to its opposite value.
-  INLINE ap_private& flip(uint32_t bitPosition) {
-    assert(bitPosition < BitWidth && "Out of the bit-width range!");
-    set_bit(bitPosition, !get_bit(bitPosition));
-    return *this;
-  }
-
-  // complements every bit
-  INLINE void b_not() { flip(); }
-
-  INLINE ap_private getLoBits(uint32_t numBits) const {
-    return ap_private_ops::lshr(ap_private_ops::shl(*this, _AP_W - numBits),
-                                _AP_W - numBits);
-  }
-
-  INLINE ap_private getHiBits(uint32_t numBits) const {
-    return ap_private_ops::lshr(*this, _AP_W - numBits);
-  }
-
-  // Binary Arithmetic
-  //-----------------------------------------------------------
-
-// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
-// INLINE ap_private operator&(
-//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
-//   return *this & a2.get();
-// }
-//
-// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
-// INLINE ap_private operator|(
-//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
-//   return *this | a2.get();
-// }
-//
-// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
-// INLINE ap_private operator^(
-//     const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) {
-//   return *this ^ a2.get();
-// }
-
-/// Arithmetic assign
-//-------------------------------------------------------------
-
-#define OP_BIN_LOGIC_ASSIGN_AP(Sym)                                            \
-  template <int _AP_W1, bool _AP_S1>                                           \
-  INLINE ap_private& operator Sym(const ap_private<_AP_W1, _AP_S1>& RHS) {     \
-    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;                      \
-    uint32_t numWords = AESL_std::min((int)_AP_N, _AP_N1);                     \
-    uint32_t i;                                                                \
-    if (_AP_W != _AP_W1)                                                       \
-      fprintf(stderr,                                                          \
-              "Warning! Bitsize mismach for ap_[u]int " #Sym " ap_[u]int.\n"); \
-    for (i = 0; i < numWords; ++i) pVal[i] Sym RHS.get_pVal(i);                \
-    if (_AP_N1 < _AP_N) {                                                      \
-      uint64_t ext = RHS.isNegative() ? ~0ULL : 0;                             \
-      for (; i < _AP_N; i++) pVal[i] Sym ext;                                  \
-    }                                                                          \
-    clearUnusedBits();                                                         \
-    return *this;                                                              \
-  }
-
-  OP_BIN_LOGIC_ASSIGN_AP(&=);
-  OP_BIN_LOGIC_ASSIGN_AP(|=);
-  OP_BIN_LOGIC_ASSIGN_AP(^=);
-#undef OP_BIN_LOGIC_ASSIGN_AP
-
-  /// Adds the RHS APint to this ap_private.
-  /// @returns this, after addition of RHS.
-  /// @brief Addition assignment operator.
-  template <int _AP_W1, bool _AP_S1>
-  INLINE ap_private& operator+=(const ap_private<_AP_W1, _AP_S1>& RHS) {
-    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
-    uint64_t RHSpVal[_AP_N1];
-    for (int i = 0; i < _AP_N1; ++i) RHSpVal[i] = RHS.get_pVal(i);
-    ap_private_ops::add(pVal, pVal, RHSpVal, _AP_N, _AP_N, _AP_N1, _AP_S,
-                        _AP_S1);
-    clearUnusedBits();
-    return *this;
-  }
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE ap_private& operator-=(const ap_private<_AP_W1, _AP_S1>& RHS) {
-    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
-    uint64_t RHSpVal[_AP_N1];
-    for (int i = 0; i < _AP_N1; ++i) RHSpVal[i] = RHS.get_pVal(i);
-    ap_private_ops::sub(pVal, pVal, RHSpVal, _AP_N, _AP_N, _AP_N1, _AP_S,
-                        _AP_S1);
-    clearUnusedBits();
-    return *this;
-  }
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE ap_private& operator*=(const ap_private<_AP_W1, _AP_S1>& RHS) {
-    // Get some bit facts about LHS and check for zero
-    uint32_t lhsBits = getActiveBits();
-    uint32_t lhsWords = !lhsBits ? 0 : whichWord(lhsBits - 1) + 1;
-    if (!lhsWords) {
-      // 0 * X ===> 0
-      return *this;
-    }
-
-    ap_private dupRHS = RHS;
-    // Get some bit facts about RHS and check for zero
-    uint32_t rhsBits = dupRHS.getActiveBits();
-    uint32_t rhsWords = !rhsBits ? 0 : whichWord(rhsBits - 1) + 1;
-    if (!rhsWords) {
-      // X * 0 ===> 0
-      clear();
-      return *this;
-    }
-
-    // Allocate space for the result
-    uint32_t destWords = rhsWords + lhsWords;
-    uint64_t* dest = (uint64_t*)malloc(destWords * sizeof(uint64_t));
-
-    // Perform the long multiply
-    ap_private_ops::mul(dest, pVal, lhsWords, dupRHS.get_pVal(), rhsWords,
-                        destWords);
-
-    // Copy result back into *this
-    clear();
-    uint32_t wordsToCopy = destWords >= _AP_N ? _AP_N : destWords;
-
-    memcpy(pVal, dest, wordsToCopy * APINT_WORD_SIZE);
-
-    uint64_t ext = (isNegative() ^ RHS.isNegative()) ? ~0ULL : 0ULL;
-    for (int i = wordsToCopy; i < _AP_N; i++) pVal[i] = ext;
-    clearUnusedBits();
-    // delete dest array and return
-    free(dest);
-    return *this;
-  }
-
-#define OP_ASSIGN_AP(Sym)                                                    \
-  template <int _AP_W2, bool _AP_S2>                                         \
-  INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \
-    *this = operator Sym(op);                                                \
-    return *this;                                                            \
-  }
-
-  OP_ASSIGN_AP(/)
-  OP_ASSIGN_AP(%)
-#undef OP_ASSIGN_AP
-
-#define OP_BIN_LOGIC_AP(Sym)                                                  \
-  template <int _AP_W1, bool _AP_S1>                                          \
-  INLINE typename RType<_AP_W1, _AP_S1>::logic operator Sym(                  \
-      const ap_private<_AP_W1, _AP_S1>& RHS) const {                          \
-    enum {                                                                    \
-      numWords = (RType<_AP_W1, _AP_S1>::logic_w + APINT_BITS_PER_WORD - 1) / \
-                 APINT_BITS_PER_WORD                                          \
-    };                                                                        \
-    typename RType<_AP_W1, _AP_S1>::logic Result;                             \
-    uint32_t i;                                                               \
-    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;                     \
-    uint32_t min_N = std::min((int)_AP_N, _AP_N1);                            \
-    uint32_t max_N = std::max((int)_AP_N, _AP_N1);                            \
-    for (i = 0; i < min_N; ++i)                                               \
-      Result.set_pVal(i, pVal[i] Sym RHS.get_pVal(i));                        \
-    if (numWords > i) {                                                       \
-      uint64_t ext = ((_AP_N < _AP_N1 && isNegative()) ||                     \
-                      (_AP_N1 < _AP_N && RHS.isNegative()))                   \
-                         ? ~0ULL                                              \
-                         : 0;                                                 \
-      if (_AP_N > _AP_N1)                                                     \
-        for (; i < max_N; i++) Result.set_pVal(i, pVal[i] Sym ext);           \
-      else                                                                    \
-        for (; i < max_N; i++) Result.set_pVal(i, RHS.get_pVal(i) Sym ext);   \
-      if (numWords > i) {                                                     \
-        uint64_t ext2 = ((_AP_N > _AP_N1 && isNegative()) ||                  \
-                         (_AP_N1 > _AP_N && RHS.isNegative()))                \
-                            ? ~0ULL                                           \
-                            : 0;                                              \
-        Result.set_pVal(i, ext Sym ext2);                                     \
-      }                                                                       \
-    }                                                                         \
-    Result.clearUnusedBits();                                                 \
-    return Result;                                                            \
-  }
-
-  OP_BIN_LOGIC_AP(|);
-  OP_BIN_LOGIC_AP(&);
-  OP_BIN_LOGIC_AP(^);
-
-#undef OP_BIN_LOGIC_AP
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE typename RType<_AP_W1, _AP_S1>::plus operator+(
-      const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    typename RType<_AP_W1, _AP_S1>::plus Result, lhs(*this), rhs(RHS);
-    const int Result_AP_N = (RType<_AP_W1, _AP_S1>::plus_w + 63) / 64;
-    ap_private_ops::add(Result.get_pVal(), lhs.get_pVal(), rhs.get_pVal(),
-                        Result_AP_N, Result_AP_N, Result_AP_N, _AP_S, _AP_S1);
-    Result.clearUnusedBits();
-    return Result;
-  }
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE typename RType<_AP_W1, _AP_S1>::minus operator-(
-      const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    typename RType<_AP_W1, _AP_S1>::minus Result, lhs(*this), rhs(RHS);
-    const int Result_AP_N = (RType<_AP_W1, _AP_S1>::minus_w + 63) / 64;
-    ap_private_ops::sub(Result.get_pVal(), lhs.get_pVal(), rhs.get_pVal(),
-                        Result_AP_N, Result_AP_N, Result_AP_N, _AP_S, _AP_S1);
-    Result.clearUnusedBits();
-    return Result;
-  }
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE typename RType<_AP_W1, _AP_S1>::mult operator*(
-      const ap_private<_AP_W1, _AP_S1>& RHS) const {
-    typename RType<_AP_W1, _AP_S1>::mult temp = *this;
-    temp *= RHS;
-    return temp;
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE typename RType<_AP_W2, _AP_S2>::div operator/(
-      const ap_private<_AP_W2, _AP_S2>& op) const {
-    ap_private<AP_MAX(_AP_W, _AP_W2), (_AP_W > _AP_W2 ? _AP_S
-               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
-        lhs = *this;
-    ap_private<AP_MAX(_AP_W, _AP_W2), (_AP_W > _AP_W2 ? _AP_S
-               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
-        rhs = op;
-    return typename RType<_AP_W2, _AP_S2>::div(
-        (_AP_S || _AP_S2) ? lhs.sdiv(rhs) : lhs.udiv(rhs));
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE typename RType<_AP_W2, _AP_S2>::mod operator%(
-      const ap_private<_AP_W2, _AP_S2>& op) const {
-    ap_private<AP_MAX(_AP_W, _AP_W2), (_AP_W > _AP_W2 ? _AP_S
-               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
-        lhs = *this;
-    ap_private<AP_MAX(_AP_W, _AP_W2), (_AP_W > _AP_W2 ? _AP_S
-               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
-        rhs = op;
-    typename RType<_AP_W2, _AP_S2>::mod res =
-        typename RType<_AP_W2, _AP_S2>::mod(_AP_S ? lhs.srem(rhs)
-                                                  : lhs.urem(rhs));
-    return res;
-  }
-
-#define OP_LEFT_SHIFT_CTYPE(TYPE, SIGNED)             \
-  INLINE ap_private operator<<(const TYPE op) const { \
-    if (op >= _AP_W) return ap_private(0);            \
-    if (SIGNED && op < 0) return *this >> (0 - op);   \
-    return shl(op);                                   \
-  }
-
-  OP_LEFT_SHIFT_CTYPE(int, true)
-  // OP_LEFT_SHIFT_CTYPE(bool, false)
-  OP_LEFT_SHIFT_CTYPE(signed char, true)
-  OP_LEFT_SHIFT_CTYPE(unsigned char, false)
-  OP_LEFT_SHIFT_CTYPE(short, true)
-  OP_LEFT_SHIFT_CTYPE(unsigned short, false)
-  OP_LEFT_SHIFT_CTYPE(unsigned int, false)
-  OP_LEFT_SHIFT_CTYPE(long, true)
-  OP_LEFT_SHIFT_CTYPE(unsigned long, false)
-  OP_LEFT_SHIFT_CTYPE(unsigned long long, false)
-  OP_LEFT_SHIFT_CTYPE(long long, true)
-#if 0
-  OP_LEFT_SHIFT_CTYPE(half, false)
-  OP_LEFT_SHIFT_CTYPE(float, false)
-  OP_LEFT_SHIFT_CTYPE(double, false)
-#endif
-#undef OP_LEFT_SHIFT_CTYPE
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE ap_private operator<<(const ap_private<_AP_W2, _AP_S2>& op2) const {
-    if (_AP_S2 == false) {
-      uint32_t sh = op2.to_uint();
-      return *this << sh;
-    } else {
-      int sh = op2.to_int();
-      return *this << sh;
-    }
-  }
-
-#define OP_RIGHT_SHIFT_CTYPE(TYPE, SIGNED)            \
-  INLINE ap_private operator>>(const TYPE op) const { \
-    if (op >= _AP_W) {                                \
-      if (isNegative())                               \
-        return ap_private(-1);                        \
-      else                                            \
-        return ap_private(0);                         \
-    }                                                 \
-    if ((SIGNED) && op < 0) return *this << (0 - op); \
-    if (_AP_S)                                        \
-      return ashr(op);                                \
-    else                                              \
-      return lshr(op);                                \
-  }
-
-  // OP_RIGHT_SHIFT_CTYPE(bool, false)
-  OP_RIGHT_SHIFT_CTYPE(char, CHAR_IS_SIGNED)
-  OP_RIGHT_SHIFT_CTYPE(signed char, true)
-  OP_RIGHT_SHIFT_CTYPE(unsigned char, false)
-  OP_RIGHT_SHIFT_CTYPE(short, true)
-  OP_RIGHT_SHIFT_CTYPE(unsigned short, false)
-  OP_RIGHT_SHIFT_CTYPE(int, true)
-  OP_RIGHT_SHIFT_CTYPE(unsigned int, false)
-  OP_RIGHT_SHIFT_CTYPE(long, true)
-  OP_RIGHT_SHIFT_CTYPE(unsigned long, false)
-  OP_RIGHT_SHIFT_CTYPE(unsigned long long, false)
-  OP_RIGHT_SHIFT_CTYPE(long long, true)
-#if 0
-  OP_RIGHT_SHIFT_CTYPE(half, false)
-  OP_RIGHT_SHIFT_CTYPE(float, false)
-  OP_RIGHT_SHIFT_CTYPE(double, false)
-#endif
-#undef OP_RIGHT_SHIFT_CTYPE
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE ap_private operator>>(const ap_private<_AP_W2, _AP_S2>& op2) const {
-    if (_AP_S2 == false) {
-      uint32_t sh = op2.to_uint();
-      return *this >> sh;
-    } else {
-      int sh = op2.to_int();
-      return *this >> sh;
-    }
-  }
-
-  /// Shift assign
-  //------------------------------------------------------------------
-  // TODO call clearUnusedBits ?
-#define OP_ASSIGN_AP(Sym)                                                    \
-  template <int _AP_W2, bool _AP_S2>                                         \
-  INLINE ap_private& operator Sym##=(int op) {                               \
-    *this = operator Sym(op);                                                \
-    return *this;                                                            \
-  }                                                                          \
-  INLINE ap_private& operator Sym##=(unsigned int op) {                      \
-    *this = operator Sym(op);                                                \
-    return *this;                                                            \
-  }                                                                          \
-  template <int _AP_W2, bool _AP_S2>                                         \
-  INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \
-    *this = operator Sym(op);                                                \
-    return *this;                                                            \
-  }
-  OP_ASSIGN_AP(>>)
-  OP_ASSIGN_AP(<<)
-#undef OP_ASSIGN_AP
-
-  /// Comparisons
-  //-----------------------------------------------------------------
-  INLINE bool operator==(const ap_private& RHS) const {
-    // Get some facts about the number of bits used in the two operands.
-    uint32_t n1 = getActiveBits();
-    uint32_t n2 = RHS.getActiveBits();
-
-    // If the number of bits isn't the same, they aren't equal
-    if (n1 != n2) return false;
-
-    // If the number of bits fits in a word, we only need to compare the low
-    // word.
-    if (n1 <= APINT_BITS_PER_WORD) return pVal[0] == RHS.get_pVal(0);
-
-    // Otherwise, compare everything
-    for (int i = whichWord(n1 - 1); i >= 0; --i)
-      if (pVal[i] != RHS.get_pVal(i)) return false;
-    return true;
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE bool operator==(const ap_private<_AP_W2, _AP_S2>& op) const {
-    enum {
-      _AP_MAX_W = AP_MAX(_AP_W, _AP_W2),
-    };
-    ap_private<_AP_MAX_W, false> lhs(*this);
-    ap_private<_AP_MAX_W, false> rhs(op);
-    return lhs == rhs;
-  }
-
-  INLINE bool operator==(uint64_t Val) const {
-    uint32_t n = getActiveBits();
-    if (n <= APINT_BITS_PER_WORD)
-      return pVal[0] == Val;
-    else
-      return false;
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE bool operator!=(const ap_private<_AP_W2, _AP_S2>& op) const {
-    return !(*this == op);
-  }
-
-  template <bool _AP_S1>
-  INLINE bool operator!=(const ap_private<_AP_W, _AP_S1>& RHS) const {
-    return !((*this) == RHS);
-  }
-
-  INLINE bool operator!=(uint64_t Val) const { return !((*this) == Val); }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE bool operator<=(const ap_private<_AP_W2, _AP_S2>& op) const {
-    return !(*this > op);
-  }
-
-  INLINE bool operator<(const ap_private& op) const {
-    return _AP_S ? slt(op) : ult(op);
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE bool operator<(const ap_private<_AP_W2, _AP_S2>& op) const {
-    enum {
-      _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2))
-    };
-    ap_private<_AP_MAX_W, _AP_S> lhs(*this);
-    ap_private<_AP_MAX_W, _AP_S2> rhs(op);
-    if (_AP_S == _AP_S2)
-      return _AP_S ? lhs.slt(rhs) : lhs.ult(rhs);
-    else if (_AP_S)
-      if (_AP_W2 >= _AP_W)
-        return lhs.ult(rhs);
-      else
-        return lhs.slt(rhs);
-    else if (_AP_W >= _AP_W2)
-      return lhs.ult(rhs);
-    else
-      return lhs.slt(rhs);
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE bool operator>=(const ap_private<_AP_W2, _AP_S2>& op) const {
-    return !(*this < op);
-  }
-
-  INLINE bool operator>(const ap_private& op) const {
-    return _AP_S ? sgt(op) : ugt(op);
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE bool operator>(const ap_private<_AP_W2, _AP_S2>& op) const {
-    enum {
-      _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2))
-    };
-    ap_private<_AP_MAX_W, _AP_S> lhs(*this);
-    ap_private<_AP_MAX_W, _AP_S2> rhs(op);
-    if (_AP_S == _AP_S2)
-      return _AP_S ? lhs.sgt(rhs) : lhs.ugt(rhs);
-    else if (_AP_S)
-      if (_AP_W2 >= _AP_W)
-        return lhs.ugt(rhs);
-      else
-        return lhs.sgt(rhs);
-    else if (_AP_W >= _AP_W2)
-      return lhs.ugt(rhs);
-    else
-      return lhs.sgt(rhs);
-  }
-
-  /// Bit and Part Select
-  //--------------------------------------------------------------
-  INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) {
-    return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo);
-  }
-
-  INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const {
-    return _private_range_ref<_AP_W, _AP_S>(
-        const_cast<ap_private<_AP_W, _AP_S>*>(this), Hi, Lo);
-  }
-
-  INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const {
-    return _private_range_ref<_AP_W, _AP_S>(
-        (const_cast<ap_private<_AP_W, _AP_S>*>(this)), Hi, Lo);
-  }
-
-  INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) {
-    return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo);
-  }
-
-  template <int _AP_W2, bool _AP_S2, int _AP_W3, bool _AP_S3>
-  INLINE _private_range_ref<_AP_W, _AP_S> range(
-      const ap_private<_AP_W2, _AP_S2>& HiIdx,
-      const ap_private<_AP_W3, _AP_S3>& LoIdx) {
-    int Hi = HiIdx.to_int();
-    int Lo = LoIdx.to_int();
-    return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo);
-  }
-
-  template <int _AP_W2, bool _AP_S2, int _AP_W3, bool _AP_S3>
-  INLINE _private_range_ref<_AP_W, _AP_S> operator()(
-      const ap_private<_AP_W2, _AP_S2>& HiIdx,
-      const ap_private<_AP_W3, _AP_S3>& LoIdx) {
-    int Hi = HiIdx.to_int();
-    int Lo = LoIdx.to_int();
-    return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo);
-  }
-
-  template <int _AP_W2, bool _AP_S2, int _AP_W3, bool _AP_S3>
-  INLINE _private_range_ref<_AP_W, _AP_S> range(
-      const ap_private<_AP_W2, _AP_S2>& HiIdx,
-      const ap_private<_AP_W3, _AP_S3>& LoIdx) const {
-    int Hi = HiIdx.to_int();
-    int Lo = LoIdx.to_int();
-    return _private_range_ref<_AP_W, _AP_S>(const_cast<ap_private*>(this), Hi, Lo);
-  }
-
-  template <int _AP_W2, bool _AP_S2, int _AP_W3, bool _AP_S3>
-  INLINE _private_range_ref<_AP_W, _AP_S> operator()(
-      const ap_private<_AP_W2, _AP_S2>& HiIdx,
-      const ap_private<_AP_W3, _AP_S3>& LoIdx) const {
-    int Hi = HiIdx.to_int();
-    int Lo = LoIdx.to_int();
-    return this->range(Hi, Lo);
-  }
-
-  INLINE _private_bit_ref<_AP_W, _AP_S> operator[](int index) {
-    return _private_bit_ref<_AP_W, _AP_S>(*this, index);
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE _private_bit_ref<_AP_W, _AP_S> operator[](
-      const ap_private<_AP_W2, _AP_S2>& index) {
-    return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int());
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](
-      const ap_private<_AP_W2, _AP_S2>& index) const {
-    return _private_bit_ref<_AP_W, _AP_S>(
-        const_cast<ap_private<_AP_W, _AP_S>&>(*this), index.to_int());
-  }
-
-  INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](int index) const {
-    return _private_bit_ref<_AP_W, _AP_S>(
-        const_cast<ap_private<_AP_W, _AP_S>&>(*this), index);
-  }
-
-  INLINE _private_bit_ref<_AP_W, _AP_S> bit(int index) {
-    return _private_bit_ref<_AP_W, _AP_S>(*this, index);
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE _private_bit_ref<_AP_W, _AP_S> bit(const ap_private<_AP_W2, _AP_S2>& index) {
-    return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int());
-  }
-
-  INLINE const _private_bit_ref<_AP_W, _AP_S> bit(int index) const {
-    return _private_bit_ref<_AP_W, _AP_S>(
-        const_cast<ap_private<_AP_W, _AP_S>&>(*this), index);
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE const _private_bit_ref<_AP_W, _AP_S> bit(
-      const ap_private<_AP_W2, _AP_S2>& index) const {
-    return _private_bit_ref<_AP_W, _AP_S>(
-        const_cast<ap_private<_AP_W, _AP_S>&>(*this), index.to_int());
-  }
-
-// template <int _AP_W2, bool _AP_S2>
-// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
-//                      ap_private<_AP_W2, _AP_S2> >
-// concat(ap_private<_AP_W2, _AP_S2>& a2) {
-//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
-//                        ap_private<_AP_W2, _AP_S2> >(*this, a2);
-// }
-//
-// template <int _AP_W2, bool _AP_S2>
-// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
-//                      ap_private<_AP_W2, _AP_S2> >
-// concat(const ap_private<_AP_W2, _AP_S2>& a2) const {
-//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
-//                        ap_private<_AP_W2, _AP_S2> >(
-//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
-//       const_cast<ap_private<_AP_W2, _AP_S2>&>(a2));
-// }
-//
-// template <int _AP_W2, bool _AP_S2>
-// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> >
-// operator,(ap_private<_AP_W2, _AP_S2> &a2) {
-//   return ap_concat_ref<_AP_W, ap_private, _AP_W2,
-//                        ap_private<_AP_W2, _AP_S2> >(*this, a2);
-// }
-//
-// template <int _AP_W2, bool _AP_S2>
-// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> >
-// operator,(ap_private<_AP_W2, _AP_S2> &a2) const {
-//   return ap_concat_ref<_AP_W, ap_private, _AP_W2,
-//                        ap_private<_AP_W2, _AP_S2> >(
-//       const_cast<ap_private<_AP_W, _AP_S>&>(*this), a2);
-// }
-//
-// template <int _AP_W2, bool _AP_S2>
-// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> >
-// operator,(const ap_private<_AP_W2, _AP_S2> &a2) {
-//   return ap_concat_ref<_AP_W, ap_private, _AP_W2,
-//                        ap_private<_AP_W2, _AP_S2> >(
-//       *this, const_cast<ap_private<_AP_W2, _AP_S2>&>(a2));
-// }
-//
-// template <int _AP_W2, bool _AP_S2>
-// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> >
-// operator,(const ap_private<_AP_W2, _AP_S2> &a2) const {
-//   return ap_concat_ref<_AP_W, ap_private, _AP_W2,
-//                        ap_private<_AP_W2, _AP_S2> >(
-//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
-//       const_cast<ap_private<_AP_W2, _AP_S2>&>(a2));
-// }
-//
-// template <int _AP_W2, bool _AP_S2>
-// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
-//                      _private_range_ref<_AP_W2, _AP_S2> >
-// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const {
-//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
-//                        _private_range_ref<_AP_W2, _AP_S2> >(
-//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
-//       const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2));
-// }
-//
-// template <int _AP_W2, bool _AP_S2>
-// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
-//                      _private_range_ref<_AP_W2, _AP_S2> >
-// operator,(_private_range_ref<_AP_W2, _AP_S2> &a2) {
-//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2,
-//                        _private_range_ref<_AP_W2, _AP_S2> >(*this, a2);
-// }
-//
-// template <int _AP_W2, bool _AP_S2>
-// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1,
-//                      _private_bit_ref<_AP_W2, _AP_S2> >
-// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) const {
-//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1,
-//                        _private_bit_ref<_AP_W2, _AP_S2> >(
-//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
-//       const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2));
-// }
-//
-// template <int _AP_W2, bool _AP_S2>
-// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1,
-//                      _private_bit_ref<_AP_W2, _AP_S2> >
-// operator,(_private_bit_ref<_AP_W2, _AP_S2> &a2) {
-//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1,
-//                        _private_bit_ref<_AP_W2, _AP_S2> >(*this, a2);
-// }
-//
-// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
-// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3,
-//                      ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >
-// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const {
-//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3,
-//                        ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(
-//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
-//       const_cast<ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>&>(a2));
-// }
-//
-// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
-// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3,
-//                      ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >
-// operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) {
-//   return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3,
-//                        ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this,
-//                                                                       a2);
-// }
-//
-// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
-//           ap_o_mode _AP_O2, int _AP_N2>
-// INLINE ap_concat_ref<
-//     _AP_W, ap_private, _AP_W2,
-//     af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
-// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>
-//               &a2) const {
-//   return ap_concat_ref<
-//       _AP_W, ap_private, _AP_W2,
-//       af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(
-//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
-//       const_cast<
-//           af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2));
-// }
-//
-// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
-//           ap_o_mode _AP_O2, int _AP_N2>
-// INLINE ap_concat_ref<
-//     _AP_W, ap_private, _AP_W2,
-//     af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
-// operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) {
-//   return ap_concat_ref<
-//       _AP_W, ap_private, _AP_W2,
-//       af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this,
-//                                                                      a2);
-// }
-//
-// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
-//           ap_o_mode _AP_O2, int _AP_N2>
-// INLINE
-// ap_concat_ref<_AP_W, ap_private, 1,
-//               af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
-// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>
-//               &a2) const {
-//   return ap_concat_ref<
-//       _AP_W, ap_private, 1,
-//       af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(
-//       const_cast<ap_private<_AP_W, _AP_S>&>(*this),
-//       const_cast<af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(
-//           a2));
-// }
-//
-// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
-//           ap_o_mode _AP_O2, int _AP_N2>
-// INLINE
-// ap_concat_ref<_AP_W, ap_private, 1,
-//               af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >
-// operator,(
-//     af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) {
-//   return ap_concat_ref<
-//       _AP_W, ap_private, 1,
-//       af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2);
-// }
-
-  INLINE ap_private<_AP_W, false> get() const {
-    ap_private<_AP_W, false> ret(*this);
-    return ret;
-  }
-
-  template <int _AP_W3>
-  INLINE void set(const ap_private<_AP_W3, false>& val) {
-    operator=(ap_private<_AP_W3, _AP_S>(val));
-  }
-
-  ///
-  /// @name Value Tests
-  ///
-  /// This tests the high bit of this ap_private to determine if it is set.
-  /// @returns true if this ap_private is negative, false otherwise
-  /// @brief Determine sign of this ap_private.
-  INLINE bool isNegative() const {
-    // just for get rid of warnings
-    enum { shift = (_AP_W - APINT_BITS_PER_WORD * (_AP_N - 1) - 1) };
-    static const uint64_t mask = 1ULL << (shift);
-    return _AP_S && (pVal[_AP_N - 1] & mask);
-  }
-
-  /// This tests the high bit of the ap_private to determine if it is unset.
-  /// @brief Determine if this ap_private Value is positive (not negative).
-  INLINE bool isPositive() const { return !isNegative(); }
-
-  /// This tests if the value of this ap_private is strictly positive (> 0).
-  /// @returns true if this ap_private is Positive and not zero.
-  /// @brief Determine if this ap_private Value is strictly positive.
-  INLINE bool isStrictlyPositive() const {
-    return isPositive() && (*this) != 0;
-  }
-
-  /// This checks to see if the value has all bits of the ap_private are set or
-  /// not.
-  /// @brief Determine if all bits are set
-  INLINE bool isAllOnesValue() const { return countPopulation() == _AP_W; }
-
-  /// This checks to see if the value of this ap_private is the maximum unsigned
-  /// value for the ap_private's bit width.
-  /// @brief Determine if this is the largest unsigned value.
-  INLINE bool isMaxValue() const { return countPopulation() == _AP_W; }
-
-  /// This checks to see if the value of this ap_private is the maximum signed
-  /// value for the ap_private's bit width.
-  /// @brief Determine if this is the largest signed value.
-  INLINE bool isMaxSignedValue() const {
-    return !isNegative() && countPopulation() == _AP_W - 1;
-  }
-
-  /// This checks to see if the value of this ap_private is the minimum unsigned
-  /// value for the ap_private's bit width.
-  /// @brief Determine if this is the smallest unsigned value.
-  INLINE bool isMinValue() const { return countPopulation() == 0; }
-
-  /// This checks to see if the value of this ap_private is the minimum signed
-  /// value for the ap_private's bit width.
-  /// @brief Determine if this is the smallest signed value.
-  INLINE bool isMinSignedValue() const {
-    return isNegative() && countPopulation() == 1;
-  }
-
-  /// This function returns a pointer to the internal storage of the ap_private.
-  /// This is useful for writing out the ap_private in binary form without any
-  /// conversions.
-  INLINE const uint64_t* getRawData() const { return &pVal[0]; }
-
-  // Square Root - this method computes and returns the square root of "this".
-  // Three mechanisms are used for computation. For small values (<= 5 bits),
-  // a table lookup is done. This gets some performance for common cases. For
-  // values using less than 52 bits, the value is converted to double and then
-  // the libc sqrt function is called. The result is rounded and then converted
-  // back to a uint64_t which is then used to construct the result. Finally,
-  // the Babylonian method for computing square roots is used.
-  INLINE ap_private sqrt() const {
-    // Determine the magnitude of the value.
-    uint32_t magnitude = getActiveBits();
-
-    // Use a fast table for some small values. This also gets rid of some
-    // rounding errors in libc sqrt for small values.
-    if (magnitude <= 5) {
-      static const uint8_t results[32] = {
-          /*     0 */ 0,
-          /*  1- 2 */ 1, 1,
-          /*  3- 6 */ 2, 2, 2, 2,
-          /*  7-12 */ 3, 3, 3, 3, 3, 3,
-          /* 13-20 */ 4, 4, 4, 4, 4, 4, 4, 4,
-          /* 21-30 */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-          /*    31 */ 6};
-      return ap_private<_AP_W, _AP_S>(/*BitWidth,*/ results[get_VAL()]);
-    }
-
-    // If the magnitude of the value fits in less than 52 bits (the precision of
-    // an IEEE double precision floating point value), then we can use the
-    // libc sqrt function which will probably use a hardware sqrt computation.
-    // This should be faster than the algorithm below.
-    if (magnitude < 52) {
-#ifdef _MSC_VER
-      // Amazingly, VC++ doesn't have round().
-      return ap_private<_AP_W, _AP_S>(/*BitWidth,*/
-                                      uint64_t(::sqrt(double(get_VAL()))) +
-                                          0.5);
-#else
-      return ap_private<_AP_W, _AP_S>(/*BitWidth,*/
-                                      uint64_t(
-                                          ::round(::sqrt(double(get_VAL())))));
-#endif
-    }
-
-    // Okay, all the short cuts are exhausted. We must compute it. The following
-    // is a classical Babylonian method for computing the square root. This code
-    // was adapted to APINt from a wikipedia article on such computations.
-    // See http://www.wikipedia.org/ and go to the page named
-    // Calculate_an_integer_square_root.
-    uint32_t nbits = BitWidth, i = 4;
-    ap_private<_AP_W, _AP_S> testy(16);
-    ap_private<_AP_W, _AP_S> x_old(/*BitWidth,*/ 1);
-    ap_private<_AP_W, _AP_S> x_new(0);
-    ap_private<_AP_W, _AP_S> two(/*BitWidth,*/ 2);
-
-    // Select a good starting value using binary logarithms.
-    for (;; i += 2, testy = testy.shl(2))
-      if (i >= nbits || this->ule(testy)) {
-        x_old = x_old.shl(i / 2);
-        break;
-      }
-
-    // Use the Babylonian method to arrive at the integer square root:
-    for (;;) {
-      x_new = (this->udiv(x_old) + x_old).udiv(two);
-      if (x_old.ule(x_new)) break;
-      x_old = x_new;
-    }
-
-    // Make sure we return the closest approximation
-    // NOTE: The rounding calculation below is correct. It will produce an
-    // off-by-one discrepancy with results from pari/gp. That discrepancy has
-    // been
-    // determined to be a rounding issue with pari/gp as it begins to use a
-    // floating point representation after 192 bits. There are no discrepancies
-    // between this algorithm and pari/gp for bit widths < 192 bits.
-    ap_private<_AP_W, _AP_S> square(x_old * x_old);
-    ap_private<_AP_W, _AP_S> nextSquare((x_old + 1) * (x_old + 1));
-    if (this->ult(square))
-      return x_old;
-    else if (this->ule(nextSquare)) {
-      ap_private<_AP_W, _AP_S> midpoint((nextSquare - square).udiv(two));
-      ap_private<_AP_W, _AP_S> offset(*this - square);
-      if (offset.ult(midpoint))
-        return x_old;
-      else
-        return x_old + 1;
-    } else
-      assert(0 && "Error in ap_private<_AP_W, _AP_S>::sqrt computation");
-    return x_old + 1;
-  }
-
-  ///
-  /// @Assignment Operators
-  ///
-  /// @returns *this after assignment of RHS.
-  /// @brief Copy assignment operator.
-  INLINE ap_private& operator=(const ap_private& RHS) {
-    if (this != &RHS) memcpy(pVal, RHS.get_pVal(), _AP_N * APINT_WORD_SIZE);
-    clearUnusedBits();
-    return *this;
-  }
-  INLINE ap_private& operator=(const volatile ap_private& RHS) {
-    if (this != &RHS)
-      for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i);
-    clearUnusedBits();
-    return *this;
-  }
-  INLINE void operator=(const ap_private& RHS) volatile {
-    if (this != &RHS)
-      for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i);
-    clearUnusedBits();
-  }
-  INLINE void operator=(const volatile ap_private& RHS) volatile {
-    if (this != &RHS)
-      for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i);
-    clearUnusedBits();
-  }
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE ap_private& operator=(const ap_private<_AP_W1, _AP_S1>& RHS) {
-    if (_AP_S1)
-      cpSextOrTrunc(RHS);
-    else
-      cpZextOrTrunc(RHS);
-    clearUnusedBits();
-    return *this;
-  }
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE ap_private& operator=(const volatile ap_private<_AP_W1, _AP_S1>& RHS) {
-    if (_AP_S1)
-      cpSextOrTrunc(RHS);
-    else
-      cpZextOrTrunc(RHS);
-    clearUnusedBits();
-    return *this;
-  }
-
-  template <int _AP_W2, bool _AP_S2>
-  INLINE ap_private& operator=(const _private_range_ref<_AP_W2, _AP_S2>& op2) {
-    *this = ap_private<_AP_W2, false>(op2);
-    return *this;
-  }
-
-#if 0
-  template <int _AP_W1, bool _AP_S1>
-  INLINE ap_private& operator=(const ap_private<_AP_W1, _AP_S1, true>& RHS) {
-    static const uint64_t that_sign_ext_mask = (_AP_W1==APINT_BITS_PER_WORD)?0:~0ULL>>(_AP_W1%APINT_BITS_PER_WORD)<<(_AP_W1%APINT_BITS_PER_WORD);
-    if (RHS.isNegative()) {
-      pVal[0] = RHS.get_VAL() | that_sign_ext_mask;
-      memset(pVal+1,~0, APINT_WORD_SIZE*(_AP_N-1));
-    } else {
-      pVal[0] = RHS.get_VAL();
-      memset(pVal+1, 0, APINT_WORD_SIZE*(_AP_N-1));
-    }
-    clearUnusedBits();
-    return *this;
-  }
-
-  template <int _AP_W1, bool _AP_S1>
-  INLINE ap_private& operator=(const volatile ap_private<_AP_W1, _AP_S1, true>& RHS) {
-    static const uint64_t that_sign_ext_mask = (_AP_W1==APINT_BITS_PER_WORD)?0:~0ULL>>(_AP_W1%APINT_BITS_PER_WORD)<<(_AP_W1%APINT_BITS_PER_WORD);
-    if (RHS.isNegative()) {
-      pVal[0] = RHS.get_VAL() | that_sign_ext_mask;
-      memset(pVal+1,~0, APINT_WORD_SIZE*(_AP_N-1));
-    } else {
-      pVal[0] = RHS.get_VAL();
-      memset(pVal+1, 0, APINT_WORD_SIZE*(_AP_N-1));
-    }
-    clearUnusedBits();
-    return *this;
-  }
-#endif
-
-/// from all c types.
-#define ASSIGN_OP_FROM_INT(C_TYPE, _AP_W2, _AP_S2) \
-  INLINE ap_private& operator=(const C_TYPE rhs) { \
-    ap_private<(_AP_W2), (_AP_S2)> tmp = rhs;      \
-    operator=(tmp);                                \
-    return *this;                                  \
-  }
-
-  ASSIGN_OP_FROM_INT(bool, 1, false)
-  ASSIGN_OP_FROM_INT(char, 8, CHAR_IS_SIGNED)
-  ASSIGN_OP_FROM_INT(signed char, 8, true)
-  ASSIGN_OP_FROM_INT(unsigned char, 8, false)
-  ASSIGN_OP_FROM_INT(short, sizeof(short) * 8, true)
-  ASSIGN_OP_FROM_INT(unsigned short, sizeof(unsigned short) * 8, false)
-  ASSIGN_OP_FROM_INT(int, sizeof(int) * 8, true)
-  ASSIGN_OP_FROM_INT(unsigned int, sizeof(unsigned int) * 8, false)
-  ASSIGN_OP_FROM_INT(long, sizeof(long) * 8, true)
-  ASSIGN_OP_FROM_INT(unsigned long, sizeof(unsigned long) * 8, false)
-  ASSIGN_OP_FROM_INT(ap_slong, sizeof(ap_slong) * 8, true)
-  ASSIGN_OP_FROM_INT(ap_ulong, sizeof(ap_ulong) * 8, false)
-#undef ASSIGN_OP_FROM_INT
-
-  /// from c string.
-  // XXX this is a must, to prevent pointer being converted to bool.
-  INLINE ap_private& operator=(const char* s) {
-    ap_private tmp(s); // XXX direct initialization, as ctor is explicit.
-    operator=(tmp);
-    return *this;
-  }
-
-  ///
-  /// @name Unary Operators
-  ///
-  /// @returns a new ap_private value representing *this incremented by one
-  /// @brief Postfix increment operator.
-  INLINE const ap_private operator++(int) {
-    ap_private API(*this);
-    ++(*this);
-    return API;
-  }
-
-  /// @returns *this incremented by one
-  /// @brief Prefix increment operator.
-  INLINE ap_private& operator++() {
-    ap_private_ops::add_1(pVal, pVal, _AP_N, 1);
-    clearUnusedBits();
-    return *this;
-  }
-
-  /// @returns a new ap_private representing *this decremented by one.
-  /// @brief Postfix decrement operator.
-  INLINE const ap_private operator--(int) {
-    ap_private API(*this);
-    --(*this);
-    return API;
-  }
-
-  /// @returns *this decremented by one.
-  /// @brief Prefix decrement operator.
-  INLINE ap_private& operator--() {
-    ap_private_ops::sub_1(pVal, _AP_N, 1);
-    clearUnusedBits();
-    return *this;
-  }
-
-  /// Performs a bitwise complement operation on this ap_private.
-  /// @returns an ap_private that is the bitwise complement of *this
-  /// @brief Unary bitwise complement operator.
-  INLINE ap_private<_AP_W + !_AP_S, true> operator~() const {
-    ap_private<_AP_W + !_AP_S, true> Result(*this);
-    Result.flip();
-    return Result;
-  }
-
-  /// Negates *this using two's complement logic.
-  /// @returns An ap_private value representing the negation of *this.
-  /// @brief Unary negation operator
-  INLINE typename RType<1, false>::minus operator-() const {
-    return ap_private<1, false>(0) - (*this);
-  }
-
-  /// Performs logical negation operation on this ap_private.
-  /// @returns true if *this is zero, false otherwise.
-  /// @brief Logical negation operator.
-  INLINE bool operator!() const {
-    for (int i = 0; i < _AP_N; ++i)
-      if (pVal[i]) return false;
-    return true;
-  }
-
-  template <bool _AP_S1>
-  INLINE ap_private<_AP_W, _AP_S || _AP_S1> And(
-      const ap_private<_AP_W, _AP_S1>& RHS) const {
-    return this->operator&(RHS);
-  }
-  template <bool _AP_S1>
-  INLINE ap_private Or(const ap_private<_AP_W, _AP_S1>& RHS) const {
-    return this->operator|(RHS);
-  }
-  template <bool _AP_S1>
-  INLINE ap_private Xor(const ap_private<_AP_W, _AP_S1>& RHS) const {
-    return this->operator^(RHS);
-  }
-
-  INLINE ap_private Mul(const ap_private& RHS) const {
-    ap_private Result(*this);
-    Result *= RHS;
-    return Result;
-  }
-
-  INLINE ap_private Add(const ap_private& RHS) const {
-    ap_private Result(0);
-    ap_private_ops::add(Result.get_pVal(), pVal, RHS.get_pVal(), _AP_N, _AP_N,
-                        _AP_N, _AP_S, _AP_S);
-    Result.clearUnusedBits();
-    return Result;
-  }
-
-  INLINE ap_private Sub(const ap_private& RHS) const {
-    ap_private Result(0);
-    ap_private_ops::sub(Result.get_pVal(), pVal, RHS.get_pVal(), _AP_N, _AP_N,
-                        _AP_N, _AP_S, _AP_S);
-    Result.clearUnusedBits();
-    return Result;
-  }
-
-  /// Arithmetic right-shift this ap_private by shiftAmt.
-  /// @brief Arithmetic right-shift function.
-  INLINE ap_private ashr(uint32_t shiftAmt) const {
-    assert(shiftAmt <= BitWidth && "Invalid shift amount, too big");
-    // Handle a degenerate case
-    if (shiftAmt == 0) return ap_private(*this);
-
-    // If all the bits were shifted out, the result is, technically, undefined.
-    // We return -1 if it was negative, 0 otherwise. We check this early to
-    // avoid
-    // issues in the algorithm below.
-    if (shiftAmt == BitWidth) {
-      if (isNegative())
-        return ap_private(-1);
-      else
-        return ap_private(0);
-    }
-
-    // Create some space for the result.
-    ap_private Retval(0);
-    uint64_t* val = Retval.get_pVal();
-
-    // Compute some values needed by the following shift algorithms
-    uint32_t wordShift =
-        shiftAmt % APINT_BITS_PER_WORD; // bits to shift per word
-    uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; // word offset for shift
-    uint32_t breakWord = _AP_N - 1 - offset;          // last word affected
-    uint32_t bitsInWord = whichBit(BitWidth); // how many bits in last word?
-    if (bitsInWord == 0) bitsInWord = APINT_BITS_PER_WORD;
-
-    // If we are shifting whole words, just move whole words
-    if (wordShift == 0) {
-      // Move the words containing significant bits
-      for (uint32_t i = 0; i <= breakWord; ++i)
-        val[i] = pVal[i + offset]; // move whole word
-
-      // Adjust the top significant word for sign bit fill, if negative
-      if (isNegative())
-        if (bitsInWord < APINT_BITS_PER_WORD)
-          val[breakWord] |= ~0ULL << (bitsInWord); // set high bits
-    } else {
-      // Shift the low order words
-      for (uint32_t i = 0; i < breakWord; ++i) {
-        // This combines the shifted corresponding word with the low bits from
-        // the next word (shifted into this word's high bits).
-        val[i] = ((pVal[i + offset]) >> (wordShift));
-        val[i] |= ((pVal[i + offset + 1]) << (APINT_BITS_PER_WORD - wordShift));
-      }
-
-      // Shift the break word. In this case there are no bits from the next word
-      // to include in this word.
-      val[breakWord] = (pVal[breakWord + offset]) >> (wordShift);
-
-      // Deal with sign extenstion in the break word, and possibly the word
-      // before
-      // it.
-      if (isNegative()) {
-        if (wordShift > bitsInWord) {
-          if (breakWord > 0)
-            val[breakWord - 1] |=
-                ~0ULL << (APINT_BITS_PER_WORD - (wordShift - bitsInWord));
-          val[breakWord] |= ~0ULL;
-        } else
-          val[breakWord] |= (~0ULL << (bitsInWord - wordShift));
-      }
-    }
-
-    // Remaining words are 0 or -1, just assign them.
-    uint64_t fillValue = (isNegative() ? ~0ULL : 0);
-    for (int i = breakWord + 1; i < _AP_N; ++i) val[i] = fillValue;
-    Retval.clearUnusedBits();
-    return Retval;
-  }
-
-  /// Logical right-shift this ap_private by shiftAmt.
-  /// @brief Logical right-shift function.
-  INLINE ap_private lshr(uint32_t shiftAmt) const {
-    // If all the bits were shifted out, the result is 0. This avoids issues
-    // with shifting by the size of the integer type, which produces undefined
-    // results. We define these "undefined results" to always be 0.
-    if (shiftAmt == BitWidth) return ap_private(0);
-
-    // If none of the bits are shifted out, the result is *this. This avoids
-    // issues with shifting byt he size of the integer type, which produces
-    // undefined results in the code below. This is also an optimization.
-    if (shiftAmt == 0) return ap_private(*this);
-
-    // Create some space for the result.
-    ap_private Retval(0);
-    uint64_t* val = Retval.get_pVal();
-
-    // If we are shifting less than a word, compute the shift with a simple
-    // carry
-    if (shiftAmt < APINT_BITS_PER_WORD) {
-      uint64_t carry = 0;
-      for (int i = _AP_N - 1; i >= 0; --i) {
-        val[i] = ((pVal[i]) >> (shiftAmt)) | carry;
-        carry = (pVal[i]) << (APINT_BITS_PER_WORD - shiftAmt);
-      }
-      Retval.clearUnusedBits();
-      return Retval;
-    }
-
-    // Compute some values needed by the remaining shift algorithms
-    uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD;
-    uint32_t offset = shiftAmt / APINT_BITS_PER_WORD;
-
-    // If we are shifting whole words, just move whole words
-    if (wordShift == 0) {
-      for (uint32_t i = 0; i < _AP_N - offset; ++i) val[i] = pVal[i + offset];
-      for (uint32_t i = _AP_N - offset; i < _AP_N; i++) val[i] = 0;
-      Retval.clearUnusedBits();
-      return Retval;
-    }
-
-    // Shift the low order words
-    uint32_t breakWord = _AP_N - offset - 1;
-    for (uint32_t i = 0; i < breakWord; ++i)
-      val[i] = ((pVal[i + offset]) >> (wordShift)) |
-               ((pVal[i + offset + 1]) << (APINT_BITS_PER_WORD - wordShift));
-    // Shift the break word.
-    val[breakWord] = (pVal[breakWord + offset]) >> (wordShift);
-
-    // Remaining words are 0
-    for (int i = breakWord + 1; i < _AP_N; ++i) val[i] = 0;
-    Retval.clearUnusedBits();
-    return Retval;
-  }
-
-  /// Left-shift this ap_private by shiftAmt.
-  /// @brief Left-shift function.
-  INLINE ap_private shl(uint32_t shiftAmt) const {
-    assert(shiftAmt <= BitWidth && "Invalid shift amount, too big");
-    // If all the bits were shifted out, the result is 0. This avoids issues
-    // with shifting by the size of the integer type, which produces undefined
-    // results. We define these "undefined results" to always be 0.
-    if (shiftAmt == BitWidth) return ap_private(0);
-
-    // If none of the bits are shifted out, the result is *this. This avoids a
-    // lshr by the words size in the loop below which can produce incorrect
-    // results. It also avoids the expensive computation below for a common
-    // case.
-    if (shiftAmt == 0) return ap_private(*this);
-
-    // Create some space for the result.
-    ap_private Retval(0);
-    uint64_t* val = Retval.get_pVal();
-    // If we are shifting less than a word, do it the easy way
-    if (shiftAmt < APINT_BITS_PER_WORD) {
-      uint64_t carry = 0;
-      for (int i = 0; i < _AP_N; i++) {
-        val[i] = ((pVal[i]) << (shiftAmt)) | carry;
-        carry = (pVal[i]) >> (APINT_BITS_PER_WORD - shiftAmt);
-      }
-      Retval.clearUnusedBits();
-      return Retval;
-    }
-
-    // Compute some values needed by the remaining shift algorithms
-    uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD;
-    uint32_t offset = shiftAmt / APINT_BITS_PER_WORD;
-
-    // If we are shifting whole words, just move whole words
-    if (wordShift == 0) {
-      for (uint32_t i = 0; i < offset; i++) val[i] = 0;
-      for (int i = offset; i < _AP_N; i++) val[i] = pVal[i - offset];
-      Retval.clearUnusedBits();
-      return Retval;
-    }
-
-    // Copy whole words from this to Result.
-    uint32_t i = _AP_N - 1;
-    for (; i > offset; --i)
-      val[i] = (pVal[i - offset]) << (wordShift) |
-               (pVal[i - offset - 1]) >> (APINT_BITS_PER_WORD - wordShift);
-    val[offset] = (pVal[0]) << (wordShift);
-    for (i = 0; i < offset; ++i) val[i] = 0;
-    Retval.clearUnusedBits();
-    return Retval;
-  }
-
-  INLINE ap_private rotl(uint32_t rotateAmt) const {
-    if (rotateAmt == 0) return ap_private(*this);
-    // Don't get too fancy, just use existing shift/or facilities
-    ap_private hi(*this);
-    ap_private lo(*this);
-    hi.shl(rotateAmt);
-    lo.lshr(BitWidth - rotateAmt);
-    return hi | lo;
-  }
-
-  INLINE ap_private rotr(uint32_t rotateAmt) const {
-    if (rotateAmt == 0) return ap_private(*this);
-    // Don't get too fancy, just use existing shift/or facilities
-    ap_private hi(*this);
-    ap_private lo(*this);
-    lo.lshr(rotateAmt);
-    hi.shl(BitWidth - rotateAmt);
-    return hi | lo;
-  }
-
-  /// Perform an unsigned divide operation on this ap_private by RHS. Both this
-  /// and
-  /// RHS are treated as unsigned quantities for purposes of this division.
-  /// @returns a new ap_private value containing the division result
-  /// @brief Unsigned division operation.
-  INLINE ap_private udiv(const ap_private& RHS) const {
-    // Get some facts about the LHS and RHS number of bits and words
-    uint32_t rhsBits = RHS.getActiveBits();
-    uint32_t rhsWords = !rhsBits ? 0 : (whichWord(rhsBits - 1) + 1);
-    assert(rhsWords && "Divided by zero???");
-    uint32_t lhsBits = this->getActiveBits();
-    uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1);
-
-    // Deal with some degenerate cases
-    if (!lhsWords)
-      // 0 / X ===> 0
-      return ap_private(0);
-    else if (lhsWords < rhsWords || this->ult(RHS)) {
-      // X / Y ===> 0, iff X < Y
-      return ap_private(0);
-    } else if (*this == RHS) {
-      // X / X ===> 1
-      return ap_private(1);
-    } else if (lhsWords == 1 && rhsWords == 1) {
-      // All high words are zero, just use native divide
-      return ap_private(this->pVal[0] / RHS.get_pVal(0));
-    }
-
-    // We have to compute it the hard way. Invoke the Knuth divide algorithm.
-    ap_private Quotient(0); // to hold result.
-    ap_private_ops::divide(*this, lhsWords, RHS, rhsWords, &Quotient,
-                           (ap_private*)0);
-    return Quotient;
-  }
-
-  /// Signed divide this ap_private by ap_private RHS.
-  /// @brief Signed division function for ap_private.
-  INLINE ap_private sdiv(const ap_private& RHS) const {
-    if (isNegative())
-      if (RHS.isNegative())
-        return (-(*this)).udiv(-RHS);
-      else
-        return -((-(*this)).udiv(RHS));
-    else if (RHS.isNegative())
-      return -(this->udiv((ap_private)(-RHS)));
-    return this->udiv(RHS);
-  }
-
-  /// Perform an unsigned remainder operation on this ap_private with RHS being
-  /// the
-  /// divisor. Both this and RHS are treated as unsigned quantities for purposes
-  /// of this operation. Note that this is a true remainder operation and not
-  /// a modulo operation because the sign follows the sign of the dividend
-  /// which is *this.
-  /// @returns a new ap_private value containing the remainder result
-  /// @brief Unsigned remainder operation.
-  INLINE ap_private urem(const ap_private& RHS) const {
-    // Get some facts about the LHS
-    uint32_t lhsBits = getActiveBits();
-    uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1);
-
-    // Get some facts about the RHS
-    uint32_t rhsBits = RHS.getActiveBits();
-    uint32_t rhsWords = !rhsBits ? 0 : (whichWord(rhsBits - 1) + 1);
-    assert(rhsWords && "Performing remainder operation by zero ???");
-
-    // Check the degenerate cases
-    if (lhsWords == 0) {
-      // 0 % Y ===> 0
-      return ap_private(0);
-    } else if (lhsWords < rhsWords || this->ult(RHS)) {
-      // X % Y ===> X, iff X < Y
-      return *this;
-    } else if (*this == RHS) {
-      // X % X == 0;
-      return ap_private(0);
-    } else if (lhsWords == 1) {
-      // All high words are zero, just use native remainder
-      return ap_private(pVal[0] % RHS.get_pVal(0));
-    }
-
-    // We have to compute it the hard way. Invoke the Knuth divide algorithm.
-    ap_private Remainder(0);
-    ap_private_ops::divide(*this, lhsWords, RHS, rhsWords, (ap_private*)(0),
-                           &Remainder);
-    return Remainder;
-  }
-
-  INLINE ap_private urem(uint64_t RHS) const {
-    // Get some facts about the LHS
-    uint32_t lhsBits = getActiveBits();
-    uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1);
-    // Get some facts about the RHS
-    uint32_t rhsWords = 1; //! rhsBits ? 0 : (ap_private<_AP_W,
-                           //! _AP_S>::whichWord(rhsBits - 1) + 1);
-    assert(rhsWords && "Performing remainder operation by zero ???");
-    // Check the degenerate cases
-    if (lhsWords == 0) {
-      // 0 % Y ===> 0
-      return ap_private(0);
-    } else if (lhsWords < rhsWords || this->ult(RHS)) {
-      // X % Y ===> X, iff X < Y
-      return *this;
-    } else if (*this == RHS) {
-      // X % X == 0;
-      return ap_private(0);
-    } else if (lhsWords == 1) {
-      // All high words are zero, just use native remainder
-      return ap_private(pVal[0] % RHS);
-    }
-
-    // We have to compute it the hard way. Invoke the Knuth divide algorithm.
-    ap_private Remainder(0);
-    divide(*this, lhsWords, RHS, (ap_private*)(0), &Remainder);
-    return Remainder;
-  }
-
-  /// Signed remainder operation on ap_private.
-  /// @brief Function for signed remainder operation.
-  INLINE ap_private srem(const ap_private& RHS) const {
-    if (isNegative()) {
-      ap_private lhs = -(*this);
-      if (RHS.isNegative()) {
-        ap_private rhs = -RHS;
-        return -(lhs.urem(rhs));
-      } else
-        return -(lhs.urem(RHS));
-    } else if (RHS.isNegative()) {
-      ap_private rhs = -RHS;
-      return this->urem(rhs);
-    }
-    return this->urem(RHS);
-  }
-
-  /// Signed remainder operation on ap_private.
-  /// @brief Function for signed remainder operation.
-  INLINE ap_private srem(int64_t RHS) const {
-    if (isNegative())
-      if (RHS < 0)
-        return -((-(*this)).urem(-RHS));
-      else
-        return -((-(*this)).urem(RHS));
-    else if (RHS < 0)
-      return this->urem(-RHS);
-    return this->urem(RHS);
-  }
-
-  /// Compares this ap_private with RHS for the validity of the equality
-  /// relationship.
-  /// @returns true if *this == Val
-  /// @brief Equality comparison.
-  template <bool _AP_S1>
-  INLINE bool eq(const ap_private<_AP_W, _AP_S1>& RHS) const {
-    return (*this) == RHS;
-  }
-
-  /// Compares this ap_private with RHS for the validity of the inequality
-  /// relationship.
- /// @returns true if *this != Val
- /// @brief Inequality comparison
- template <bool _AP_S1>
- INLINE bool ne(const ap_private<_AP_W, _AP_S1>& RHS) const {
-   return !((*this) == RHS);
- }
-
- /// Regards both *this and RHS as unsigned quantities and compares them for
- /// the validity of the less-than relationship.
- /// @returns true if *this < RHS when both are considered unsigned.
- /// @brief Unsigned less than comparison
- template <bool _AP_S1>
- INLINE bool ult(const ap_private<_AP_W, _AP_S1>& RHS) const {
-   // Get active bit length of both operands
-   uint32_t n1 = getActiveBits();
-   uint32_t n2 = RHS.getActiveBits();
-
-   // If magnitude of LHS is less than RHS, return true.
-   if (n1 < n2) return true;
-
-   // If magnitude of RHS is greater than LHS, return false.
-   if (n2 < n1) return false;
-
-   // If they both fit in a word, just compare the low order word
-   if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD)
-     return pVal[0] < RHS.get_pVal(0);
-
-   // Otherwise, compare all words
-   uint32_t topWord = whichWord(AESL_std::max(n1, n2) - 1);
-   for (int i = topWord; i >= 0; --i) {
-     if (pVal[i] > RHS.get_pVal(i)) return false;
-     if (pVal[i] < RHS.get_pVal(i)) return true;
-   }
-   return false;
- }
-
- INLINE bool ult(uint64_t RHS) const {
-   // Get active bit length of both operands
-   uint32_t n1 = getActiveBits();
-   uint32_t n2 =
-       64 - ap_private_ops::CountLeadingZeros_64(RHS); // RHS.getActiveBits();
-
-   // If magnitude of LHS is less than RHS, return true.
-   if (n1 < n2) return true;
-
-   // If magnitude of RHS is greater than LHS, return false.
-   if (n2 < n1) return false;
-
-   // If they both fit in a word, just compare the low order word
-   if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD)
-     return pVal[0] < RHS;
-   assert(0);
- }
-
- template <bool _AP_S1>
- INLINE bool slt(const ap_private<_AP_W, _AP_S1>& RHS) const {
-   ap_private lhs(*this);
-   ap_private<_AP_W, _AP_S1> rhs(RHS);
-   bool lhsNeg = isNegative();
-   bool rhsNeg = rhs.isNegative();
-   if (lhsNeg) {
-     // Sign bit is set so perform two's complement to make it positive
-     lhs.flip();
-     lhs++;
-   }
-   if (rhsNeg) {
-     // Sign bit is set so perform two's complement to make it positive
-     rhs.flip();
-     rhs++;
-   }
-
-   // Now we have unsigned values to compare so do the comparison if necessary
-   // based on the negativeness of the values.
-   if (lhsNeg)
-     if (rhsNeg)
-       return lhs.ugt(rhs);
-     else
-       return true;
-   else if (rhsNeg)
-     return false;
-   else
-     return lhs.ult(rhs);
- }
-
- /// Regards both *this and RHS as unsigned quantities and compares them for
- /// validity of the less-or-equal relationship.
- /// @returns true if *this <= RHS when both are considered unsigned.
- /// @brief Unsigned less or equal comparison
- template <bool _AP_S1>
- INLINE bool ule(const ap_private<_AP_W, _AP_S1>& RHS) const {
-   return ult(RHS) || eq(RHS);
- }
-
- /// Regards both *this and RHS as signed quantities and compares them for
- /// validity of the less-or-equal relationship.
- /// @returns true if *this <= RHS when both are considered signed.
- /// @brief Signed less or equal comparison
- template <bool _AP_S1>
- INLINE bool sle(const ap_private<_AP_W, _AP_S1>& RHS) const {
-   return slt(RHS) || eq(RHS);
- }
-
- /// Regards both *this and RHS as unsigned quantities and compares them for
- /// the validity of the greater-than relationship.
- /// @returns true if *this > RHS when both are considered unsigned.
- /// @brief Unsigned greater than comparison
- template <bool _AP_S1>
- INLINE bool ugt(const ap_private<_AP_W, _AP_S1>& RHS) const {
-   return !ult(RHS) && !eq(RHS);
- }
-
- /// Regards both *this and RHS as signed quantities and compares them for
- /// the validity of the greater-than relationship.
- /// @returns true if *this > RHS when both are considered signed.
- /// @brief Signed greater than comparison
- template <bool _AP_S1>
- INLINE bool sgt(const ap_private<_AP_W, _AP_S1>& RHS) const {
-   return !slt(RHS) && !eq(RHS);
- }
-
- /// Regards both *this and RHS as unsigned quantities and compares them for
- /// validity of the greater-or-equal relationship.
- /// @returns true if *this >= RHS when both are considered unsigned.
- /// @brief Unsigned greater or equal comparison
- template <bool _AP_S1>
- INLINE bool uge(const ap_private<_AP_W, _AP_S>& RHS) const {
-   return !ult(RHS);
- }
-
- /// Regards both *this and RHS as signed quantities and compares them for
- /// validity of the greater-or-equal relationship.
- /// @returns true if *this >= RHS when both are considered signed.
- /// @brief Signed greater or equal comparison
- template <bool _AP_S1>
- INLINE bool sge(const ap_private<_AP_W, _AP_S1>& RHS) const {
-   return !slt(RHS);
- }
-
- // Sign extend to a new width.
- template <int _AP_W1, bool _AP_S1>
- INLINE void cpSext(const ap_private<_AP_W1, _AP_S1>& that) {
-   assert(_AP_W1 < BitWidth && "Invalid ap_private SignExtend request");
-   assert(_AP_W1 <= MAX_INT_BITS && "Too many bits");
-   // If the sign bit isn't set, this is the same as zext.
-   if (!that.isNegative()) {
-     cpZext(that);
-     return;
-   }
-
-   // The sign bit is set. First, get some facts
-   enum { wordBits = _AP_W1 % APINT_BITS_PER_WORD };
-   const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
-   // Mask the high order word appropriately
-   if (_AP_N1 == _AP_N) {
-     enum { newWordBits = _AP_W % APINT_BITS_PER_WORD };
-     // The extension is contained to the wordsBefore-1th word.
-     static const uint64_t mask = wordBits ? (~0ULL << (wordBits)) : 0ULL;
-     for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i);
-     pVal[_AP_N - 1] |= mask;
-     return;
-   }
-
-   enum { newWordBits = _AP_W % APINT_BITS_PER_WORD };
-   // The extension is contained to the wordsBefore-1th word.
-   static const uint64_t mask = wordBits ? (~0ULL << (wordBits)) : 0ULL;
-   int i;
-   for (i = 0; i < _AP_N1; ++i) pVal[i] = that.get_pVal(i);
-   pVal[i - 1] |= mask;
-   for (; i < _AP_N - 1; i++) pVal[i] = ~0ULL;
-   pVal[i] = ~0ULL;
-   clearUnusedBits();
-   return;
- }
-
- // Zero extend to a new width.
- template <int _AP_W1, bool _AP_S1>
- INLINE void cpZext(const ap_private<_AP_W1, _AP_S1>& that) {
-   assert(_AP_W1 < BitWidth && "Invalid ap_private ZeroExtend request");
-   assert(_AP_W1 <= MAX_INT_BITS && "Too many bits");
-   const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
-   int i = 0;
-   for (; i < _AP_N1; ++i) pVal[i] = that.get_pVal(i);
-   for (; i < _AP_N; ++i) pVal[i] = 0;
-   clearUnusedBits();
- }
-
- template <int _AP_W1, bool _AP_S1>
- INLINE void cpZextOrTrunc(const ap_private<_AP_W1, _AP_S1>& that) {
-   if (BitWidth > _AP_W1)
-     cpZext(that);
-   else {
-     for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i);
-     clearUnusedBits();
-   }
- }
-
- template <int _AP_W1, bool _AP_S1>
- INLINE void cpSextOrTrunc(const ap_private<_AP_W1, _AP_S1>& that) {
-   if (BitWidth > _AP_W1)
-     cpSext(that);
-   else {
-     for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i);
-     clearUnusedBits();
-   }
- }
-
- /// @}
- /// @name Value Characterization Functions
- /// @{
-
- /// @returns the total number of bits.
- INLINE uint32_t getBitWidth() const { return BitWidth; } - - /// Here one word's bitwidth equals to that of uint64_t. - /// @returns the number of words to hold the integer value of this ap_private. - /// @brief Get the number of words. - INLINE uint32_t getNumWords() const { - return (BitWidth + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD; - } - - /// This function returns the number of active bits which is defined as the - /// bit width minus the number of leading zeros. This is used in several - /// computations to see how "wide" the value is. - /// @brief Compute the number of active bits in the value - INLINE uint32_t getActiveBits() const { - uint32_t bits = BitWidth - countLeadingZeros(); - return bits ? bits : 1; - } - - /// This method attempts to return the value of this ap_private as a zero - /// extended - /// uint64_t. The bitwidth must be <= 64 or the value must fit within a - /// uint64_t. Otherwise an assertion will result. - /// @brief Get zero extended value - INLINE uint64_t getZExtValue() const { - assert(getActiveBits() <= 64 && "Too many bits for uint64_t"); - return *pVal; - } - - /// This method attempts to return the value of this ap_private as a sign - /// extended - /// int64_t. The bit width must be <= 64 or the value must fit within an - /// int64_t. Otherwise an assertion will result. - /// @brief Get sign extended value - INLINE int64_t getSExtValue() const { - assert(getActiveBits() <= 64 && "Too many bits for int64_t"); - return int64_t(pVal[0]); - } - - /// This method determines how many bits are required to hold the ap_private - /// equivalent of the string given by \p str of length \p slen. - /// @brief Get bits required for string value. - INLINE static uint32_t getBitsNeeded(const char* str, uint32_t slen, - uint8_t radix) { - assert(str != 0 && "Invalid value string"); - assert(slen > 0 && "Invalid string length"); - - // Each computation below needs to know if its negative - uint32_t isNegative = str[0] == '-'; - if (isNegative) { - slen--; - str++; - } - // For radixes of power-of-two values, the bits required is accurately and - // easily computed - if (radix == 2) return slen + isNegative; - if (radix == 8) return slen * 3 + isNegative; - if (radix == 16) return slen * 4 + isNegative; - - // Otherwise it must be radix == 10, the hard case - assert(radix == 10 && "Invalid radix"); - - // Convert to the actual binary value. - // ap_private<_AP_W, _AP_S> tmp(sufficient, str, slen, radix); - - // Compute how many bits are required. - // return isNegative + tmp.logBase2() + 1; - return isNegative + slen * 4; - } - - /// countLeadingZeros - This function is an ap_private version of the - /// countLeadingZeros_{32,64} functions in MathExtras.h. It counts the number - /// of zeros from the most significant bit to the first one bit. - /// @returns BitWidth if the value is zero. - /// @returns the number of zeros from the most significant bit to the first - /// one bits. - INLINE uint32_t countLeadingZeros() const { - enum { - msw_bits = (BitWidth % APINT_BITS_PER_WORD) - ? 
(BitWidth % APINT_BITS_PER_WORD)
-          : APINT_BITS_PER_WORD,
-      excessBits = APINT_BITS_PER_WORD - msw_bits
-    };
-    uint32_t Count = ap_private_ops::CountLeadingZeros_64(pVal[_AP_N - 1]);
-    if (Count >= excessBits) Count -= excessBits;
-    if (!pVal[_AP_N - 1]) {
-      for (int i = _AP_N - 1; i; --i) {
-        if (!pVal[i - 1])
-          Count += APINT_BITS_PER_WORD;
-        else {
-          Count += ap_private_ops::CountLeadingZeros_64(pVal[i - 1]);
-          break;
-        }
-      }
-    }
-    return Count;
-  }
-
-  /// countLeadingOnes - This function counts the number of contiguous 1 bits
-  /// in the high order bits. The count stops when the first 0 bit is reached.
-  /// @returns 0 if the high order bit is not set
-  /// @returns the number of 1 bits from the most significant to the least
-  /// @brief Count the number of leading one bits.
-  INLINE uint32_t countLeadingOnes() const {
-    if (isSingleWord())
-      return countLeadingOnes_64(get_VAL(), APINT_BITS_PER_WORD - BitWidth);
-
-    uint32_t highWordBits = BitWidth % APINT_BITS_PER_WORD;
-    uint32_t shift =
-        (highWordBits == 0 ? 0 : APINT_BITS_PER_WORD - highWordBits);
-    int i = _AP_N - 1;
-    uint32_t Count = countLeadingOnes_64(get_pVal(i), shift);
-    if (Count == highWordBits) {
-      for (i--; i >= 0; --i) {
-        if (get_pVal(i) == ~0ULL)
-          Count += APINT_BITS_PER_WORD;
-        else {
-          Count += countLeadingOnes_64(get_pVal(i), 0);
-          break;
-        }
-      }
-    }
-    return Count;
-  }
-
-  /// countTrailingZeros - This function is an ap_private version of the
-  /// countTrailingZeros_{32,64} functions in MathExtras.h. It counts
-  /// the number of zeros from the least significant bit to the first set bit.
-  /// @returns BitWidth if the value is zero.
-  /// @returns the number of zeros from the least significant bit to the first
-  /// one bit.
-  /// @brief Count the number of trailing zero bits.
-  INLINE uint32_t countTrailingZeros() const {
-    uint32_t Count = 0;
-    uint32_t i = 0;
-    for (; i < _AP_N && get_pVal(i) == 0; ++i) Count += APINT_BITS_PER_WORD;
-    if (i < _AP_N) Count += ap_private_ops::CountTrailingZeros_64(get_pVal(i));
-    return AESL_std::min(Count, BitWidth);
-  }
-  /// countPopulation - This function is an ap_private version of the
-  /// countPopulation_{32,64} functions in MathExtras.h. It counts the number
-  /// of 1 bits in the ap_private value.
-  /// @returns 0 if the value is zero.
-  /// @returns the number of set bits.
-  /// @brief Count the number of bits set.
-  INLINE uint32_t countPopulation() const {
-    uint32_t Count = 0;
-    for (int i = 0; i < _AP_N - 1; ++i)
-      Count += ap_private_ops::CountPopulation_64(pVal[i]);
-    Count += ap_private_ops::CountPopulation_64(pVal[_AP_N - 1] & mask);
-    return Count;
-  }
-
-  /// @}
-  /// @name Conversion Functions
-  /// @{
-
-  /// This is used internally to convert an ap_private to a string.
-  /// @brief Converts an ap_private to a std::string
-  INLINE std::string toString(uint8_t radix, bool wantSigned) const;
-
-  /// Considers the ap_private to be unsigned and converts it into a string in
-  /// the radix given. The radix can be 2, 8, 10 or 16.
-  /// @returns a character interpretation of the ap_private
-  /// @brief Convert unsigned ap_private to string representation.
-  INLINE std::string toStringUnsigned(uint8_t radix = 10) const {
-    return toString(radix, false);
-  }
-
-  /// Considers the ap_private to be signed and converts it into a string in
-  /// the radix given. The radix can be 2, 8, 10 or 16.
-  /// @returns a character interpretation of the ap_private
-  /// @brief Convert signed ap_private to string representation.
-  INLINE std::string toStringSigned(uint8_t radix = 10) const {
-    return toString(radix, true);
-  }
-
-  /// @brief Converts this ap_private to a double value.
-  INLINE double roundToDouble(bool isSigned) const {
-    // Handle the simple case where the value is contained in one uint64_t.
-    if (isSingleWord() || getActiveBits() <= APINT_BITS_PER_WORD) {
-      uint64_t val = pVal[0];
-      if (isSigned) {
-        int64_t sext = ((int64_t(val)) << (64 - BitWidth)) >> (64 - BitWidth);
-        return double(sext);
-      } else
-        return double(val);
-    }
-
-    // Determine if the value is negative.
-    bool isNeg = isSigned ? (*this)[BitWidth - 1] : false;
-
-    // Construct the absolute value if we're negative.
-    ap_private<_AP_W, _AP_S> Tmp(isNeg ? -(*this) : (*this));
-
-    // Figure out how many bits we're using.
-    uint32_t n = Tmp.getActiveBits();
-
-    // The exponent (without bias normalization) is just the number of bits
-    // we are using. Note that the sign bit is gone since we constructed the
-    // absolute value.
-    uint64_t exp = n;
-
-    // Return infinity for exponent overflow
-    if (exp > 1023) {
-      if (!isSigned || !isNeg)
-        return std::numeric_limits<double>::infinity();
-      else
-        return -std::numeric_limits<double>::infinity();
-    }
-    exp += 1023; // Increment for 1023 bias
-
-    // Number of bits in mantissa is 52. To obtain the mantissa value, we must
-    // extract the high 52 bits from the correct words in pVal.
-    uint64_t mantissa;
-    unsigned hiWord = whichWord(n - 1);
-    if (hiWord == 0) {
-      mantissa = Tmp.get_pVal(0);
-      if (n > 52)
-        (mantissa) >>= (n - 52); // shift down, we want the top 52 bits.
-    } else {
-      assert(hiWord > 0 && "High word is negative?");
-      uint64_t hibits = (Tmp.get_pVal(hiWord))
-                        << (52 - n % APINT_BITS_PER_WORD);
-      uint64_t lobits =
-          (Tmp.get_pVal(hiWord - 1)) >> (11 + n % APINT_BITS_PER_WORD);
-      mantissa = hibits | lobits;
-    }
-
-    // The leading bit of mantissa is implicit, so get rid of it.
-    uint64_t sign = isNeg ? (1ULL << (APINT_BITS_PER_WORD - 1)) : 0;
-    union {
-      double __D;
-      uint64_t __I;
-    } __T;
-    __T.__I = sign | ((exp) << 52) | mantissa;
-    return __T.__D;
-  }
-
-  /// @brief Converts this unsigned ap_private to a double value.
-  INLINE double roundToDouble() const { return roundToDouble(false); }
-
-  /// @brief Converts this signed ap_private to a double value.
-  INLINE double signedRoundToDouble() const { return roundToDouble(true); }
-
-  /// The conversion does not do a translation from integer to double, it just
-  /// re-interprets the bits as a double. Note that it is valid to do this on
-  /// any bit width. Exactly 64 bits will be translated.
-  /// @brief Converts ap_private bits to a double
-  INLINE double bitsToDouble() const {
-    union {
-      uint64_t __I;
-      double __D;
-    } __T;
-    __T.__I = pVal[0];
-    return __T.__D;
-  }
-
-  /// The conversion does not do a translation from integer to float, it just
-  /// re-interprets the bits as a float. Note that it is valid to do this on
-  /// any bit width. Exactly 32 bits will be translated.
-  /// @brief Converts ap_private bits to a float
-  INLINE float bitsToFloat() const {
-    union {
-      uint32_t __I;
-      float __F;
-    } __T;
-    __T.__I = uint32_t(pVal[0]);
-    return __T.__F;
-  }
-
-  /// The conversion does not do a translation from double to integer, it just
-  /// re-interprets the bits of the double. Note that it is valid to do this on
-  /// any bit width but bits from V may get truncated.
-  /// @brief Converts a double to ap_private bits.
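A minimal standalone sketch of the union-punning idiom that bitsToDouble, bitsToFloat, and doubleToBits all rely on, assuming an IEEE-754 binary64 double (which the header itself never checks); the names here are illustrative, not part of the header:

    #include <cstdint>
    // Write one union member, read the other: the bit pattern is
    // reinterpreted in place, no numeric conversion takes place.
    union PunDouble { uint64_t __I; double __D; };
    PunDouble t;
    t.__D = 1.0;            // IEEE-754 stores 1.0 as 0x3FF0000000000000
    uint64_t bits = t.__I;  // bits == 0x3FF0000000000000

Strictly, reading the inactive member of a union is implementation-defined in C++ (memcpy is the portable spelling); these simulation headers assume compilers where it reinterprets the bits as shown.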
- INLINE ap_private& doubleToBits(double __V) { - union { - uint64_t __I; - double __D; - } __T; - __T.__D = __V; - pVal[0] = __T.__I; - return *this; - } - - /// The conversion does not do a translation from float to integer, it just - /// re-interprets the bits of the float. Note that it is valid to do this on - /// any bit width but bits from V may get truncated. - /// @brief Converts a float to ap_private bits. - INLINE ap_private& floatToBits(float __V) { - union { - uint32_t __I; - float __F; - } __T; - __T.__F = __V; - pVal[0] = __T.__I; - } - - // Reduce operation - //----------------------------------------------------------- - INLINE bool and_reduce() const { return isMaxValue(); } - - INLINE bool nand_reduce() const { return isMinValue(); } - - INLINE bool or_reduce() const { return (bool)countPopulation(); } - - INLINE bool nor_reduce() const { return countPopulation() == 0; } - - INLINE bool xor_reduce() const { - unsigned int i = countPopulation(); - return (i % 2) ? true : false; - } - - INLINE bool xnor_reduce() const { - unsigned int i = countPopulation(); - return (i % 2) ? false : true; - } - INLINE std::string to_string(uint8_t radix = 16, bool sign = false) const { - return toString(radix, radix == 10 ? _AP_S : sign); - } -}; // End of class ap_private <_AP_W, _AP_S, false> - -namespace ap_private_ops { - -enum { APINT_BITS_PER_WORD = 64 }; -template -INLINE bool operator==(uint64_t V1, const ap_private<_AP_W, _AP_S>& V2) { - return V2 == V1; -} - -template -INLINE bool operator!=(uint64_t V1, const ap_private<_AP_W, _AP_S>& V2) { - return V2 != V1; -} - -template -INLINE bool get(const ap_private<_AP_W, _AP_S>& a) { - static const uint64_t mask = 1ULL << (index & 0x3f); - return ((mask & a.get_pVal((index) >> 6)) != 0); -} - -template -INLINE void set(ap_private<_AP_W, _AP_S>& a, - const ap_private& mark1 = 0, - const ap_private& mark2 = 0) { - enum { - APINT_BITS_PER_WORD = 64, - lsb_word = lsb_index / APINT_BITS_PER_WORD, - msb_word = msb_index / APINT_BITS_PER_WORD, - msb = msb_index % APINT_BITS_PER_WORD, - lsb = lsb_index % APINT_BITS_PER_WORD - }; - if (msb_word == lsb_word) { - const uint64_t mask = ~0ULL >> - (lsb) << (APINT_BITS_PER_WORD - msb + lsb - 1) >> - (APINT_BITS_PER_WORD - msb - 1); - // a.set_pVal(msb_word, a.get_pVal(msb_word) | mask); - a.get_pVal(msb_word) |= mask; - } else { - const uint64_t lsb_mask = ~0ULL >> (lsb) << (lsb); - const uint64_t msb_mask = ~0ULL << (APINT_BITS_PER_WORD - msb - 1) >> - (APINT_BITS_PER_WORD - msb - 1); - // a.set_pVal(lsb_word, a.get_pVal(lsb_word) | lsb_mask); - a.get_pVal(lsb_word) |= lsb_mask; - for (int i = lsb_word + 1; i < msb_word; i++) { - a.set_pVal(i, ~0ULL); - // a.get_pVal(i)=0; - } - // a.set_pVal(msb_word, a.get_pVal(msb_word) | msb_mask); - - a.get_pVal(msb_word) |= msb_mask; - } - a.clearUnusedBits(); -} - -template -INLINE void clear(ap_private<_AP_W, _AP_S>& a, - const ap_private& mark1 = 0, - const ap_private& mark2 = 0) { - enum { - APINT_BITS_PER_WORD = 64, - lsb_word = lsb_index / APINT_BITS_PER_WORD, - msb_word = msb_index / APINT_BITS_PER_WORD, - msb = msb_index % APINT_BITS_PER_WORD, - lsb = lsb_index % APINT_BITS_PER_WORD - }; - if (msb_word == lsb_word) { - const uint64_t mask = - ~(~0ULL >> (lsb) << (APINT_BITS_PER_WORD - msb + lsb - 1) >> - (APINT_BITS_PER_WORD - msb - 1)); - // a.set_pVal(msb_word, a.get_pVal(msb_word) & mask); - a.get_pVal(msb_word) &= mask; - } else { - const uint64_t lsb_mask = ~(~0ULL >> (lsb) << (lsb)); - const uint64_t msb_mask = ~(~0ULL << (APINT_BITS_PER_WORD - 
msb - 1) >>
-                                (APINT_BITS_PER_WORD - msb - 1));
-    // a.set_pVal(lsb_word, a.get_pVal(lsb_word) & lsb_mask);
-    a.get_pVal(lsb_word) &= lsb_mask;
-    for (int i = lsb_word + 1; i < msb_word; i++) {
-      // a.set_pVal(i, 0);
-      a.get_pVal(i) = 0;
-    }
-    // a.set_pVal(msb_word, a.get_pVal(msb_word) & msb_mask);
-    a.get_pVal(msb_word) &= msb_mask;
-  }
-  a.clearUnusedBits();
-}
-
-template <int index, int _AP_W, bool _AP_S>
-INLINE void set(ap_private<_AP_W, _AP_S>& a,
-                const ap_private& mark = 0) {
-  enum { APINT_BITS_PER_WORD = 64, word = index / APINT_BITS_PER_WORD };
-  static const uint64_t mask = 1ULL << (index % APINT_BITS_PER_WORD);
-  // a.set_pVal(word, a.get_pVal(word) | mask);
-  a.get_pVal(word) |= mask;
-  a.clearUnusedBits();
-}
-
-template <int index, int _AP_W, bool _AP_S>
-INLINE void clear(ap_private<_AP_W, _AP_S>& a,
-                  const ap_private& mark = 0) {
-  enum { APINT_BITS_PER_WORD = 64, word = index / APINT_BITS_PER_WORD };
-  static const uint64_t mask = ~(1ULL << (index % APINT_BITS_PER_WORD));
-  // a.set_pVal(word, a.get_pVal(word) & mask);
-  a.get_pVal(word) &= mask;
-  a.clearUnusedBits();
-}
-
-} // End of ap_private_ops namespace
-
-template <int _AP_W, bool _AP_S>
-INLINE std::string ap_private<_AP_W, _AP_S, false>::toString(
-    uint8_t radix, bool wantSigned) const {
-  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
-         "Radix should be 2, 8, 10, or 16!");
-  static const char* digits[] = {"0", "1", "2", "3", "4", "5", "6", "7",
-                                 "8", "9", "A", "B", "C", "D", "E", "F"};
-  std::string result;
-
-  if (radix != 10) {
-    // For the 2, 8 and 16 bit cases, we can just shift instead of divide
-    // because the number of bits per digit (1, 3 and 4 respectively) divides
-    // equally. We just shift until the value is zero.
-
-    // First, check for a zero value and just short circuit the logic below.
-    if (*this == (uint64_t)(0))
-      result = "0";
-    else {
-      ap_private<_AP_W, false> tmp(*this);
-      size_t insert_at = 0;
-      bool leading_zero = true;
-      if (wantSigned && isNegative()) {
-        // They want to print the signed version and it is a negative value
-        // Flip the bits and add one to turn it into the equivalent positive
-        // value and put a '-' in the result.
-        tmp.flip();
-        tmp++;
-        tmp.clearUnusedBitsToZero();
-        result = "-";
-        insert_at = 1;
-        leading_zero = false;
-      }
-      switch (radix) {
-        case 2:
-          result += "0b";
-          break;
-        case 8:
-          result += "0o";
-          break;
-        case 16:
-          result += "0x";
-          break;
-        default:
-          assert("invalid radix" && 0);
-      }
-      insert_at += 2;
-      // Just shift tmp right for each digit width until it becomes zero
-      uint32_t shift = (radix == 16 ? 4 : (radix == 8 ? 3 : 1));
-      uint64_t mask = radix - 1;
-      ap_private<_AP_W, false> zero(0);
-      unsigned bits = 0;
-      while (tmp.ne(zero)) {
-        uint64_t digit = tmp.get_VAL() & mask;
-        result.insert(insert_at, digits[digit]);
-        tmp = tmp.lshr(shift);
-        ++bits;
-      }
-      bits *= shift;
-      if (bits < _AP_W && leading_zero) result.insert(insert_at, digits[0]);
-    }
-    return result;
-  }
-
-  ap_private<_AP_W, false> tmp(*this);
-  ap_private<_AP_W, false> divisor(radix);
-  ap_private<_AP_W, false> zero(0);
-  size_t insert_at = 0;
-  if (wantSigned && isNegative()) {
-    // They want to print the signed version and it is a negative value
-    // Flip the bits and add one to turn it into the equivalent positive
-    // value and put a '-' in the result.
- tmp.flip(); - tmp++; - tmp.clearUnusedBitsToZero(); - result = "-"; - insert_at = 1; - } - if (tmp == ap_private<_AP_W, false>(0)) - result = "0"; - else - while (tmp.ne(zero)) { - ap_private<_AP_W, false> APdigit(0); - ap_private<_AP_W, false> tmp2(0); - ap_private_ops::divide(tmp, tmp.getNumWords(), divisor, - divisor.getNumWords(), &tmp2, &APdigit); - uint64_t digit = APdigit.getZExtValue(); - assert(digit < radix && "divide failed"); - result.insert(insert_at, digits[digit]); - tmp = tmp2; - } - - return result; -} // End of ap_private<_AP_W, _AP_S, false>::toString() - -template -std::ostream &operator<<(std::ostream &os, const ap_private<_AP_W, _AP_S> &x) { - std::ios_base::fmtflags ff = std::cout.flags(); - if (ff & std::cout.hex) { - os << x.toString(16, false); // don't print sign - } else if (ff & std::cout.oct) { - os << x.toString(8, false); // don't print sign - } else { - os << x.toString(10, _AP_S); - } - return os; -} - -// ------------------------------------------------------------ // -// XXX moved here from ap_int_sim.h XXX // -// ------------------------------------------------------------ // - -/// Concatination reference. -/// Proxy class which allows concatination to be used as rvalue(for reading) and -/// lvalue(for writing) -// ---------------------------------------------------------------- -// template -// struct ap_concat_ref { -//#ifdef _MSC_VER -//#pragma warning(disable : 4521 4522) -//#endif -// enum { -// _AP_WR = _AP_W1 + _AP_W2, -// }; -// _AP_T1& mbv1; -// _AP_T2& mbv2; -// -// INLINE ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& -// ref) -// : mbv1(ref.mbv1), mbv2(ref.mbv2) {} -// -// INLINE ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {} -// -// template -// INLINE ap_concat_ref& operator=(const ap_private<_AP_W3, _AP_S3>& val) { -// ap_private<_AP_W1 + _AP_W2, false> vval(val); -// int W_ref1 = mbv1.length(); -// int W_ref2 = mbv2.length(); -// ap_private<_AP_W1, false> mask1(-1); -// mask1 >>= _AP_W1 - W_ref1; -// ap_private<_AP_W2, false> mask2(-1); -// mask2 >>= _AP_W2 - W_ref2; -// mbv1.set(ap_private<_AP_W1, false>((vval >> W_ref2) & mask1)); -// mbv2.set(ap_private<_AP_W2, false>(vval & mask2)); -// return *this; -// } -// -// INLINE ap_concat_ref& operator=(unsigned long long val) { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); -// return operator=(tmpVal); -// } -// -// template -// INLINE ap_concat_ref& operator=( -// const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); -// return operator=(tmpVal); -// } -// -// INLINE ap_concat_ref& operator=( -// const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); -// return operator=(tmpVal); -// } -// -// template -// INLINE ap_concat_ref& operator=(const _private_bit_ref<_AP_W3, _AP_S3>& -// val) { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); -// return operator=(tmpVal); -// } -// -// template -// INLINE ap_concat_ref& operator=(const _private_range_ref<_AP_W3, _AP_S3>& -// val) { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); -// return operator=(tmpVal); -// } -// -// template -// INLINE ap_concat_ref& operator=( -// const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) -// { -// return operator=((const ap_private<_AP_W3, false>)(val)); -// } -// -// template -// INLINE ap_concat_ref& operator=( -// const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& -// val) { -// return 
operator=(val.to_ap_private()); -// } -// -// template -// INLINE ap_concat_ref& operator=( -// const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { -// return operator=((unsigned long long)(bool)(val)); -// } -// -// INLINE operator ap_private<_AP_WR, false>() const { return get(); } -// -// INLINE operator unsigned long long() const { return get().to_uint64(); } -// -// template -// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, -// _private_range_ref<_AP_W3, _AP_S3> > -// operator,(const _private_range_ref<_AP_W3, _AP_S3> &a2) { -// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, -// _private_range_ref<_AP_W3, _AP_S3> >( -// *this, const_cast<_private_range_ref<_AP_W3, _AP_S3>&>(a2)); -// } -// -// template -// INLINE -// ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_private<_AP_W3, _AP_S3> -// > -// operator,(ap_private<_AP_W3, _AP_S3> &a2) { -// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, -// ap_private<_AP_W3, _AP_S3> >(*this, a2); -// } -// -// template -// INLINE -// ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_private<_AP_W3, _AP_S3> -// > -// operator,(const ap_private<_AP_W3, _AP_S3> &a2) { -// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, -// ap_private<_AP_W3, _AP_S3> >( -// *this, const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, 1, _private_bit_ref<_AP_W3, -// _AP_S3> > -// operator,(const _private_bit_ref<_AP_W3, _AP_S3> &a2) { -// return ap_concat_ref<_AP_WR, ap_concat_ref, 1, _private_bit_ref<_AP_W3, -// _AP_S3> >( -// *this, const_cast<_private_bit_ref<_AP_W3, _AP_S3>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, -// ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > -// operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { -// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, -// ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( -// *this, const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref< -// _AP_WR, ap_concat_ref, _AP_W3, -// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > -// operator,( -// const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) -// { -// return ap_concat_ref< -// _AP_WR, ap_concat_ref, _AP_W3, -// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( -// *this, -// const_cast< -// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, -// _AP_N3>&>(a2)); -// } -// -// template -// INLINE -// ap_concat_ref<_AP_WR, ap_concat_ref, 1, -// af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> -// > -// operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, -// _AP_N3> -// &a2) { -// return ap_concat_ref< -// _AP_WR, ap_concat_ref, 1, -// af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( -// *this, -// const_cast&>( -// a2)); -// } -// -// template -// INLINE ap_private operator&( -// const ap_private<_AP_W3, _AP_S3>& a2) { -// return get() & a2; -// } -// -// template -// INLINE ap_private operator|( -// const ap_private<_AP_W3, _AP_S3>& a2) { -// return get() | a2; -// } -// -// template -// INLINE ap_private operator^( -// const ap_private<_AP_W3, _AP_S3>& a2) { -// return ap_private(get() ^ a2); -// } -// -// INLINE const ap_private<_AP_WR, false> get() const { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal = -// ap_private<_AP_W1 + _AP_W2, false>(mbv1.get()); -// ap_private<_AP_W1 + _AP_W2, false> tmpVal2 = -// ap_private<_AP_W1 + _AP_W2, false>(mbv2.get()); -// int W_ref2 = mbv2.length(); -// tmpVal <<= 
W_ref2; -// tmpVal |= tmpVal2; -// return tmpVal; -// } -// -// INLINE const ap_private<_AP_WR, false> get() { -// ap_private<_AP_W1 + _AP_W2, false> tmpVal = -// ap_private<_AP_W1 + _AP_W2, false>(mbv1.get()); -// ap_private<_AP_W1 + _AP_W2, false> tmpVal2 = -// ap_private<_AP_W1 + _AP_W2, false>(mbv2.get()); -// int W_ref2 = mbv2.length(); -// tmpVal <<= W_ref2; -// tmpVal |= tmpVal2; -// return tmpVal; -// } -// -// template -// INLINE void set(const ap_private<_AP_W3, false>& val) { -// ap_private<_AP_W1 + _AP_W2, false> vval(val); -// int W_ref1 = mbv1.length(); -// int W_ref2 = mbv2.length(); -// ap_private<_AP_W1, false> mask1(-1); -// mask1 >>= _AP_W1 - W_ref1; -// ap_private<_AP_W2, false> mask2(-1); -// mask2 >>= _AP_W2 - W_ref2; -// mbv1.set(ap_private<_AP_W1, false>((vval >> W_ref2) & mask1)); -// mbv2.set(ap_private<_AP_W2, false>(vval & mask2)); -// } -// -// INLINE int length() const { return mbv1.length() + mbv2.length(); } -// -// INLINE std::string to_string(uint8_t radix = 2) const { -// return get().to_string(radix); -// } -//}; // struct ap_concat_ref. - -/// Range(slice) reference -/// Proxy class, which allows part selection to be used as rvalue(for reading) -/// and lvalue(for writing) -//------------------------------------------------------------ -template -struct _private_range_ref { -#ifdef _MSC_VER -#pragma warning(disable : 4521 4522) -#endif - ap_private<_AP_W, _AP_S>& d_bv; - int l_index; - int h_index; - - public: - /// copy ctor. - INLINE _private_range_ref(const _private_range_ref<_AP_W, _AP_S>& ref) - : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} - - /// direct ctor. - INLINE _private_range_ref(ap_private<_AP_W, _AP_S>* bv, int h, int l) - : d_bv(*bv), l_index(l), h_index(h) { - _AP_WARNING(h < 0 || l < 0, - "Higher bound (%d) and lower bound (%d) cannot be " - "negative.", - h, l); - _AP_WARNING(h >= _AP_W || l >= _AP_W, - "Higher bound (%d) or lower bound (%d) out of range (%d).", h, l, - _AP_W); - } - - /// compound or assignment. - template - INLINE _private_range_ref<_AP_W, _AP_S>& operator|=( - const _private_range_ref<_AP_W2, _AP_S2>& ref) { - _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), - "Bitsize mismach for ap_private<>.range() &= " - "ap_private<>.range()."); - this->d_bv |= ref.d_bv; - return *this; - } - - /// compound or assignment with root type. - template - INLINE _private_range_ref<_AP_W, _AP_S>& operator|=( - const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { - _AP_WARNING((h_index - l_index + 1) != _AP_W2, - "Bitsize mismach for ap_private<>.range() |= _AP_ROOT_TYPE<>."); - this->d_bv |= ref.V; - return *this; - } - - /// compound and assignment. - template - INLINE _private_range_ref<_AP_W, _AP_S>& operator&=( - const _private_range_ref<_AP_W2, _AP_S2>& ref) { - _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), - "Bitsize mismach for ap_private<>.range() &= " - "ap_private<>.range()."); - this->d_bv &= ref.d_bv; - return *this; - }; - - /// compound and assignment with root type. - template - INLINE _private_range_ref<_AP_W, _AP_S>& operator&=( - const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { - _AP_WARNING((h_index - l_index + 1) != _AP_W2, - "Bitsize mismach for ap_private<>.range() &= _AP_ROOT_TYPE<>."); - this->d_bv &= ref.V; - return *this; - } - - /// compound xor assignment. 
- template - INLINE _private_range_ref<_AP_W, _AP_S>& operator^=( - const _private_range_ref<_AP_W2, _AP_S2>& ref) { - _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), - "Bitsize mismach for ap_private<>.range() ^= " - "ap_private<>.range()."); - this->d_bv ^= ref.d_bv; - return *this; - }; - - /// compound xor assignment with root type. - template - INLINE _private_range_ref<_AP_W, _AP_S>& operator^=( - const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { - _AP_WARNING((h_index - l_index + 1) != _AP_W2, - "Bitsize mismach for ap_private<>.range() ^= _AP_ROOT_TYPE<>."); - this->d_bv ^= ref.V; - return *this; - } - - /// @name convertors. - // @{ - INLINE operator ap_private<_AP_W, false>() const { - ap_private<_AP_W, false> val(0); - if (h_index >= l_index) { - if (_AP_W > 64) { - val = d_bv; - ap_private<_AP_W, false> mask(-1); - mask >>= _AP_W - (h_index - l_index + 1); - val >>= l_index; - val &= mask; - } else { - const static uint64_t mask = (~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0)); - val = (d_bv >> l_index) & (mask >> (_AP_W - (h_index - l_index + 1))); - } - } else { - for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) - if ((d_bv)[j]) val.set(i); - } - return val; - } - - INLINE operator unsigned long long() const { return to_uint64(); } - // @} - - template - INLINE _private_range_ref& operator=(const ap_private<_AP_W2, _AP_S2>& val) { - ap_private<_AP_W, false> vval = ap_private<_AP_W, false>(val); - if (l_index > h_index) { - for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) - (vval)[i] ? d_bv.set(j) : d_bv.clear(j); - } else { - if (_AP_W > 64) { - ap_private<_AP_W, false> mask(-1); - if (l_index > 0) { - mask <<= l_index; - vval <<= l_index; - } - if (h_index < _AP_W - 1) { - ap_private<_AP_W, false> mask2(-1); - mask2 >>= _AP_W - h_index - 1; - mask &= mask2; - vval &= mask2; - } - mask.flip(); - d_bv &= mask; - d_bv |= vval; - } else { - unsigned shift = 64 - _AP_W; - uint64_t mask = ~0ULL >> (shift); - if (l_index > 0) { - vval = mask & vval << l_index; - mask = mask & mask << l_index; - } - if (h_index < _AP_W - 1) { - uint64_t mask2 = mask; - mask2 >>= (_AP_W - h_index - 1); - mask &= mask2; - vval &= mask2; - } - mask = ~mask; - d_bv &= mask; - d_bv |= vval; - } - } - return *this; - } // operator=(const ap_private<>&) - - INLINE _private_range_ref& operator=(unsigned long long val) { - const ap_private<_AP_W, _AP_S> vval = val; - return operator=(vval); - } - - template - INLINE _private_range_ref& operator=( - const _private_bit_ref<_AP_W2, _AP_S2>& val) { - return operator=((unsigned long long)(bool)val); - } - - template - INLINE _private_range_ref& operator=( - const _private_range_ref<_AP_W2, _AP_S2>& val) { - const ap_private<_AP_W, false> tmpVal(val); - return operator=(tmpVal); - } - -// template -// INLINE _private_range_ref& operator=( -// const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { -// const ap_private<_AP_W, false> tmpVal(val); -// return operator=(tmpVal); -// } - - // TODO from ap_int_base, ap_bit_ref and ap_range_ref. 
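All of the operator= overloads above and below reduce to the same mask-and-merge pattern: clear the addressed bit range in the underlying word(s), then OR in the shifted new value. A minimal sketch of that pattern on a plain 8-bit word, with hypothetical values chosen only for illustration:

    #include <cstdint>
    uint8_t v = 0xB2;                    // 0b10110010
    // Write the 2-bit value 0b01 into bits [3:2] of v.
    uint8_t mask = ((1u << 2) - 1) << 2; // 0b00001100 selects bits 3..2
    v = (v & ~mask) | ((0x1u << 2) & mask);
    // v == 0xB6 (0b10110110); bits outside [3:2] are untouched.

The _AP_W > 64 branch does the same thing with ap_private-valued masks instead of a single native word.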
- - template - INLINE _private_range_ref& operator=( - const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=(val.to_ap_int_base().V); - } - - template - INLINE _private_range_ref& operator=( - const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=(val.operator ap_int_base<_AP_W2, false>().V); - } - - template - INLINE _private_range_ref& operator=( - const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { - return operator=((unsigned long long)(bool)val); - } - -// template -// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> > -// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, -// _private_range_ref<_AP_W2, _AP_S2> >( -// *this, const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, -// ap_private<_AP_W2, _AP_S2> > -// operator,(ap_private<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, -// ap_private<_AP_W2, _AP_S2> >(*this, a2); -// } -// -// INLINE -// ap_concat_ref<_AP_W, _private_range_ref, _AP_W, ap_private<_AP_W, _AP_S> > -// operator,(ap_private<_AP_W, _AP_S>& a2) { -// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W, -// ap_private<_AP_W, _AP_S> >(*this, a2); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, _private_range_ref, 1, -// _private_bit_ref<_AP_W2, _AP_S2> > -// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) { -// return ap_concat_ref<_AP_W, _private_range_ref, 1, -// _private_bit_ref<_AP_W2, _AP_S2> >( -// *this, const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > -// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { -// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( -// *this, const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref< -// _AP_W, _private_range_ref, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,( -// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { -// return ap_concat_ref< -// _AP_W, _private_range_ref, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( -// *this, -// const_cast< -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); -// } -// -// template -// INLINE -// ap_concat_ref<_AP_W, _private_range_ref, 1, -// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> -// &a2) { -// return ap_concat_ref< -// _AP_W, _private_range_ref, 1, -// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( -// *this, -// const_cast&>( -// a2)); -// } - - template - INLINE bool operator==(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - ap_private<_AP_W, false> lhs = get(); - ap_private<_AP_W2, false> rhs = op2.get(); - return lhs == rhs; - } - - template - INLINE bool operator!=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - ap_private<_AP_W, false> lhs = get(); - ap_private<_AP_W2, false> rhs = op2.get(); - return lhs != rhs; - } - - template - INLINE bool operator>(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - ap_private<_AP_W, false> lhs = get(); - 
ap_private<_AP_W2, false> rhs = op2.get(); - return lhs > rhs; - } - - template - INLINE bool operator>=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - ap_private<_AP_W, false> lhs = get(); - ap_private<_AP_W2, false> rhs = op2.get(); - return lhs >= rhs; - } - - template - INLINE bool operator<(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - ap_private<_AP_W, false> lhs = get(); - ap_private<_AP_W2, false> rhs = op2.get(); - return lhs < rhs; - } - - template - INLINE bool operator<=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { - ap_private<_AP_W, false> lhs = get(); - ap_private<_AP_W2, false> rhs = op2.get(); - return lhs <= rhs; - } - - template - INLINE void set(const ap_private<_AP_W2, false>& val) { - ap_private<_AP_W, _AP_S> vval = val; - if (l_index > h_index) { - for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) - (vval)[i] ? d_bv.set(j) : d_bv.clear(j); - } else { - if (_AP_W > 64) { - ap_private<_AP_W, _AP_S> mask(-1); - if (l_index > 0) { - ap_private<_AP_W, false> mask1(-1); - mask1 >>= _AP_W - l_index; - mask1.flip(); - mask = mask1; - // vval&=mask1; - vval <<= l_index; - } - if (h_index < _AP_W - 1) { - ap_private<_AP_W, false> mask2(-1); - mask2 <<= h_index + 1; - mask2.flip(); - mask &= mask2; - vval &= mask2; - } - mask.flip(); - d_bv &= mask; - d_bv |= vval; - } else { - uint64_t mask = ~0ULL >> (64 - _AP_W); - if (l_index > 0) { - uint64_t mask1 = mask; - mask1 = mask & (mask1 >> (_AP_W - l_index)); - vval = mask & (vval << l_index); - mask = ~mask1 & mask; - // vval&=mask1; - } - if (h_index < _AP_W - 1) { - uint64_t mask2 = ~0ULL >> (64 - _AP_W); - mask2 = mask & (mask2 << (h_index + 1)); - mask &= ~mask2; - vval &= ~mask2; - } - d_bv &= (~mask & (~0ULL >> (64 - _AP_W))); - d_bv |= vval; - } - } - } - - INLINE ap_private<_AP_W, false> get() const { - ap_private<_AP_W, false> val(0); - if (h_index < l_index) { - for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) - if ((d_bv)[j]) val.set(i); - } else { - val = d_bv; - val >>= l_index; - if (h_index < _AP_W - 1) { - if (_AP_W <= 64) { - const static uint64_t mask = - (~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0)); - val &= (mask >> (_AP_W - (h_index - l_index + 1))); - } else { - ap_private<_AP_W, false> mask(-1); - mask >>= _AP_W - (h_index - l_index + 1); - val &= mask; - } - } - } - return val; - } - - INLINE ap_private<_AP_W, false> get() { - ap_private<_AP_W, false> val(0); - if (h_index < l_index) { - for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) - if ((d_bv)[j]) val.set(i); - } else { - val = d_bv; - val >>= l_index; - if (h_index < _AP_W - 1) { - if (_AP_W <= 64) { - static const uint64_t mask = ~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0); - return val &= ((mask) >> (_AP_W - (h_index - l_index + 1))); - } else { - ap_private<_AP_W, false> mask(-1); - mask >>= _AP_W - (h_index - l_index + 1); - val &= mask; - } - } - } - return val; - } - - INLINE int length() const { - return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1; - } - - INLINE int to_int() const { - ap_private<_AP_W, false> val = get(); - return val.to_int(); - } - - INLINE unsigned int to_uint() const { - ap_private<_AP_W, false> val = get(); - return val.to_uint(); - } - - INLINE long to_long() const { - ap_private<_AP_W, false> val = get(); - return val.to_long(); - } - - INLINE unsigned long to_ulong() const { - ap_private<_AP_W, false> val = get(); - return val.to_ulong(); - } - - INLINE ap_slong to_int64() const { - ap_private<_AP_W, false> val = get(); - return val.to_int64(); - } - - INLINE ap_ulong to_uint64() const { - ap_private<_AP_W, false> val = get(); - return val.to_uint64(); - } - - INLINE std::string to_string(uint8_t radix = 2) const { - return get().to_string(radix); - } - - INLINE bool and_reduce() { - bool ret = true; - bool reverse = l_index > h_index; - unsigned low = reverse ? h_index : l_index; - unsigned high = reverse ? l_index : h_index; - for (unsigned i = low; i != high; ++i) ret &= d_bv[i]; - return ret; - } - - INLINE bool or_reduce() { - bool ret = false; - bool reverse = l_index > h_index; - unsigned low = reverse ? h_index : l_index; - unsigned high = reverse ? l_index : h_index; - for (unsigned i = low; i != high; ++i) ret |= d_bv[i]; - return ret; - } - - INLINE bool xor_reduce() { - bool ret = false; - bool reverse = l_index > h_index; - unsigned low = reverse ? h_index : l_index; - unsigned high = reverse ? l_index : h_index; - for (unsigned i = low; i != high; ++i) ret ^= d_bv[i]; - return ret; - } -}; // struct _private_range_ref. - -/// Bit reference -/// Proxy class, which allows bit selection to be used as rvalue(for reading) -/// and lvalue(for writing) -//-------------------------------------------------------------- -template -struct _private_bit_ref { -#ifdef _MSC_VER -#pragma warning(disable : 4521 4522) -#endif - ap_private<_AP_W, _AP_S>& d_bv; - int d_index; - - public: - // copy ctor. - INLINE _private_bit_ref(const _private_bit_ref<_AP_W, _AP_S>& ref) - : d_bv(ref.d_bv), d_index(ref.d_index) {} - - // director ctor. 
- INLINE _private_bit_ref(ap_private<_AP_W, _AP_S>& bv, int index = 0) - : d_bv(bv), d_index(index) { - _AP_WARNING(d_index < 0, "Index of bit vector (%d) cannot be negative.\n", - d_index); - _AP_WARNING(d_index >= _AP_W, - "Index of bit vector (%d) out of range (%d).\n", d_index, _AP_W); - } - - INLINE operator bool() const { return d_bv.get_bit(d_index); } - - INLINE bool to_bool() const { return operator bool(); } - - template - INLINE _private_bit_ref& operator=(const T& val) { - if (!!val) - d_bv.set(d_index); - else - d_bv.clear(d_index); - return *this; - } - -// template -// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2, ap_private<_AP_W2, -// _AP_S2> > -// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<1, _private_bit_ref, _AP_W2, ap_private<_AP_W2, -// _AP_S2> >( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), a2); -// } -// -// template -// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2, -// _private_range_ref<_AP_W2, -// _AP_S2> > -// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<1, _private_bit_ref, _AP_W2, -// _private_range_ref<_AP_W2, -// _AP_S2> >( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), -// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref<_AP_W2, -// _AP_S2> > operator,( -// const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { -// return ap_concat_ref<1, _private_bit_ref, 1, -// _private_bit_ref<_AP_W2, _AP_S2> >( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), -// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); -// } -// -// INLINE ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref> -// operator,( -// const _private_bit_ref &a2) const { -// return ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref>( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), -// const_cast<_private_bit_ref&>(a2)); -// } -// -// template -// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > -// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { -// return ap_concat_ref<1, _private_bit_ref, _AP_W2 + _AP_W3, -// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), -// const_cast&>(a2)); -// } -// -// template -// INLINE ap_concat_ref< -// 1, _private_bit_ref, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > -// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, -// _AP_N2> -// &a2) const { -// return ap_concat_ref< -// 1, _private_bit_ref, _AP_W2, -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), -// const_cast< -// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, -// _AP_N2>&>(a2)); -// } -// -// template -// INLINE -// ap_concat_ref<1, _private_bit_ref, 1, -// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, -// _AP_N2> > -// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, -// _AP_N2> -// &a2) const { -// return ap_concat_ref<1, _private_bit_ref, 1, af_bit_ref<_AP_W2, -// _AP_I2, _AP_S2, -// _AP_Q2, _AP_O2, -// _AP_N2> >( -// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), -// const_cast&>( -// a2)); -// } - - template - INLINE bool operator==(const _private_bit_ref<_AP_W2, _AP_S2>& op) const { - return get() == op.get(); - } - - template - INLINE bool operator!=(const _private_bit_ref<_AP_W2, _AP_S2>& 
op) const { - return get() != op.get(); - } - - INLINE bool get() const { return operator bool(); } - - // template - // INLINE void set(const ap_private<_AP_W3, false>& val) { - // operator=(val); - // } - - // INLINE bool operator~() const { - // bool bit = (d_bv)[d_index]; - // return bit ? false : true; - // } - - INLINE int length() const { return 1; } - - // INLINE std::string to_string() const { - // bool val = get(); - // return val ? "1" : "0"; - // } - -}; // struct _private_bit_ref. - -// char a[100]; -// char* ptr = a; -// ap_int<2> n = 3; -// char* ptr2 = ptr + n*2; -// avoid ambiguous errors -#define OP_BIN_MIX_PTR(BIN_OP) \ - template \ - INLINE PTR_TYPE* operator BIN_OP(PTR_TYPE* i_op, \ - const ap_private<_AP_W, _AP_S>& op) { \ - typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ - return i_op BIN_OP op2; \ - } \ - template \ - INLINE PTR_TYPE* operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, \ - PTR_TYPE* i_op) { \ - typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ - return op2 BIN_OP i_op; \ - } - -OP_BIN_MIX_PTR(+) -OP_BIN_MIX_PTR(-) -#undef OP_BIN_MIX_PTR - -// float OP ap_int -// when ap_int's width > 64, then trunc ap_int to ap_int<64> -#define OP_BIN_MIX_FLOAT(BIN_OP, C_TYPE) \ - template \ - INLINE C_TYPE operator BIN_OP(C_TYPE i_op, \ - const ap_private<_AP_W, _AP_S>& op) { \ - typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ - return i_op BIN_OP op2; \ - } \ - template \ - INLINE C_TYPE operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, \ - C_TYPE i_op) { \ - typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ - return op2 BIN_OP i_op; \ - } - -#define OPS_MIX_FLOAT(C_TYPE) \ - OP_BIN_MIX_FLOAT(*, C_TYPE) \ - OP_BIN_MIX_FLOAT(/, C_TYPE) \ - OP_BIN_MIX_FLOAT(+, C_TYPE) \ - OP_BIN_MIX_FLOAT(-, C_TYPE) - -OPS_MIX_FLOAT(float) -OPS_MIX_FLOAT(double) -#undef OP_BIN_MIX_FLOAT -#undef OPS_MIX_FLOAT - -/// Operators mixing Integers with AP_Int -// ---------------------------------------------------------------- - -// partially specialize template argument _AP_C in order that: -// for _AP_W > 64, we will explicitly convert operand with native data type -// into corresponding ap_private -// for _AP_W <= 64, we will implicitly convert operand with ap_private into -// (unsigned) long long -#define OP_BIN_MIX_INT(BIN_OP, C_TYPE, _AP_WI, _AP_SI, RTYPE) \ - template \ - INLINE \ - typename ap_private<_AP_WI, _AP_SI>::template RType<_AP_W, _AP_S>::RTYPE \ - operator BIN_OP(C_TYPE i_op, const ap_private<_AP_W, _AP_S>& op) { \ - return ap_private<_AP_WI, _AP_SI>(i_op).operator BIN_OP(op); \ - } \ - template \ - INLINE \ - typename ap_private<_AP_W, _AP_S>::template RType<_AP_WI, _AP_SI>::RTYPE \ - operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, C_TYPE i_op) { \ - return op.operator BIN_OP(ap_private<_AP_WI, _AP_SI>(i_op)); \ - } - -#define OP_REL_MIX_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE bool operator REL_OP(const ap_private<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return op.operator REL_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ - } \ - template \ - INLINE bool operator REL_OP(C_TYPE op2, \ - const ap_private<_AP_W, _AP_S, false>& op) { \ - return ap_private<_AP_W2, _AP_S2>(op2).operator REL_OP(op); \ - } - -#define OP_ASSIGN_MIX_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ - ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ - return op.operator ASSIGN_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ - } - -#define OP_BIN_SHIFT_INT(BIN_OP, C_TYPE, _AP_WI, _AP_SI, RTYPE) \ - template \ 
- C_TYPE operator BIN_OP(C_TYPE i_op, \ - const ap_private<_AP_W, _AP_S, false>& op) { \ - return i_op BIN_OP(op.get_VAL()); \ - } \ - template \ - INLINE \ - typename ap_private<_AP_W, _AP_S>::template RType<_AP_WI, _AP_SI>::RTYPE \ - operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, C_TYPE i_op) { \ - return op.operator BIN_OP(i_op); \ - } - -#define OP_ASSIGN_RSHIFT_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ - ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ - op = op.operator>>(op2); \ - return op; \ - } - -#define OP_ASSIGN_LSHIFT_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ - ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ - op = op.operator<<(op2); \ - return op; \ - } - -#define OPS_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ - OP_BIN_MIX_INT(*, C_TYPE, (_AP_W2), (_AP_S2), mult) \ - OP_BIN_MIX_INT(+, C_TYPE, (_AP_W2), (_AP_S2), plus) \ - OP_BIN_MIX_INT(-, C_TYPE, (_AP_W2), (_AP_S2), minus) \ - OP_BIN_MIX_INT(/, C_TYPE, (_AP_W2), (_AP_S2), div) \ - OP_BIN_MIX_INT(%, C_TYPE, (_AP_W2), (_AP_S2), mod) \ - OP_BIN_MIX_INT(&, C_TYPE, (_AP_W2), (_AP_S2), logic) \ - OP_BIN_MIX_INT(|, C_TYPE, (_AP_W2), (_AP_S2), logic) \ - OP_BIN_MIX_INT (^, C_TYPE, (_AP_W2), (_AP_S2), logic) \ - OP_BIN_SHIFT_INT(>>, C_TYPE, (_AP_W2), (_AP_S2), arg1) \ - OP_BIN_SHIFT_INT(<<, C_TYPE, (_AP_W2), (_AP_S2), arg1) \ - \ - OP_ASSIGN_MIX_INT(+=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(-=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(*=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(/=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(%=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(&=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(|=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_MIX_INT(^=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_RSHIFT_INT(>>=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_ASSIGN_LSHIFT_INT(<<=, C_TYPE, (_AP_W2), (_AP_S2)) \ - \ - OP_REL_MIX_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_REL_MIX_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_REL_MIX_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_REL_MIX_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_REL_MIX_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ - OP_REL_MIX_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) - -OPS_MIX_INT(bool, 1, false) -OPS_MIX_INT(char, 8, CHAR_IS_SIGNED) -OPS_MIX_INT(signed char, 8, true) -OPS_MIX_INT(unsigned char, 8, false) -OPS_MIX_INT(short, sizeof(short) * 8, true) -OPS_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) -OPS_MIX_INT(int, sizeof(int) * 8, true) -OPS_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) -OPS_MIX_INT(long, sizeof(long) * 8, true) -OPS_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) -OPS_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) -OPS_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) - -#undef OP_BIN_MIX_INT -#undef OP_BIN_SHIFT_INT -#undef OP_ASSIGN_MIX_INT -#undef OP_ASSIGN_RSHIFT_INT -#undef OP_ASSIGN_LSHIFT_INT -#undef OP_REL_MIX_INT -#undef OPS_MIX_INT - -#define OP_BIN_MIX_RANGE(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(const _private_range_ref<_AP_W1, _AP_S1>& op1, \ - const ap_private<_AP_W2, _AP_S2>& op2) { \ - return ap_private<_AP_W1, false>(op1).operator BIN_OP(op2); \ - } \ - template \ - INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<_AP_W2, \ - _AP_S2>::RTYPE \ - operator BIN_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ - const 
_private_range_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator BIN_OP(ap_private<_AP_W2, false>(op2)); \ - } - -#define OP_ASSIGN_MIX_RANGE(ASSIGN_OP) \ - template \ - INLINE ap_private<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - ap_private<_AP_W1, _AP_S1>& op1, \ - const _private_range_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator ASSIGN_OP(ap_private<_AP_W2, false>(op2)); \ - } \ - template \ - INLINE _private_range_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - _private_range_ref<_AP_W1, _AP_S1>& op1, \ - ap_private<_AP_W2, _AP_S2>& op2) { \ - ap_private<_AP_W1, false> tmp(op1); \ - tmp.operator ASSIGN_OP(op2); \ - op1 = tmp; \ - return op1; \ - } - -#define OP_REL_MIX_RANGE(REL_OP) \ - template \ - INLINE bool operator REL_OP(const _private_range_ref<_AP_W1, _AP_S1>& op1, \ - const ap_private<_AP_W2, _AP_S2>& op2) { \ - return ap_private<_AP_W1, false>(op1).operator REL_OP(op2); \ - } \ - template \ - INLINE bool operator REL_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ - const _private_range_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator REL_OP(op2.operator ap_private<_AP_W2, false>()); \ - } - -OP_BIN_MIX_RANGE(+, plus) -OP_BIN_MIX_RANGE(-, minus) -OP_BIN_MIX_RANGE(*, mult) -OP_BIN_MIX_RANGE(/, div) -OP_BIN_MIX_RANGE(%, mod) -OP_BIN_MIX_RANGE(&, logic) -OP_BIN_MIX_RANGE(|, logic) -OP_BIN_MIX_RANGE(^, logic) -OP_BIN_MIX_RANGE(>>, arg1) -OP_BIN_MIX_RANGE(<<, arg1) -#undef OP_BIN_MIX_RANGE - -OP_ASSIGN_MIX_RANGE(+=) -OP_ASSIGN_MIX_RANGE(-=) -OP_ASSIGN_MIX_RANGE(*=) -OP_ASSIGN_MIX_RANGE(/=) -OP_ASSIGN_MIX_RANGE(%=) -OP_ASSIGN_MIX_RANGE(&=) -OP_ASSIGN_MIX_RANGE(|=) -OP_ASSIGN_MIX_RANGE(^=) -OP_ASSIGN_MIX_RANGE(>>=) -OP_ASSIGN_MIX_RANGE(<<=) -#undef OP_ASSIGN_MIX_RANGE - -OP_REL_MIX_RANGE(>) -OP_REL_MIX_RANGE(<) -OP_REL_MIX_RANGE(>=) -OP_REL_MIX_RANGE(<=) -OP_REL_MIX_RANGE(==) -OP_REL_MIX_RANGE(!=) -#undef OP_REL_MIX_RANGE - -#define OP_BIN_MIX_BIT(BIN_OP, RTYPE) \ - template \ - INLINE typename ap_private<1, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP(const _private_bit_ref<_AP_W1, _AP_S1>& op1, \ - const ap_private<_AP_W2, _AP_S2>& op2) { \ - return ap_private<1, false>(op1).operator BIN_OP(op2); \ - } \ - template \ - INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<1, false>::RTYPE \ - operator BIN_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ - const _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator BIN_OP(ap_private<1, false>(op2)); \ - } - -#define OP_ASSIGN_MIX_BIT(ASSIGN_OP) \ - template \ - INLINE ap_private<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - ap_private<_AP_W1, _AP_S1>& op1, \ - _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator ASSIGN_OP(ap_private<1, false>(op2)); \ - } \ - template \ - INLINE _private_bit_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ - _private_bit_ref<_AP_W1, _AP_S1>& op1, \ - ap_private<_AP_W2, _AP_S2>& op2) { \ - ap_private<1, false> tmp(op1); \ - tmp.operator ASSIGN_OP(op2); \ - op1 = tmp; \ - return op1; \ - } - -#define OP_REL_MIX_BIT(REL_OP) \ - template \ - INLINE bool operator REL_OP(const _private_bit_ref<_AP_W1, _AP_S1>& op1, \ - const ap_private<_AP_W2, _AP_S2>& op2) { \ - return ap_private<_AP_W1, false>(op1).operator REL_OP(op2); \ - } \ - template \ - INLINE bool operator REL_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ - const _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ - return op1.operator REL_OP(ap_private<1, false>(op2)); \ - } - -OP_ASSIGN_MIX_BIT(+=) -OP_ASSIGN_MIX_BIT(-=) -OP_ASSIGN_MIX_BIT(*=) -OP_ASSIGN_MIX_BIT(/=) -OP_ASSIGN_MIX_BIT(%=) -OP_ASSIGN_MIX_BIT(&=) 
-OP_ASSIGN_MIX_BIT(|=) -OP_ASSIGN_MIX_BIT(^=) -OP_ASSIGN_MIX_BIT(>>=) -OP_ASSIGN_MIX_BIT(<<=) -#undef OP_ASSIGN_MIX_BIT - -OP_BIN_MIX_BIT(+, plus) -OP_BIN_MIX_BIT(-, minus) -OP_BIN_MIX_BIT(*, mult) -OP_BIN_MIX_BIT(/, div) -OP_BIN_MIX_BIT(%, mod) -OP_BIN_MIX_BIT(&, logic) -OP_BIN_MIX_BIT(|, logic) -OP_BIN_MIX_BIT(^, logic) -OP_BIN_MIX_BIT(>>, arg1) -OP_BIN_MIX_BIT(<<, arg1) -#undef OP_BIN_MIX_BIT - -OP_REL_MIX_BIT(>) -OP_REL_MIX_BIT(<) -OP_REL_MIX_BIT(<=) -OP_REL_MIX_BIT(>=) -OP_REL_MIX_BIT(==) -OP_REL_MIX_BIT(!=) -#undef OP_REL_MIX_BIT - -#define REF_REL_OP_MIX_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE bool operator REL_OP(const _private_range_ref<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return (ap_private<_AP_W, false>(op)) \ - . \ - operator REL_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ - } \ - template \ - INLINE bool operator REL_OP(C_TYPE op2, \ - const _private_range_ref<_AP_W, _AP_S>& op) { \ - return ap_private<_AP_W2, _AP_S2>(op2).operator REL_OP( \ - ap_private<_AP_W, false>(op)); \ - } \ - template \ - INLINE bool operator REL_OP(const _private_bit_ref<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return (bool(op))REL_OP op2; \ - } \ - template \ - INLINE bool operator REL_OP(C_TYPE op2, \ - const _private_bit_ref<_AP_W, _AP_S>& op) { \ - return op2 REL_OP(bool(op)); \ - } - -#define REF_REL_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ - REF_REL_OP_MIX_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_REL_OP_MIX_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_REL_OP_MIX_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_REL_OP_MIX_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_REL_OP_MIX_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_REL_OP_MIX_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) - -REF_REL_MIX_INT(bool, 1, false) -REF_REL_MIX_INT(char, 8, CHAR_IS_SIGNED) -REF_REL_MIX_INT(signed char, 8, true) -REF_REL_MIX_INT(unsigned char, 8, false) -REF_REL_MIX_INT(short, sizeof(short) * 8, true) -REF_REL_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) -REF_REL_MIX_INT(int, sizeof(int) * 8, true) -REF_REL_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) -REF_REL_MIX_INT(long, sizeof(long) * 8, true) -REF_REL_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) -REF_REL_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) -REF_REL_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) -#undef REF_REL_OP_MIX_INT -#undef REF_REL_MIX_INT - -#define REF_BIN_OP_MIX_INT(BIN_OP, RTYPE, C_TYPE, _AP_W2, _AP_S2) \ - template \ - INLINE \ - typename ap_private<_AP_W, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ - operator BIN_OP(const _private_range_ref<_AP_W, _AP_S>& op, \ - C_TYPE op2) { \ - return (ap_private<_AP_W, false>(op)) \ - . 
\ - operator BIN_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ - } \ - template \ - INLINE \ - typename ap_private<_AP_W2, _AP_S2>::template RType<_AP_W, false>::RTYPE \ - operator BIN_OP(C_TYPE op2, \ - const _private_range_ref<_AP_W, _AP_S>& op) { \ - return ap_private<_AP_W2, _AP_S2>(op2).operator BIN_OP( \ - ap_private<_AP_W, false>(op)); \ - } - -#define REF_BIN_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ - REF_BIN_OP_MIX_INT(+, plus, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(-, minus, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(*, mult, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(/, div, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(%, mod, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(&, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(|, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(^, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(>>, arg1, C_TYPE, (_AP_W2), (_AP_S2)) \ - REF_BIN_OP_MIX_INT(<<, arg1, C_TYPE, (_AP_W2), (_AP_S2)) - -REF_BIN_MIX_INT(bool, 1, false) -REF_BIN_MIX_INT(char, 8, CHAR_IS_SIGNED) -REF_BIN_MIX_INT(signed char, 8, true) -REF_BIN_MIX_INT(unsigned char, 8, false) -REF_BIN_MIX_INT(short, sizeof(short) * 8, true) -REF_BIN_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) -REF_BIN_MIX_INT(int, sizeof(int) * 8, true) -REF_BIN_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) -REF_BIN_MIX_INT(long, sizeof(long) * 8, true) -REF_BIN_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) -REF_BIN_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) -REF_BIN_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) -#undef REF_BIN_OP_MIX_INT -#undef REF_BIN_MIX_INT - -#define REF_BIN_OP(BIN_OP, RTYPE) \ - template \ - INLINE \ - typename ap_private<_AP_W, false>::template RType<_AP_W2, false>::RTYPE \ - operator BIN_OP(const _private_range_ref<_AP_W, _AP_S>& lhs, \ - const _private_range_ref<_AP_W2, _AP_S2>& rhs) { \ - return ap_private<_AP_W, false>(lhs).operator BIN_OP( \ - ap_private<_AP_W2, false>(rhs)); \ - } - -REF_BIN_OP(+, plus) -REF_BIN_OP(-, minus) -REF_BIN_OP(*, mult) -REF_BIN_OP(/, div) -REF_BIN_OP(%, mod) -REF_BIN_OP(&, logic) -REF_BIN_OP(|, logic) -REF_BIN_OP(^, logic) -REF_BIN_OP(>>, arg1) -REF_BIN_OP(<<, arg1) -#undef REF_BIN_OP - -//************************************************************************ -// Implement -// ap_private = ap_concat_ref OP ap_concat_ref -// for operators +, -, *, /, %, >>, <<, &, |, ^ -// Without these operators the operands are converted to int64 and -// larger results lose informations (higher order bits). -// -// operand OP -// / | -// left-concat right-concat -// / | / | -// -// -// _AP_LW1, _AP_LT1 (width and type of left-concat's left side) -// _AP_LW2, _AP_LT2 (width and type of left-concat's right side) -// Similarly for RHS of operand OP: _AP_RW1, AP_RW2, _AP_RT1, _AP_RT2 -// -// In Verilog 2001 result of concatenation is always unsigned even -// when both sides are signed. -//************************************************************************ - -#endif // ifndef __AP_PRIVATE_H__ - -// -*- cpp -*- +/* + * Copyright 2011-2019 Xilinx, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef __AP_PRIVATE_H__ +#define __AP_PRIVATE_H__ + +// common macros and type declarations are now defined in ap_common.h, and +// ap_private becomes part of it. +#ifndef __AP_COMMON_H__ +#error "etc/ap_private.h cannot be included directly." +#endif + +// forward declarations +//template +//class ap_private; // moved to ap_common.h +template +struct _private_range_ref; +template +struct _private_bit_ref; + +// TODO clean up this part. +#ifndef LLVM_SUPPORT_MATHEXTRAS_H +#define LLVM_SUPPORT_MATHEXTRAS_H + +#ifdef _MSC_VER +#if _MSC_VER <= 1500 +typedef __int8 int8_t; +typedef unsigned __int8 uint8_t; +typedef __int16 int16_t; +typedef unsigned __int16 uint16_t; +typedef __int32 int32_t; +typedef unsigned __int32 uint32_t; +typedef __int64 int64_t; +typedef unsigned __int64 uint64_t; +#else +#include +#endif +#else +#include +#endif + +#ifndef INLINE +#define INLINE inline +// Enable to debug ap_int/ap_fixed +// #define INLINE __attribute__((weak)) +#endif + +// NOTE: The following support functions use the _32/_64 extensions instead of +// type overloading so that signed and unsigned integers can be used without +// ambiguity. +namespace AESL_std { +template +DataType INLINE min(DataType a, DataType b) { + return (a >= b) ? b : a; +} + +template +DataType INLINE max(DataType a, DataType b) { + return (a >= b) ? a : b; +} +} // namespace AESL_std + +// TODO clean up included headers. +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ap_private_ops { +/// Hi_32 - This function returns the high 32 bits of a 64 bit value. +static INLINE uint32_t Hi_32(uint64_t Value) { + return static_cast(Value >> 32); +} + +/// Lo_32 - This function returns the low 32 bits of a 64 bit value. +static INLINE uint32_t Lo_32(uint64_t Value) { + return static_cast(Value); +} + +template +INLINE bool isNegative(const ap_private<_AP_W, false>& a) { + return false; +} + +template +INLINE bool isNegative(const ap_private<_AP_W, true>& a) { + enum { + APINT_BITS_PER_WORD = 64, + _AP_N = (_AP_W + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD + }; + static const uint64_t sign_mask = 1ULL << ((_AP_W - 1) % APINT_BITS_PER_WORD); + return (sign_mask & a.get_pVal(_AP_N - 1)) != 0; +} + +/// CountLeadingZeros_32 - this function performs the platform optimal form of +/// counting the number of zeros from the most significant bit to the first one +/// bit. Ex. CountLeadingZeros_32(0x00F000FF) == 8. +/// Returns 32 if the word is zero. 
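The CountLeadingZeros_32 fallback that follows narrows a 32-bit search window by bisection: probe the upper half of the remaining window; if it is non-empty, zoom into it, otherwise record that many zeros and keep the lower half. A minimal standalone sketch of the same technique, with spot checks (the name clz32_bisect is illustrative, not part of this header):

#include <cassert>
#include <cstdint>

// Bisection count-leading-zeros: halve the window each iteration.
static unsigned clz32_bisect(uint32_t v) {
  if (v == 0) return 32;
  unsigned count = 0;
  for (unsigned shift = 16; shift; shift >>= 1) {
    uint32_t hi = v >> shift;
    if (hi) v = hi;       // upper half non-empty: search inside it
    else count |= shift;  // upper half empty: 'shift' more leading zeros
  }
  return count;
}

int main() {
  assert(clz32_bisect(0x00F000FFu) == 8);  // the example quoted above
  assert(clz32_bisect(1u) == 31);
  assert(clz32_bisect(0x80000000u) == 0);
  return 0;
}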
+static INLINE unsigned CountLeadingZeros_32(uint32_t Value) { + unsigned Count; // result +#if __GNUC__ >= 4 +// PowerPC is defined for __builtin_clz(0) +#if !defined(__ppc__) && !defined(__ppc64__) + if (Value == 0) return 32; +#endif + Count = __builtin_clz(Value); +#else + if (Value == 0) return 32; + Count = 0; + // bisecton method for count leading zeros + for (unsigned Shift = 32 >> 1; Shift; Shift >>= 1) { + uint32_t Tmp = (Value) >> (Shift); + if (Tmp) { + Value = Tmp; + } else { + Count |= Shift; + } + } +#endif + return Count; +} + +/// CountLeadingZeros_64 - This function performs the platform optimal form +/// of counting the number of zeros from the most significant bit to the first +/// one bit (64 bit edition.) +/// Returns 64 if the word is zero. +static INLINE unsigned CountLeadingZeros_64(uint64_t Value) { + unsigned Count; // result +#if __GNUC__ >= 4 +// PowerPC is defined for __builtin_clzll(0) +#if !defined(__ppc__) && !defined(__ppc64__) + if (!Value) return 64; +#endif + Count = __builtin_clzll(Value); +#else + if (sizeof(long) == sizeof(int64_t)) { + if (!Value) return 64; + Count = 0; + // bisecton method for count leading zeros + for (unsigned Shift = 64 >> 1; Shift; Shift >>= 1) { + uint64_t Tmp = (Value) >> (Shift); + if (Tmp) { + Value = Tmp; + } else { + Count |= Shift; + } + } + } else { + // get hi portion + uint32_t Hi = Hi_32(Value); + + // if some bits in hi portion + if (Hi) { + // leading zeros in hi portion plus all bits in lo portion + Count = CountLeadingZeros_32(Hi); + } else { + // get lo portion + uint32_t Lo = Lo_32(Value); + // same as 32 bit value + Count = CountLeadingZeros_32(Lo) + 32; + } + } +#endif + return Count; +} + +/// CountTrailingZeros_64 - This function performs the platform optimal form +/// of counting the number of zeros from the least significant bit to the first +/// one bit (64 bit edition.) +/// Returns 64 if the word is zero. +static INLINE unsigned CountTrailingZeros_64(uint64_t Value) { +#if __GNUC__ >= 4 + return (Value != 0) ? __builtin_ctzll(Value) : 64; +#else + static const unsigned Mod67Position[] = { + 64, 0, 1, 39, 2, 15, 40, 23, 3, 12, 16, 59, 41, 19, 24, 54, 4, + 64, 13, 10, 17, 62, 60, 28, 42, 30, 20, 51, 25, 44, 55, 47, 5, 32, + 65, 38, 14, 22, 11, 58, 18, 53, 63, 9, 61, 27, 29, 50, 43, 46, 31, + 37, 21, 57, 52, 8, 26, 49, 45, 36, 56, 7, 48, 35, 6, 34, 33, 0}; + return Mod67Position[(uint64_t)(-(int64_t)Value & (int64_t)Value) % 67]; +#endif +} + +/// CountPopulation_64 - this function counts the number of set bits in a value, +/// (64 bit edition.) 
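The CountTrailingZeros_64 fallback above relies on two facts worth spelling out: v & -v isolates the lowest set bit of v (a two's-complement identity), and because 2 is a primitive root modulo the prime 67, the residues of 2^0 .. 2^63 mod 67 are pairwise distinct, so a 67-entry table can invert the residue back to the bit index. A self-contained check of both facts, with the table built on the fly rather than hard-coded:

#include <cassert>
#include <cstdint>

static unsigned ctz64_naive(uint64_t v) {
  unsigned k = 0;
  while (!(v & 1)) { v >>= 1; ++k; }
  return k;
}

int main() {
  unsigned table[67];
  for (unsigned i = 0; i < 67; ++i) table[i] = 64;  // residue 0 <=> v == 0
  uint64_t pow2 = 1;
  for (unsigned k = 0; k < 64; ++k) { table[pow2 % 67] = k; pow2 <<= 1; }

  for (unsigned k = 0; k < 64; ++k) {
    uint64_t v = (uint64_t(1) << k) | (uint64_t(1) << 63);  // noise above bit k
    uint64_t lowest = v & (0 - v);  // isolate the lowest set bit
    assert(table[lowest % 67] == k);
    assert(ctz64_naive(v) == k);
  }
  return 0;
}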
+static INLINE unsigned CountPopulation_64(uint64_t Value) { +#if __GNUC__ >= 4 + return __builtin_popcountll(Value); +#else + uint64_t v = Value - (((Value) >> 1) & 0x5555555555555555ULL); + v = (v & 0x3333333333333333ULL) + (((v) >> 2) & 0x3333333333333333ULL); + v = (v + ((v) >> 4)) & 0x0F0F0F0F0F0F0F0FULL; + return unsigned((uint64_t)(v * 0x0101010101010101ULL) >> 56); +#endif +} + +static INLINE uint32_t countLeadingOnes_64(uint64_t __V, uint32_t skip) { + uint32_t Count = 0; + if (skip) (__V) <<= (skip); + while (__V && (__V & (1ULL << 63))) { + Count++; + (__V) <<= 1; + } + return Count; +} + +static INLINE std::string oct2Bin(char oct) { + switch (oct) { + case '\0': { + return ""; + } + case '.': { + return "."; + } + case '0': { + return "000"; + } + case '1': { + return "001"; + } + case '2': { + return "010"; + } + case '3': { + return "011"; + } + case '4': { + return "100"; + } + case '5': { + return "101"; + } + case '6': { + return "110"; + } + case '7': { + return "111"; + } + } + assert(0 && "Invalid character in digit string"); + return ""; +} + +static INLINE std::string hex2Bin(char hex) { + switch (hex) { + case '\0': { + return ""; + } + case '.': { + return "."; + } + case '0': { + return "0000"; + } + case '1': { + return "0001"; + } + case '2': { + return "0010"; + } + case '3': { + return "0011"; + } + case '4': { + return "0100"; + } + case '5': { + return "0101"; + } + case '6': { + return "0110"; + } + case '7': { + return "0111"; + } + case '8': { + return "1000"; + } + case '9': { + return "1001"; + } + case 'A': + case 'a': { + return "1010"; + } + case 'B': + case 'b': { + return "1011"; + } + case 'C': + case 'c': { + return "1100"; + } + case 'D': + case 'd': { + return "1101"; + } + case 'E': + case 'e': { + return "1110"; + } + case 'F': + case 'f': { + return "1111"; + } + } + assert(0 && "Invalid character in digit string"); + return ""; +} + +static INLINE uint32_t decode_digit(char cdigit, int radix) { + uint32_t digit = 0; + if (radix == 16) { +#define isxdigit(c) \ + (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') || \ + ((c) >= 'A' && (c) <= 'F')) +#define isdigit(c) ((c) >= '0' && (c) <= '9') + if (!isxdigit(cdigit)) assert(0 && "Invalid hex digit in string"); + if (isdigit(cdigit)) + digit = cdigit - '0'; + else if (cdigit >= 'a') + digit = cdigit - 'a' + 10; + else if (cdigit >= 'A') + digit = cdigit - 'A' + 10; + else + assert(0 && "huh? we shouldn't get here"); + } else if (isdigit(cdigit)) { + digit = cdigit - '0'; + } else { + assert(0 && "Invalid character in digit string"); + } +#undef isxdigit +#undef isdigit + return digit; +} + +// Determine the radix of "val". +static INLINE std::string parseString(const std::string& input, unsigned char& radix) { + size_t len = input.length(); + if (len == 0) { + if (radix == 0) radix = 10; + return input; + } + + size_t startPos = 0; + // Trim whitespace + while (input[startPos] == ' ' && startPos < len) startPos++; + while (input[len - 1] == ' ' && startPos < len) len--; + + std::string val = input.substr(startPos, len - startPos); + // std::cout << "val = " << val << "\n"; + len = val.length(); + startPos = 0; + + // If the length of the string is less than 2, then radix + // is decimal and there is no exponent. 
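hex2Bin above expands one hex digit into exactly four binary digits, which is what lets parseString convert a hex (or, via oct2Bin, octal) mantissa to a binary string one character at a time before any exponent handling. A compact sketch of the same per-digit expansion (hex_digit_to_bits is an illustrative name):

#include <cassert>
#include <cctype>
#include <string>

static std::string hex_digit_to_bits(char c) {
  static const char* bits[16] = {
      "0000", "0001", "0010", "0011", "0100", "0101", "0110", "0111",
      "1000", "1001", "1010", "1011", "1100", "1101", "1110", "1111"};
  unsigned v = std::isdigit((unsigned char)c)
                   ? (unsigned)(c - '0')
                   : (unsigned)(std::tolower((unsigned char)c) - 'a' + 10);
  return bits[v];
}

int main() {
  std::string out;
  for (char c : std::string("1A")) out += hex_digit_to_bits(c);
  assert(out == "00011010");  // 0x1A == 0b00011010
  return 0;
}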
+ if (len < 2) { + if (radix == 0) radix = 10; + return val; + } + + bool isNegative = false; + std::string ans; + + // First check to see if we start with a sign indicator + if (val[0] == '-') { + ans = "-"; + ++startPos; + isNegative = true; + } else if (val[0] == '+') + ++startPos; + + if (len - startPos < 2) { + if (radix == 0) radix = 10; + return val; + } + + if (val.substr(startPos, 2) == "0x" || val.substr(startPos, 2) == "0X") { + // If we start with "0x", then the radix is hex. + radix = 16; + startPos += 2; + } else if (val.substr(startPos, 2) == "0b" || + val.substr(startPos, 2) == "0B") { + // If we start with "0b", then the radix is binary. + radix = 2; + startPos += 2; + } else if (val.substr(startPos, 2) == "0o" || + val.substr(startPos, 2) == "0O") { + // If we start with "0o", then the radix is octal. + radix = 8; + startPos += 2; + } else if (radix == 0) { + radix = 10; + } + + int exp = 0; + if (radix == 10) { + // If radix is decimal, then see if there is an + // exponent indicator. + size_t expPos = val.find('e'); + bool has_exponent = true; + if (expPos == std::string::npos) expPos = val.find('E'); + if (expPos == std::string::npos) { + // No exponent indicator, so the mantissa goes to the end. + expPos = len; + has_exponent = false; + } + // std::cout << "startPos = " << startPos << " " << expPos << "\n"; + + ans += val.substr(startPos, expPos - startPos); + if (has_exponent) { + // Parse the exponent. + std::istringstream iss(val.substr(expPos + 1, len - expPos - 1)); + iss >> exp; + } + } else { + // Check for a binary exponent indicator. + size_t expPos = val.find('p'); + bool has_exponent = true; + if (expPos == std::string::npos) expPos = val.find('P'); + if (expPos == std::string::npos) { + // No exponent indicator, so the mantissa goes to the end. + expPos = len; + has_exponent = false; + } + + // std::cout << "startPos = " << startPos << " " << expPos << "\n"; + + assert(startPos <= expPos); + // Convert to binary as we go. + for (size_t i = startPos; i < expPos; ++i) { + if (radix == 16) { + ans += hex2Bin(val[i]); + } else if (radix == 8) { + ans += oct2Bin(val[i]); + } else { // radix == 2 + ans += val[i]; + } + } + // End in binary + radix = 2; + if (has_exponent) { + // Parse the exponent. + std::istringstream iss(val.substr(expPos + 1, len - expPos - 1)); + iss >> exp; + } + } + if (exp == 0) return ans; + + size_t decPos = ans.find('.'); + if (decPos == std::string::npos) decPos = ans.length(); + if ((int)decPos + exp >= (int)ans.length()) { + int i = decPos; + for (; i < (int)ans.length() - 1; ++i) ans[i] = ans[i + 1]; + for (; i < (int)ans.length(); ++i) ans[i] = '0'; + for (; i < (int)decPos + exp; ++i) ans += '0'; + return ans; + } else if ((int)decPos + exp < (int)isNegative) { + std::string dupAns = "0."; + if (ans[0] == '-') dupAns = "-0."; + for (int i = 0; i < isNegative - (int)decPos - exp; ++i) dupAns += '0'; + for (size_t i = isNegative; i < ans.length(); ++i) + if (ans[i] != '.') dupAns += ans[i]; + return dupAns; + } + + if (exp > 0) + for (size_t i = decPos; i < decPos + exp; ++i) ans[i] = ans[i + 1]; + else { + if (decPos == ans.length()) ans += ' '; + for (int i = decPos; i > (int)decPos + exp; --i) ans[i] = ans[i - 1]; + } + ans[decPos + exp] = '.'; + return ans; +} + +/// sub_1 - This function subtracts a single "digit" (64-bit word), y, from +/// the multi-digit integer array, x[], propagating the borrowed 1 value until +/// no further borrowing is neeeded or it runs out of "digits" in x. 
The result +/// is 1 if "borrowing" exhausted the digits in x, or 0 if x was not exhausted. +/// In other words, if y > x then this function returns 1, otherwise 0. +/// @returns the borrow out of the subtraction +static INLINE bool sub_1(uint64_t x[], uint32_t len, uint64_t y) { + for (uint32_t i = 0; i < len; ++i) { + uint64_t __X = x[i]; + x[i] -= y; + if (y > __X) + y = 1; // We have to "borrow 1" from next "digit" + else { + y = 0; // No need to borrow + break; // Remaining digits are unchanged so exit early + } + } + return (y != 0); +} + +/// add_1 - This function adds a single "digit" integer, y, to the multiple +/// "digit" integer array, x[]. x[] is modified to reflect the addition and +/// 1 is returned if there is a carry out, otherwise 0 is returned. +/// @returns the carry of the addition. +static INLINE bool add_1(uint64_t dest[], uint64_t x[], uint32_t len, + uint64_t y) { + for (uint32_t i = 0; i < len; ++i) { + dest[i] = y + x[i]; + if (dest[i] < y) + y = 1; // Carry one to next digit. + else { + y = 0; // No need to carry so exit early + break; + } + } + return (y != 0); +} + +/// add - This function adds the integer array x to the integer array Y and +/// places the result in dest. +/// @returns the carry out from the addition +/// @brief General addition of 64-bit integer arrays +static INLINE bool add(uint64_t* dest, const uint64_t* x, const uint64_t* y, + uint32_t destlen, uint32_t xlen, uint32_t ylen, + bool xsigned, bool ysigned) { + bool carry = false; + uint32_t len = AESL_std::min(xlen, ylen); + uint32_t i; + for (i = 0; i < len && i < destlen; ++i) { + uint64_t limit = + AESL_std::min(x[i], y[i]); // must come first in case dest == x + dest[i] = x[i] + y[i] + carry; + carry = dest[i] < limit || (carry && dest[i] == limit); + } + if (xlen > ylen) { + const uint64_t yext = ysigned && int64_t(y[ylen - 1]) < 0 ? -1 : 0; + for (i = ylen; i < xlen && i < destlen; i++) { + uint64_t limit = AESL_std::min(x[i], yext); + dest[i] = x[i] + yext + carry; + carry = (dest[i] < limit) || (carry && dest[i] == limit); + } + } else if (ylen > xlen) { + const uint64_t xext = xsigned && int64_t(x[xlen - 1]) < 0 ? -1 : 0; + for (i = xlen; i < ylen && i < destlen; i++) { + uint64_t limit = AESL_std::min(xext, y[i]); + dest[i] = xext + y[i] + carry; + carry = (dest[i] < limit) || (carry && dest[i] == limit); + } + } + return carry; +} + +/// @returns returns the borrow out. +/// @brief Generalized subtraction of 64-bit integer arrays. +static INLINE bool sub(uint64_t* dest, const uint64_t* x, const uint64_t* y, + uint32_t destlen, uint32_t xlen, uint32_t ylen, + bool xsigned, bool ysigned) { + bool borrow = false; + uint32_t i; + uint32_t len = AESL_std::min(xlen, ylen); + for (i = 0; i < len && i < destlen; ++i) { + uint64_t x_tmp = borrow ? x[i] - 1 : x[i]; + borrow = y[i] > x_tmp || (borrow && x[i] == 0); + dest[i] = x_tmp - y[i]; + } + if (xlen > ylen) { + const uint64_t yext = ysigned && int64_t(y[ylen - 1]) < 0 ? -1 : 0; + for (i = ylen; i < xlen && i < destlen; i++) { + uint64_t x_tmp = borrow ? x[i] - 1 : x[i]; + borrow = yext > x_tmp || (borrow && x[i] == 0); + dest[i] = x_tmp - yext; + } + } else if (ylen > xlen) { + const uint64_t xext = xsigned && int64_t(x[xlen - 1]) < 0 ? -1 : 0; + for (i = xlen; i < ylen && i < destlen; i++) { + uint64_t x_tmp = borrow ? 
xext - 1 : xext; + borrow = y[i] > x_tmp || (borrow && xext == 0); + dest[i] = x_tmp - y[i]; + } + } + return borrow; +} + +/// Subtracts the RHS ap_private from this ap_private +/// @returns this, after subtraction +/// @brief Subtraction assignment operator. + +/// Multiplies an integer array, x by a a uint64_t integer and places the result +/// into dest. +/// @returns the carry out of the multiplication. +/// @brief Multiply a multi-digit ap_private by a single digit (64-bit) integer. +static INLINE uint64_t mul_1(uint64_t dest[], const uint64_t x[], uint32_t len, + uint64_t y) { + // Split y into high 32-bit part (hy) and low 32-bit part (ly) + uint64_t ly = y & 0xffffffffULL, hy = (y) >> 32; + uint64_t carry = 0; + static const uint64_t two_power_32 = 1ULL << 32; + // For each digit of x. + for (uint32_t i = 0; i < len; ++i) { + // Split x into high and low words + uint64_t lx = x[i] & 0xffffffffULL; + uint64_t hx = (x[i]) >> 32; + // hasCarry - A flag to indicate if there is a carry to the next digit. + // hasCarry == 0, no carry + // hasCarry == 1, has carry + // hasCarry == 2, no carry and the calculation result == 0. + uint8_t hasCarry = 0; + dest[i] = carry + lx * ly; + // Determine if the add above introduces carry. + hasCarry = (dest[i] < carry) ? 1 : 0; + carry = hx * ly + ((dest[i]) >> 32) + (hasCarry ? two_power_32 : 0); + // The upper limit of carry can be (2^32 - 1)(2^32 - 1) + + // (2^32 - 1) + 2^32 = 2^64. + hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0); + + carry += (lx * hy) & 0xffffffffULL; + dest[i] = ((carry) << 32) | (dest[i] & 0xffffffffULL); + carry = (((!carry && hasCarry != 2) || hasCarry == 1) ? two_power_32 : 0) + + ((carry) >> 32) + ((lx * hy) >> 32) + hx * hy; + } + return carry; +} + +/// Multiplies integer array x by integer array y and stores the result into +/// the integer array dest. Note that dest's size must be >= xlen + ylen in +/// order to +/// do a full precision computation. If it is not, then only the low-order words +/// are returned. +/// @brief Generalized multiplicate of integer arrays. +static INLINE void mul(uint64_t dest[], const uint64_t x[], uint32_t xlen, + const uint64_t y[], uint32_t ylen, uint32_t destlen) { + assert(xlen > 0); + assert(ylen > 0); + assert(destlen >= xlen + ylen); + if (xlen < destlen) dest[xlen] = mul_1(dest, x, xlen, y[0]); + for (uint32_t i = 1; i < ylen; ++i) { + uint64_t ly = y[i] & 0xffffffffULL, hy = (y[i]) >> 32; + uint64_t carry = 0, lx = 0, hx = 0; + for (uint32_t j = 0; j < xlen; ++j) { + lx = x[j] & 0xffffffffULL; + hx = (x[j]) >> 32; + // hasCarry - A flag to indicate if has carry. + // hasCarry == 0, no carry + // hasCarry == 1, has carry + // hasCarry == 2, no carry and the calculation result == 0. + uint8_t hasCarry = 0; + uint64_t resul = carry + lx * ly; + hasCarry = (resul < carry) ? 1 : 0; + carry = (hasCarry ? (1ULL << 32) : 0) + hx * ly + ((resul) >> 32); + hasCarry = (!carry && hasCarry) ? 1 : (!carry ? 2 : 0); + carry += (lx * hy) & 0xffffffffULL; + resul = ((carry) << 32) | (resul & 0xffffffffULL); + if (i + j < destlen) dest[i + j] += resul; + carry = + (((!carry && hasCarry != 2) || hasCarry == 1) ? (1ULL << 32) : 0) + + ((carry) >> 32) + (dest[i + j] < resul ? 1 : 0) + ((lx * hy) >> 32) + + hx * hy; + } + if (i + xlen < destlen) dest[i + xlen] = carry; + } +} + +/// Implementation of Knuth's Algorithm D (Division of nonnegative integers) +/// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The +/// variables here have the same names as in the algorithm. 
Comments explain +/// the algorithm and any deviation from it. +static INLINE void KnuthDiv(uint32_t* u, uint32_t* v, uint32_t* q, uint32_t* r, + uint32_t m, uint32_t n) { + assert(u && "Must provide dividend"); + assert(v && "Must provide divisor"); + assert(q && "Must provide quotient"); + assert(u != v && u != q && v != q && "Must us different memory"); + assert(n > 1 && "n must be > 1"); + + // Knuth uses the value b as the base of the number system. In our case b + // is 2^31 so we just set it to -1u. + uint64_t b = uint64_t(1) << 32; + + // DEBUG(cerr << "KnuthDiv: m=" << m << " n=" << n << '\n'); + // DEBUG(cerr << "KnuthDiv: original:"); + // DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << + // u[i]); + // DEBUG(cerr << " by"); + // DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << + // v[i-1]); + // DEBUG(cerr << '\n'); + // D1. [Normalize.] Set d = b / (v[n-1] + 1) and multiply all the digits of + // u and v by d. Note that we have taken Knuth's advice here to use a power + // of 2 value for d such that d * v[n-1] >= b/2 (b is the base). A power of + // 2 allows us to shift instead of multiply and it is easy to determine the + // shift amount from the leading zeros. We are basically normalizing the u + // and v so that its high bits are shifted to the top of v's range without + // overflow. Note that this can require an extra word in u so that u must + // be of length m+n+1. + uint32_t shift = CountLeadingZeros_32(v[n - 1]); + uint32_t v_carry = 0; + uint32_t u_carry = 0; + if (shift) { + for (uint32_t i = 0; i < m + n; ++i) { + uint32_t u_tmp = (u[i]) >> (32 - shift); + u[i] = ((u[i]) << (shift)) | u_carry; + u_carry = u_tmp; + } + for (uint32_t i = 0; i < n; ++i) { + uint32_t v_tmp = (v[i]) >> (32 - shift); + v[i] = ((v[i]) << (shift)) | v_carry; + v_carry = v_tmp; + } + } + u[m + n] = u_carry; + // DEBUG(cerr << "KnuthDiv: normal:"); + // DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << std::setbase(16) << + // u[i]); + // DEBUG(cerr << " by"); + // DEBUG(for (int i = n; i >0; i--) cerr << " " << std::setbase(16) << + // v[i-1]); + // DEBUG(cerr << '\n'); + + // D2. [Initialize j.] Set j to m. This is the loop counter over the places. + int j = m; + do { + // DEBUG(cerr << "KnuthDiv: quotient digit #" << j << '\n'); + // D3. [Calculate q'.]. + // Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q') + // Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r') + // Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease + // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test + // on v[n-2] determines at high speed most of the cases in which the trial + // value qp is one too large, and it eliminates all cases where qp is two + // too large. + uint64_t dividend = ((uint64_t(u[j + n]) << 32) + u[j + n - 1]); + // DEBUG(cerr << "KnuthDiv: dividend == " << dividend << '\n'); + uint64_t qp = dividend / v[n - 1]; + uint64_t rp = dividend % v[n - 1]; + if (qp == b || qp * v[n - 2] > b * rp + u[j + n - 2]) { + qp--; + rp += v[n - 1]; + if (rp < b && (qp == b || qp * v[n - 2] > b * rp + u[j + n - 2])) qp--; + } + // DEBUG(cerr << "KnuthDiv: qp == " << qp << ", rp == " << rp << '\n'); + + // D4. [Multiply and subtract.] Replace (u[j+n]u[j+n-1]...u[j]) with + // (u[j+n]u[j+n-1]..u[j]) - qp * (v[n-1]...v[1]v[0]). This computation + // consists of a simple multiplication by a one-place number, combined with + // a subtraction. 
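Step D3 above estimates a quotient digit from the top two dividend digits and the top divisor digit, then corrects the estimate at most twice. A standalone numeric sketch of that estimate-and-correct step in base b = 2^32; the digit values are arbitrary samples chosen so that exactly one correction fires, and unsigned __int128 (a GCC/Clang extension) is used only to verify the result:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t b = uint64_t(1) << 32;
  // Top dividend digits u2:u1:u0 and divisor digits v1:v0, with the
  // divisor already normalized so that v1 >= b/2 (step D1).
  uint32_t u2 = 0x7FFFFFFF, u1 = 0xFFFFFFFF, u0 = 0x00000000;
  uint32_t v1 = 0x80000000, v0 = 0xFFFFFFFF;

  uint64_t dividend = (uint64_t(u2) << 32) | u1;
  uint64_t qp = dividend / v1;  // trial quotient q'
  uint64_t rp = dividend % v1;  // trial remainder r'
  if (qp == b || qp * v0 > b * rp + u0) {  // same test as in KnuthDiv
    qp--;
    rp += v1;
    if (rp < b && (qp == b || qp * v0 > b * rp + u0)) qp--;
  }

  // Exact 3-digit-by-2-digit quotient for comparison.
  unsigned __int128 N = ((unsigned __int128)dividend << 32) | u0;
  unsigned __int128 D = ((unsigned __int128)v1 << 32) | v0;
  assert(qp == (uint64_t)(N / D));  // corrected q' is the true digit here
  return 0;
}

In general q' can still end up one too large (that residual case is what the D6 add-back handles); for these sample digits the corrected estimate is already exact.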
+ bool isNeg = false; + for (uint32_t i = 0; i < n; ++i) { + uint64_t u_tmp = uint64_t(u[j + i]) | ((uint64_t(u[j + i + 1])) << 32); + uint64_t subtrahend = uint64_t(qp) * uint64_t(v[i]); + bool borrow = subtrahend > u_tmp; + /*DEBUG(cerr << "KnuthDiv: u_tmp == " << u_tmp + << ", subtrahend == " << subtrahend + << ", borrow = " << borrow << '\n');*/ + + uint64_t result = u_tmp - subtrahend; + uint32_t k = j + i; + u[k++] = (uint32_t)(result & (b - 1)); // subtract low word + u[k++] = (uint32_t)((result) >> 32); // subtract high word + while (borrow && k <= m + n) { // deal with borrow to the left + borrow = u[k] == 0; + u[k]--; + k++; + } + isNeg |= borrow; + /*DEBUG(cerr << "KnuthDiv: u[j+i] == " << u[j+i] << ", u[j+i+1] == " << + u[j+i+1] << '\n');*/ + } + /*DEBUG(cerr << "KnuthDiv: after subtraction:"); + DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]); + DEBUG(cerr << '\n');*/ + // The digits (u[j+n]...u[j]) should be kept positive; if the result of + // this step is actually negative, (u[j+n]...u[j]) should be left as the + // true value plus b**(n+1), namely as the b's complement of + // the true value, and a "borrow" to the left should be remembered. + // + if (isNeg) { + bool carry = true; // true because b's complement is "complement + 1" + for (uint32_t i = 0; i <= m + n; ++i) { + u[i] = ~u[i] + carry; // b's complement + carry = carry && u[i] == 0; + } + } + /*DEBUG(cerr << "KnuthDiv: after complement:"); + DEBUG(for (int i = m+n; i >=0; i--) cerr << " " << u[i]); + DEBUG(cerr << '\n');*/ + + // D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was + // negative, go to step D6; otherwise go on to step D7. + q[j] = (uint32_t)qp; + if (isNeg) { + // D6. [Add back]. The probability that this step is necessary is very + // small, on the order of only 2/b. Make sure that test data accounts for + // this possibility. Decrease q[j] by 1 + q[j]--; + // and add (0v[n-1]...v[1]v[0]) to (u[j+n]u[j+n-1]...u[j+1]u[j]). + // A carry will occur to the left of u[j+n], and it should be ignored + // since it cancels with the borrow that occurred in D4. + bool carry = false; + for (uint32_t i = 0; i < n; i++) { + uint32_t limit = AESL_std::min(u[j + i], v[i]); + u[j + i] += v[i] + carry; + carry = u[j + i] < limit || (carry && u[j + i] == limit); + } + u[j + n] += carry; + } + /*DEBUG(cerr << "KnuthDiv: after correction:"); + DEBUG(for (int i = m+n; i >=0; i--) cerr <<" " << u[i]); + DEBUG(cerr << "\nKnuthDiv: digit result = " << q[j] << '\n');*/ + + // D7. [Loop on j.] Decrease j by one. Now if j >= 0, go back to D3. + } while (--j >= 0); + + /*DEBUG(cerr << "KnuthDiv: quotient:"); + DEBUG(for (int i = m; i >=0; i--) cerr <<" " << q[i]); + DEBUG(cerr << '\n');*/ + + // D8. [Unnormalize]. Now q[...] is the desired quotient, and the desired + // remainder may be obtained by dividing u[...] by d. If r is non-null we + // compute the remainder (urem uses this). + if (r) { + // The value d is expressed by the "shift" value above since we avoided + // multiplication by d by using a shift left. So, all we have to do is + // shift right here. 
In order to mak + if (shift) { + uint32_t carry = 0; + // DEBUG(cerr << "KnuthDiv: remainder:"); + for (int i = n - 1; i >= 0; i--) { + r[i] = ((u[i]) >> (shift)) | carry; + carry = (u[i]) << (32 - shift); + // DEBUG(cerr << " " << r[i]); + } + } else { + for (int i = n - 1; i >= 0; i--) { + r[i] = u[i]; + // DEBUG(cerr << " " << r[i]); + } + } + // DEBUG(cerr << '\n'); + } + // DEBUG(cerr << std::setbase(10) << '\n'); +} + +template +void divide(const ap_private<_AP_W, _AP_S>& LHS, uint32_t lhsWords, + const ap_private<_AP_W, _AP_S>& RHS, uint32_t rhsWords, + ap_private<_AP_W, _AP_S>* Quotient, + ap_private<_AP_W, _AP_S>* Remainder) { + assert(lhsWords >= rhsWords && "Fractional result"); + enum { APINT_BITS_PER_WORD = 64 }; + // First, compose the values into an array of 32-bit words instead of + // 64-bit words. This is a necessity of both the "short division" algorithm + // and the the Knuth "classical algorithm" which requires there to be native + // operations for +, -, and * on an m bit value with an m*2 bit result. We + // can't use 64-bit operands here because we don't have native results of + // 128-bits. Furthremore, casting the 64-bit values to 32-bit values won't + // work on large-endian machines. + uint64_t mask = ~0ull >> (sizeof(uint32_t) * 8); + uint32_t n = rhsWords * 2; + uint32_t m = (lhsWords * 2) - n; + + // Allocate space for the temporary values we need either on the stack, if + // it will fit, or on the heap if it won't. + uint32_t SPACE[128]; + uint32_t* __U = 0; + uint32_t* __V = 0; + uint32_t* __Q = 0; + uint32_t* __R = 0; + if ((Remainder ? 4 : 3) * n + 2 * m + 1 <= 128) { + __U = &SPACE[0]; + __V = &SPACE[m + n + 1]; + __Q = &SPACE[(m + n + 1) + n]; + if (Remainder) __R = &SPACE[(m + n + 1) + n + (m + n)]; + } else { + __U = new uint32_t[m + n + 1]; + __V = new uint32_t[n]; + __Q = new uint32_t[m + n]; + if (Remainder) __R = new uint32_t[n]; + } + + // Initialize the dividend + memset(__U, 0, (m + n + 1) * sizeof(uint32_t)); + for (unsigned i = 0; i < lhsWords; ++i) { + uint64_t tmp = LHS.get_pVal(i); + __U[i * 2] = (uint32_t)(tmp & mask); + __U[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8); + } + __U[m + n] = 0; // this extra word is for "spill" in the Knuth algorithm. + + // Initialize the divisor + memset(__V, 0, (n) * sizeof(uint32_t)); + for (unsigned i = 0; i < rhsWords; ++i) { + uint64_t tmp = RHS.get_pVal(i); + __V[i * 2] = (uint32_t)(tmp & mask); + __V[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8); + } + + // initialize the quotient and remainder + memset(__Q, 0, (m + n) * sizeof(uint32_t)); + if (Remainder) memset(__R, 0, n * sizeof(uint32_t)); + + // Now, adjust m and n for the Knuth division. n is the number of words in + // the divisor. m is the number of words by which the dividend exceeds the + // divisor (i.e. m+n is the length of the dividend). These sizes must not + // contain any zero words or the Knuth algorithm fails. + for (unsigned i = n; i > 0 && __V[i - 1] == 0; i--) { + n--; + m++; + } + for (unsigned i = m + n; i > 0 && __U[i - 1] == 0; i--) m--; + + // If we're left with only a single word for the divisor, Knuth doesn't work + // so we implement the short division algorithm here. This is much simpler + // and faster because we are certain that we can divide a 64-bit quantity + // by a 32-bit quantity at hardware speed and short division is simply a + // series of such operations. This is just like doing short division but we + // are using base 2^32 instead of base 10. 
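The n == 1 fallback described above is plain schoolbook short division in base 2^32: each step divides a 64-bit partial dividend (running remainder : next digit) by the 32-bit divisor. A minimal sketch over a little-endian digit vector, matching the word order used here (short_div is an illustrative name):

#include <cassert>
#include <cstdint>
#include <vector>

// Divide a little-endian base-2^32 number in place; return the remainder.
static uint32_t short_div(std::vector<uint32_t>& digits, uint32_t divisor) {
  uint32_t remainder = 0;
  for (int i = (int)digits.size() - 1; i >= 0; --i) {
    uint64_t part = ((uint64_t)remainder << 32) | digits[i];
    digits[i] = (uint32_t)(part / divisor);
    remainder = (uint32_t)(part % divisor);
  }
  return remainder;
}

int main() {
  std::vector<uint32_t> n = {0x00000000, 0x00000001};  // the value 2^32
  uint32_t r = short_div(n, 3);
  assert(n[1] == 0 && n[0] == 0x55555555 && r == 1);  // 2^32 == 3 * 0x55555555 + 1
  return 0;
}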
+ assert(n != 0 && "Divide by zero?"); + if (n == 1) { + uint32_t divisor = __V[0]; + uint32_t remainder = 0; + for (int i = m + n - 1; i >= 0; i--) { + uint64_t partial_dividend = (uint64_t(remainder)) << 32 | __U[i]; + if (partial_dividend == 0) { + __Q[i] = 0; + remainder = 0; + } else if (partial_dividend < divisor) { + __Q[i] = 0; + remainder = (uint32_t)partial_dividend; + } else if (partial_dividend == divisor) { + __Q[i] = 1; + remainder = 0; + } else { + __Q[i] = (uint32_t)(partial_dividend / divisor); + remainder = (uint32_t)(partial_dividend - (__Q[i] * divisor)); + } + } + if (__R) __R[0] = remainder; + } else { + // Now we're ready to invoke the Knuth classical divide algorithm. In this + // case n > 1. + KnuthDiv(__U, __V, __Q, __R, m, n); + } + + // If the caller wants the quotient + if (Quotient) { + // Set up the Quotient value's memory. + if (Quotient->BitWidth != LHS.BitWidth) { + if (Quotient->isSingleWord()) Quotient->set_VAL(0); + } else + Quotient->clear(); + + // The quotient is in Q. Reconstitute the quotient into Quotient's low + // order words. + if (lhsWords == 1) { + uint64_t tmp = + uint64_t(__Q[0]) | ((uint64_t(__Q[1])) << (APINT_BITS_PER_WORD / 2)); + Quotient->set_VAL(tmp); + } else { + assert(!Quotient->isSingleWord() && + "Quotient ap_private not large enough"); + for (unsigned i = 0; i < lhsWords; ++i) + Quotient->set_pVal( + i, uint64_t(__Q[i * 2]) | + ((uint64_t(__Q[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Quotient->clearUnusedBits(); + } + + // If the caller wants the remainder + if (Remainder) { + // Set up the Remainder value's memory. + if (Remainder->BitWidth != RHS.BitWidth) { + if (Remainder->isSingleWord()) Remainder->set_VAL(0); + } else + Remainder->clear(); + + // The remainder is in R. Reconstitute the remainder into Remainder's low + // order words. + if (rhsWords == 1) { + uint64_t tmp = + uint64_t(__R[0]) | ((uint64_t(__R[1])) << (APINT_BITS_PER_WORD / 2)); + Remainder->set_VAL(tmp); + } else { + assert(!Remainder->isSingleWord() && + "Remainder ap_private not large enough"); + for (unsigned i = 0; i < rhsWords; ++i) + Remainder->set_pVal( + i, uint64_t(__R[i * 2]) | + ((uint64_t(__R[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Remainder->clearUnusedBits(); + } + + // Clean up the memory we allocated. + if (__U != &SPACE[0]) { + delete[] __U; + delete[] __V; + delete[] __Q; + delete[] __R; + } +} + +template +void divide(const ap_private<_AP_W, _AP_S>& LHS, uint32_t lhsWords, + uint64_t RHS, ap_private<_AP_W, _AP_S>* Quotient, + ap_private<_AP_W, _AP_S>* Remainder) { + uint32_t rhsWords = 1; + assert(lhsWords >= rhsWords && "Fractional result"); + enum { APINT_BITS_PER_WORD = 64 }; + // First, compose the values into an array of 32-bit words instead of + // 64-bit words. This is a necessity of both the "short division" algorithm + // and the the Knuth "classical algorithm" which requires there to be native + // operations for +, -, and * on an m bit value with an m*2 bit result. We + // can't use 64-bit operands here because we don't have native results of + // 128-bits. Furthremore, casting the 64-bit values to 32-bit values won't + // work on large-endian machines. + uint64_t mask = ~0ull >> (sizeof(uint32_t) * 8); + uint32_t n = 2; + uint32_t m = (lhsWords * 2) - n; + + // Allocate space for the temporary values we need either on the stack, if + // it will fit, or on the heap if it won't. + uint32_t SPACE[128]; + uint32_t* __U = 0; + uint32_t* __V = 0; + uint32_t* __Q = 0; + uint32_t* __R = 0; + if ((Remainder ? 
4 : 3) * n + 2 * m + 1 <= 128) { + __U = &SPACE[0]; + __V = &SPACE[m + n + 1]; + __Q = &SPACE[(m + n + 1) + n]; + if (Remainder) __R = &SPACE[(m + n + 1) + n + (m + n)]; + } else { + __U = new uint32_t[m + n + 1]; + __V = new uint32_t[n]; + __Q = new uint32_t[m + n]; + if (Remainder) __R = new uint32_t[n]; + } + + // Initialize the dividend + memset(__U, 0, (m + n + 1) * sizeof(uint32_t)); + for (unsigned i = 0; i < lhsWords; ++i) { + uint64_t tmp = LHS.get_pVal(i); + __U[i * 2] = tmp & mask; + __U[i * 2 + 1] = (tmp) >> (sizeof(uint32_t) * 8); + } + __U[m + n] = 0; // this extra word is for "spill" in the Knuth algorithm. + + // Initialize the divisor + memset(__V, 0, (n) * sizeof(uint32_t)); + __V[0] = RHS & mask; + __V[1] = (RHS) >> (sizeof(uint32_t) * 8); + + // initialize the quotient and remainder + memset(__Q, 0, (m + n) * sizeof(uint32_t)); + if (Remainder) memset(__R, 0, n * sizeof(uint32_t)); + + // Now, adjust m and n for the Knuth division. n is the number of words in + // the divisor. m is the number of words by which the dividend exceeds the + // divisor (i.e. m+n is the length of the dividend). These sizes must not + // contain any zero words or the Knuth algorithm fails. + for (unsigned i = n; i > 0 && __V[i - 1] == 0; i--) { + n--; + m++; + } + for (unsigned i = m + n; i > 0 && __U[i - 1] == 0; i--) m--; + + // If we're left with only a single word for the divisor, Knuth doesn't work + // so we implement the short division algorithm here. This is much simpler + // and faster because we are certain that we can divide a 64-bit quantity + // by a 32-bit quantity at hardware speed and short division is simply a + // series of such operations. This is just like doing short division but we + // are using base 2^32 instead of base 10. + assert(n != 0 && "Divide by zero?"); + if (n == 1) { + uint32_t divisor = __V[0]; + uint32_t remainder = 0; + for (int i = m + n - 1; i >= 0; i--) { + uint64_t partial_dividend = (uint64_t(remainder)) << 32 | __U[i]; + if (partial_dividend == 0) { + __Q[i] = 0; + remainder = 0; + } else if (partial_dividend < divisor) { + __Q[i] = 0; + remainder = partial_dividend; + } else if (partial_dividend == divisor) { + __Q[i] = 1; + remainder = 0; + } else { + __Q[i] = partial_dividend / divisor; + remainder = partial_dividend - (__Q[i] * divisor); + } + } + if (__R) __R[0] = remainder; + } else { + // Now we're ready to invoke the Knuth classical divide algorithm. In this + // case n > 1. + KnuthDiv(__U, __V, __Q, __R, m, n); + } + + // If the caller wants the quotient + if (Quotient) { + // Set up the Quotient value's memory. + if (Quotient->BitWidth != LHS.BitWidth) { + if (Quotient->isSingleWord()) Quotient->set_VAL(0); + } else + Quotient->clear(); + + // The quotient is in Q. Reconstitute the quotient into Quotient's low + // order words. + if (lhsWords == 1) { + uint64_t tmp = + uint64_t(__Q[0]) | ((uint64_t(__Q[1])) << (APINT_BITS_PER_WORD / 2)); + Quotient->set_VAL(tmp); + } else { + assert(!Quotient->isSingleWord() && + "Quotient ap_private not large enough"); + for (unsigned i = 0; i < lhsWords; ++i) + Quotient->set_pVal( + i, uint64_t(__Q[i * 2]) | + ((uint64_t(__Q[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Quotient->clearUnusedBits(); + } + + // If the caller wants the remainder + if (Remainder) { + // Set up the Remainder value's memory. + if (Remainder->BitWidth != 64 /* RHS.BitWidth */) { + if (Remainder->isSingleWord()) Remainder->set_VAL(0); + } else + Remainder->clear(); + + // The remainder is in __R. 
Reconstitute the remainder into Remainder's low + // order words. + if (rhsWords == 1) { + uint64_t tmp = + uint64_t(__R[0]) | ((uint64_t(__R[1])) << (APINT_BITS_PER_WORD / 2)); + Remainder->set_VAL(tmp); + } else { + assert(!Remainder->isSingleWord() && + "Remainder ap_private not large enough"); + for (unsigned i = 0; i < rhsWords; ++i) + Remainder->set_pVal( + i, uint64_t(__R[i * 2]) | + ((uint64_t(__R[i * 2 + 1])) << (APINT_BITS_PER_WORD / 2))); + } + Remainder->clearUnusedBits(); + } + + // Clean up the memory we allocated. + if (__U != &SPACE[0]) { + delete[] __U; + delete[] __V; + delete[] __Q; + delete[] __R; + } +} + +/// @brief Logical right-shift function. +template +INLINE ap_private<_AP_W, _AP_S, _AP_C> lshr( + const ap_private<_AP_W, _AP_S, _AP_C>& LHS, uint32_t shiftAmt) { + return LHS.lshr(shiftAmt); +} + +/// Left-shift the ap_private by shiftAmt. +/// @brief Left-shift function. +template +INLINE ap_private<_AP_W, _AP_S, _AP_C> shl( + const ap_private<_AP_W, _AP_S, _AP_C>& LHS, uint32_t shiftAmt) { + return LHS.shl(shiftAmt); +} + +} // namespace ap_private_ops + +#endif // LLVM_SUPPORT_MATHEXTRAS_H + +/// This enumeration just provides for internal constants used in this +/// translation unit. +enum { + MIN_INT_BITS = 1, ///< Minimum number of bits that can be specified + ///< Note that this must remain synchronized with IntegerType::MIN_INT_BITS + MAX_INT_BITS = (1 << 23) - 1 ///< Maximum number of bits that can be specified + ///< Note that this must remain synchronized with IntegerType::MAX_INT_BITS +}; + +//===----------------------------------------------------------------------===// +// ap_private Class +//===----------------------------------------------------------------------===// + +/// ap_private - This class represents arbitrary precision constant integral +/// values. +/// It is a functional replacement for common case unsigned integer type like +/// "unsigned", "unsigned long" or "uint64_t", but also allows non-byte-width +/// integer sizes and large integer value types such as 3-bits, 15-bits, or more +/// than 64-bits of precision. ap_private provides a variety of arithmetic +/// operators +/// and methods to manipulate integer values of any bit-width. It supports both +/// the typical integer arithmetic and comparison operations as well as bitwise +/// manipulation. +/// +/// The class has several invariants worth noting: +/// * All bit, byte, and word positions are zero-based. +/// * Once the bit width is set, it doesn't change except by the Truncate, +/// SignExtend, or ZeroExtend operations. +/// * All binary operators must be on ap_private instances of the same bit +/// width. +/// Attempting to use these operators on instances with different bit +/// widths will yield an assertion. +/// * The value is stored canonically as an unsigned value. For operations +/// where it makes a difference, there are both signed and unsigned variants +/// of the operation. For example, sdiv and udiv. However, because the bit +/// widths must be the same, operations such as Mul and Add produce the same +/// results regardless of whether the values are interpreted as signed or +/// not. +/// * In general, the class tries to follow the style of computation that LLVM +/// uses in its IR. This simplifies its use for LLVM. +/// +/// @brief Class for arbitrary precision integers. 
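The invariant above that values are "stored canonically as unsigned" works because, in two's complement, addition and multiplication produce the same bit pattern under either a signed or an unsigned reading; only operations like division need the sdiv/udiv split the comment mentions. A plain-integer illustration (deliberately not using ap_private itself):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t a = 0xFFFFFFFE;  // the bit pattern of -2 under a signed reading
  uint32_t b = 3;

  // Modular + and * are interpretation-independent.
  assert(a + b == 1u);           // (-2) + 3 == 1, same bits as unsigned wrap
  assert(a * b == 0xFFFFFFFAu);  // (-6) and 4294967290 share one bit pattern

  // Division is not, hence the separate signed and unsigned variants.
  assert((int32_t)a / (int32_t)b == 0);  // signed: -2 / 3 == 0
  assert(a / b == 0x55555554u);          // unsigned: 4294967294 / 3
  return 0;
}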
+ +#if defined(_MSC_VER) +#if _MSC_VER < 1400 && !defined(for) +#define for if (0); else for +#endif +typedef unsigned __int64 ap_ulong; +typedef signed __int64 ap_slong; +#else +typedef unsigned long long ap_ulong; +typedef signed long long ap_slong; +#endif +template +struct valtype; + +template +struct valtype<_AP_N8, false> { + typedef uint64_t Type; +}; + +template +struct valtype<_AP_N8, true> { + typedef int64_t Type; +}; + +template <> +struct valtype<1, false> { + typedef unsigned char Type; +}; +template <> +struct valtype<2, false> { + typedef unsigned short Type; +}; +template <> +struct valtype<3, false> { + typedef unsigned int Type; +}; +template <> +struct valtype<4, false> { + typedef unsigned int Type; +}; +template <> +struct valtype<1, true> { + typedef signed char Type; +}; +template <> +struct valtype<2, true> { + typedef short Type; +}; +template <> +struct valtype<3, true> { + typedef int Type; +}; +template <> +struct valtype<4, true> { + typedef int Type; +}; + +template +struct ap_private_enable_if {}; +template <> +struct ap_private_enable_if { + static const bool isValid = true; +}; + +// When bitwidth < 64 +template +class ap_private<_AP_W, _AP_S, true> { + // SFINAE pattern. Only consider this class when _AP_W <= 64 + const static bool valid = ap_private_enable_if<_AP_W <= 64>::isValid; + +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + public: + typedef typename valtype<(_AP_W + 7) / 8, _AP_S>::Type ValType; + typedef ap_private<_AP_W, _AP_S> Type; + template + struct RType { + enum { + mult_w = _AP_W + _AP_W2, + mult_s = _AP_S || _AP_S2, + plus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + plus_s = _AP_S || _AP_S2, + minus_w = + AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1, + minus_s = true, + div_w = _AP_W + _AP_S2, + div_s = _AP_S || _AP_S2, + mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)), + mod_s = _AP_S, + logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)), + logic_s = _AP_S || _AP_S2 + }; + typedef ap_private mult; + typedef ap_private plus; + typedef ap_private minus; + typedef ap_private logic; + typedef ap_private div; + typedef ap_private mod; + typedef ap_private<_AP_W, _AP_S> arg1; + typedef bool reduce; + }; + enum { APINT_BITS_PER_WORD = sizeof(uint64_t) * 8 }; + enum { + excess_bits = (_AP_W % APINT_BITS_PER_WORD) + ? APINT_BITS_PER_WORD - (_AP_W % APINT_BITS_PER_WORD) + : 0 + }; + static const uint64_t mask = ((uint64_t)~0ULL >> (excess_bits)); + static const uint64_t not_mask = ~mask; + static const uint64_t sign_bit_mask = 1ULL << (APINT_BITS_PER_WORD - 1); + template + struct sign_ext_mask { + static const uint64_t mask = ~0ULL << _AP_W1; + }; + static const int width = _AP_W; + + enum { + BitWidth = _AP_W, + _AP_N = 1, + }; + ValType VAL; ///< Used to store the <= 64 bits integer value. 
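The excess_bits/mask pair defined above is what keeps a width-W value canonical inside a single 64-bit word: every operation ends by masking off the bits above position W-1 (clearUnusedBits in this header). The same computation worked through for a sample width of 13:

#include <cassert>
#include <cstdint>

int main() {
  const unsigned W = 13;                       // sample width, 0 < W < 64
  const unsigned excess_bits = 64 - W;         // unused high bits of the word
  const uint64_t mask = ~0ULL >> excess_bits;  // low W bits set

  assert(mask == 0x1FFF);  // 2^13 - 1

  uint64_t val = 0x12345;  // wider than 13 bits
  val &= mask;             // what clearUnusedBits does after each operation
  assert(val == 0x345);    // only the low 13 bits survive
  return 0;
}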
+#ifdef AP_CANARY + ValType CANARY; + void check_canary() { assert(CANARY == (ValType)0xDEADBEEFDEADBEEF); } + void set_canary() { CANARY = (ValType)0xDEADBEEFDEADBEEF; } +#else + void check_canary() {} + void set_canary() {} +#endif + + INLINE ValType& get_VAL(void) { return VAL; } + INLINE ValType get_VAL(void) const { return VAL; } + INLINE ValType get_VAL(void) const volatile { return VAL; } + INLINE void set_VAL(uint64_t value) { VAL = (ValType)value; } + INLINE ValType& get_pVal(int i) { return VAL; } + INLINE ValType get_pVal(int i) const { return VAL; } + INLINE const uint64_t* get_pVal() const { + assert(0 && "invalid usage"); + return 0; + } + INLINE ValType get_pVal(int i) const volatile { return VAL; } + INLINE uint64_t* get_pVal() const volatile { + assert(0 && "invalid usage"); + return 0; + } + INLINE void set_pVal(int i, uint64_t value) { VAL = (ValType)value; } + + INLINE uint32_t getBitWidth() const { return BitWidth; } + + template + ap_private<_AP_W, _AP_S>& operator=(const ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(RHS.get_VAL()); + clearUnusedBits(); + return *this; + } + + template + ap_private<_AP_W, _AP_S>& operator=( + const volatile ap_private<_AP_W1, _AP_S1>& RHS) { + VAL = (ValType)(RHS.get_VAL()); // TODO check here about ap_private + clearUnusedBits(); + return *this; + } + + void operator=(const ap_private& RHS) volatile { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + } + + ap_private& operator=(const ap_private& RHS) { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + return *this; + } + + void operator=(const volatile ap_private& RHS) volatile { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + } + + ap_private& operator=(const volatile ap_private& RHS) { + // Don't do anything for X = X + VAL = RHS.get_VAL(); // No need to check because no harm done by copying. + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + *this = ap_private<_AP_W2, false>(op2); + return *this; + } + +#define ASSIGN_OP_FROM_INT(C_TYPE) \ + INLINE ap_private& operator=(const C_TYPE v) { \ + set_canary(); \ + this->VAL = (ValType)v; \ + clearUnusedBits(); \ + check_canary(); \ + return *this; \ + } + +ASSIGN_OP_FROM_INT(bool) +ASSIGN_OP_FROM_INT(char) +ASSIGN_OP_FROM_INT(signed char) +ASSIGN_OP_FROM_INT(unsigned char) +ASSIGN_OP_FROM_INT(short) +ASSIGN_OP_FROM_INT(unsigned short) +ASSIGN_OP_FROM_INT(int) +ASSIGN_OP_FROM_INT(unsigned int) +ASSIGN_OP_FROM_INT(long) +ASSIGN_OP_FROM_INT(unsigned long) +ASSIGN_OP_FROM_INT(ap_slong) +ASSIGN_OP_FROM_INT(ap_ulong) +#if 0 +ASSIGN_OP_FROM_INT(half) +ASSIGN_OP_FROM_INT(float) +ASSIGN_OP_FROM_INT(double) +#endif +#undef ASSIGN_OP_FROM_INT + + // XXX This is a must to prevent pointer being converted to bool. + INLINE ap_private& operator=(const char* s) { + ap_private tmp(s); // XXX direct-initialization, as ctor is explicit. 
+ operator=(tmp); + return *this; + } + + private: + explicit INLINE ap_private(uint64_t* val) : VAL(val[0]) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + INLINE bool isSingleWord() const { return true; } + + public: + INLINE void fromString(const char* strStart, uint32_t slen, uint8_t radix) { + bool isNeg = strStart[0] == '-'; + if (isNeg) { + strStart++; + slen--; + } + + if (strStart[0] == '0' && (strStart[1] == 'b' || strStart[1] == 'B')) { + //if(radix == 0) radix = 2; + _AP_WARNING(radix != 2, "%s seems to have base %d, but %d given.", strStart, 2, radix); + strStart += 2; + slen -=2; + } else if (strStart[0] == '0' && (strStart[1] == 'o' || strStart[1] == 'O')) { + //if (radix == 0) radix = 8; + _AP_WARNING(radix != 8, "%s seems to have base %d, but %d given.", strStart, 8, radix); + strStart += 2; + slen -=2; + } else if (strStart[0] == '0' && (strStart[1] == 'x' || strStart[1] == 'X')) { + //if (radix == 0) radix = 16; + _AP_WARNING(radix != 16, "%s seems to have base %d, but %d given.", strStart, 16, radix); + strStart += 2; + slen -=2; + } else if (strStart[0] == '0' && (strStart[1] == 'd' || strStart[1] == 'D')) { + //if (radix == 0) radix = 10; + _AP_WARNING(radix != 10, "%s seems to have base %d, but %d given.", strStart, 10, radix); + strStart += 2; + slen -=2; + } else if (radix == 0) { + //radix = 2; // XXX default value + } + + // Check our assumptions here + assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) && + "Radix should be 2, 8, 10, or 16!"); + assert(strStart && "String is null?"); + + // Clear bits. + uint64_t tmpVAL = VAL = 0; + + switch (radix) { + case 2: + // sscanf(strStart,"%b",&VAL); + // tmpVAL = *strStart =='1' ? ~0ULL : 0; + for (; *strStart; ++strStart) { + assert((*strStart == '0' || *strStart == '1') && + ("Wrong binary number")); + tmpVAL <<= 1; + tmpVAL |= (*strStart - '0'); + } + break; + case 8: +#ifdef _MSC_VER + sscanf_s(strStart, "%llo", &tmpVAL, slen + 1); +#else +#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__) + sscanf(strStart, "%lo", &tmpVAL); +#else + sscanf(strStart, "%llo", &tmpVAL); +#endif //__x86_64__ +#endif //_MSC_VER + break; + case 10: +#ifdef _MSC_VER + sscanf_s(strStart, "%llu", &tmpVAL, slen + 1); +#else +#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__) + sscanf(strStart, "%lu", &tmpVAL); +#else + sscanf(strStart, "%llu", &tmpVAL); +#endif //__x86_64__ +#endif //_MSC_VER + break; + case 16: +#ifdef _MSC_VER + sscanf_s(strStart, "%llx", &tmpVAL, slen + 1); +#else +#if defined(__x86_64__) && !defined(__MINGW32__) && !defined(__WIN32__) + sscanf(strStart, "%lx", &tmpVAL); +#else + sscanf(strStart, "%llx", &tmpVAL); +#endif //__x86_64__ +#endif //_MSC_VER + break; + default: + assert(true && "Unknown radix"); + // error + } + VAL = isNeg ? 
(ValType)(-tmpVAL) : (ValType)(tmpVAL); + + clearUnusedBits(); + } + + private: + INLINE ap_private(const std::string& val, uint8_t radix = 2) : VAL(0) { + assert(!val.empty() && "String empty?"); + set_canary(); + fromString(val.c_str(), val.size(), radix); + check_canary(); + } + + INLINE ap_private(const char strStart[], uint32_t slen, uint8_t radix) + : VAL(0) { + set_canary(); + fromString(strStart, slen, radix); + check_canary(); + } + + INLINE ap_private(uint32_t numWords, const uint64_t bigVal[]) + : VAL(bigVal[0]) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + public: + INLINE ap_private() { + set_canary(); + clearUnusedBits(); + check_canary(); + } + +#define CTOR(TYPE) \ + INLINE ap_private(TYPE v) : VAL((ValType)v) { \ + set_canary(); \ + clearUnusedBits(); \ + check_canary(); \ + } + CTOR(bool) + CTOR(char) + CTOR(signed char) + CTOR(unsigned char) + CTOR(short) + CTOR(unsigned short) + CTOR(int) + CTOR(unsigned int) + CTOR(long) + CTOR(unsigned long) + CTOR(ap_slong) + CTOR(ap_ulong) +#if 0 + CTOR(half) + CTOR(float) + CTOR(double) +#endif +#undef CTOR + + template + INLINE ap_private(const ap_private<_AP_W1, _AP_S1, _AP_OPT>& that) + : VAL((ValType)that.get_VAL()) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + template + INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, _AP_OPT>& that) + : VAL((ValType)that.get_VAL()) { + set_canary(); + clearUnusedBits(); + check_canary(); + } + + explicit INLINE ap_private(const char* val) { + set_canary(); + unsigned char radix = 10; + std::string str = ap_private_ops::parseString(val, radix); // will set radix. + std::string::size_type pos = str.find('.'); + // trunc all fraction part + if (pos != std::string::npos) str = str.substr(pos); + + ap_private<_AP_W, _AP_S> ap_private_val(str, radix); + operator=(ap_private_val); + check_canary(); + } + + INLINE ap_private(const char* val, signed char rd) { + set_canary(); + unsigned char radix = rd; + std::string str = ap_private_ops::parseString(val, radix); // will set radix. + std::string::size_type pos = str.find('.'); + // trunc all fraction part + if (pos != std::string::npos) str = str.substr(pos); + + ap_private<_AP_W, _AP_S> ap_private_val(str, radix); + operator=(ap_private_val); + check_canary(); + } + + INLINE ~ap_private() { check_canary(); } + + INLINE bool isNegative() const { + static const uint64_t sign_mask = 1ULL << (_AP_W - 1); + return _AP_S && (sign_mask & VAL); + } + + INLINE bool isPositive() const { return !isNegative(); } + + INLINE bool isStrictlyPositive() const { return !isNegative() && VAL != 0; } + + INLINE bool isAllOnesValue() const { return (mask & VAL) == mask; } + + INLINE bool operator==(const ap_private<_AP_W, _AP_S>& RHS) const { + return VAL == RHS.get_VAL(); + } + INLINE bool operator==(const ap_private<_AP_W, !_AP_S>& RHS) const { + return (uint64_t)VAL == (uint64_t)RHS.get_VAL(); + } + + INLINE bool operator==(uint64_t Val) const { return ((uint64_t)VAL == Val); } + INLINE bool operator!=(uint64_t Val) const { return ((uint64_t)VAL != Val); } + INLINE bool operator!=(const ap_private<_AP_W, _AP_S>& RHS) const { + return VAL != RHS.get_VAL(); + } + INLINE bool operator!=(const ap_private<_AP_W, !_AP_S>& RHS) const { + return (uint64_t)VAL != (uint64_t)RHS.get_VAL(); + } + + /// postfix increment. + const ap_private operator++(int) { + ap_private orig(*this); + VAL++; + clearUnusedBits(); + return orig; + } + + /// prefix increment. 
+ const ap_private operator++() { + ++VAL; + clearUnusedBits(); + return *this; + } + + /// postfix decrement. + const ap_private operator--(int) { + ap_private orig(*this); + --VAL; + clearUnusedBits(); + return orig; + } + + /// prefix decrement. + const ap_private operator--() { + --VAL; + clearUnusedBits(); + return *this; + } + + /// one's complement. + INLINE ap_private<_AP_W + !_AP_S, true> operator~() const { + ap_private<_AP_W + !_AP_S, true> Result(*this); + Result.flip(); + return Result; + } + + /// two's complement. + INLINE typename RType<1, false>::minus operator-() const { + return ap_private<1, false>(0) - (*this); + } + + /// logic negation. + INLINE bool operator!() const { return !VAL; } + + INLINE std::string toString(uint8_t radix, bool wantSigned) const; + INLINE std::string toStringUnsigned(uint8_t radix = 10) const { + return toString(radix, false); + } + INLINE std::string toStringSigned(uint8_t radix = 10) const { + return toString(radix, true); + } + INLINE void clear() { VAL = 0; } + INLINE ap_private& clear(uint32_t bitPosition) { + VAL &= ~(1ULL << (bitPosition)); + clearUnusedBits(); + return *this; + } + + INLINE ap_private ashr(uint32_t shiftAmt) const { + if (_AP_S) + return ap_private((shiftAmt == BitWidth) ? 0 + : ((int64_t)VAL) >> (shiftAmt)); + else + return ap_private((shiftAmt == BitWidth) ? 0 + : ((uint64_t)VAL) >> (shiftAmt)); + } + + INLINE ap_private lshr(uint32_t shiftAmt) const { + return ap_private((shiftAmt == BitWidth) + ? ap_private(0) + : ap_private((VAL & mask) >> (shiftAmt))); + } + + INLINE ap_private shl(uint32_t shiftAmt) const +// just for clang compiler +#if defined(__clang__) && !defined(__CLANG_3_1__) + __attribute__((no_sanitize("undefined"))) +#endif + { + if (shiftAmt > BitWidth) { + if (!isNegative()) + return ap_private(0); + else + return ap_private(-1); + } + if (shiftAmt == BitWidth) + return ap_private(0); + else + return ap_private((VAL) << (shiftAmt)); + // return ap_private((shiftAmt == BitWidth) ? 
ap_private(0ULL) :
+    //                    ap_private(VAL << shiftAmt));
+  }
+
+  INLINE int64_t getSExtValue() const { return VAL; }
+
+  // XXX XXX this function is used in CBE
+  INLINE uint64_t getZExtValue() const { return VAL & mask; }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private(const _private_range_ref<_AP_W2, _AP_S2>& ref) {
+    set_canary();
+    *this = ref.get();
+    check_canary();
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private(const _private_bit_ref<_AP_W2, _AP_S2>& ref) {
+    set_canary();
+    *this = ((uint64_t)(bool)ref);
+    check_canary();
+  }
+
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) {
+//   set_canary();
+//   *this = ref.get();
+//   check_canary();
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE ap_private(
+//     const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) {
+//   set_canary();
+//   *this = ((val.operator ap_private<_AP_W2, false>()));
+//   check_canary();
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE ap_private(
+//     const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) {
+//   set_canary();
+//   *this = (uint64_t)(bool)val;
+//   check_canary();
+// }
+
+  INLINE void write(const ap_private<_AP_W, _AP_S>& op2) volatile {
+    *this = (op2);
+  }
+
+  // Explicit conversions to C integer types
+  //-----------------------------------------------------------
+  INLINE operator ValType() const { return get_VAL(); }
+
+  INLINE int to_uchar() const { return (unsigned char)get_VAL(); }
+
+  INLINE int to_char() const { return (signed char)get_VAL(); }
+
+  INLINE int to_ushort() const { return (unsigned short)get_VAL(); }
+
+  INLINE int to_short() const { return (short)get_VAL(); }
+
+  INLINE int to_int() const {
+    // ap_private<64 /* _AP_W */, _AP_S> res(V);
+    return (int)get_VAL();
+  }
+
+  INLINE unsigned to_uint() const { return (unsigned)get_VAL(); }
+
+  INLINE long to_long() const { return (long)get_VAL(); }
+
+  INLINE unsigned long to_ulong() const { return (unsigned long)get_VAL(); }
+
+  INLINE ap_slong to_int64() const { return (ap_slong)get_VAL(); }
+
+  INLINE ap_ulong to_uint64() const { return (ap_ulong)get_VAL(); }
+
+  INLINE double to_double() const {
+    if (isNegative())
+      return roundToDouble(true);
+    else
+      return roundToDouble(false);
+  }
+
+  INLINE unsigned length() const { return _AP_W; }
+
+  INLINE bool isMinValue() const { return VAL == 0; }
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator&=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    VAL = (ValType)(((uint64_t)VAL) & RHS.get_VAL());
+    clearUnusedBits();
+    return *this;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator|=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    VAL = (ValType)(((uint64_t)VAL) | RHS.get_VAL());
+    clearUnusedBits();
+    return *this;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator^=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    VAL = (ValType)(((uint64_t)VAL) ^ RHS.get_VAL());
+    clearUnusedBits();
+    return *this;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator*=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    VAL = (ValType)(((uint64_t)VAL) * RHS.get_VAL());
+    clearUnusedBits();
+    return *this;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator+=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    VAL = (ValType)(((uint64_t)VAL) + RHS.get_VAL());
+    clearUnusedBits();
+    return *this;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator-=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    VAL = (ValType)(((uint64_t)VAL) - RHS.get_VAL());
+    clearUnusedBits();
+    return *this;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE typename RType<_AP_W1, _AP_S1>::logic operator&(
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    if (RType<_AP_W1, _AP_S1>::logic_w <= 64) {
+
typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) & + RHS.get_VAL()); + return Ret; + } else { + typename RType<_AP_W1, _AP_S1>::logic Ret = *this; + return Ret & RHS; + } + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::logic operator^( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { + typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) ^ + RHS.get_VAL()); + return Ret; + } else { + typename RType<_AP_W1, _AP_S1>::logic Ret = *this; + return Ret ^ RHS; + } + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::logic operator|( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::logic_w <= 64) { + typename RType<_AP_W1, _AP_S1>::logic Ret(((uint64_t)VAL) | + RHS.get_VAL()); + return Ret; + } else { + typename RType<_AP_W1, _AP_S1>::logic Ret = *this; + return Ret | RHS; + } + } + + INLINE ap_private And(const ap_private& RHS) const { + return ap_private(VAL & RHS.get_VAL()); + } + + INLINE ap_private Or(const ap_private& RHS) const { + return ap_private(VAL | RHS.get_VAL()); + } + + INLINE ap_private Xor(const ap_private& RHS) const { + return ap_private(VAL ^ RHS.get_VAL()); + } +#if 1 + template + INLINE typename RType<_AP_W1, _AP_S1>::mult operator*( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::mult_w <= 64) { + typename RType<_AP_W1, _AP_S1>::mult Result(((uint64_t)VAL) * + RHS.get_VAL()); + return Result; + } else { + typename RType<_AP_W1, _AP_S1>::mult Result(*this); + Result *= RHS; + return Result; + } + } +#endif + INLINE ap_private Mul(const ap_private& RHS) const { + return ap_private(VAL * RHS.get_VAL()); + } + + INLINE ap_private Add(const ap_private& RHS) const { + return ap_private(VAL + RHS.get_VAL()); + } + + INLINE ap_private Sub(const ap_private& RHS) const { + return ap_private(VAL - RHS.get_VAL()); + } + + INLINE ap_private& operator&=(uint64_t RHS) { + VAL &= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator|=(uint64_t RHS) { + VAL |= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator^=(uint64_t RHS) { + VAL ^= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator*=(uint64_t RHS) { + VAL *= (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator+=(uint64_t RHS) { + VAL += (ValType)RHS; + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator-=(uint64_t RHS) { + VAL -= (ValType)RHS; + clearUnusedBits(); + return *this; + } + + INLINE bool isMinSignedValue() const { + static const uint64_t min_mask = ~(~0ULL << (_AP_W - 1)); + return BitWidth == 1 ? VAL == 1 + : (ap_private_ops::isNegative<_AP_W>(*this) && + ((min_mask & VAL) == 0)); + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::plus operator+( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::plus_w <= 64) + return typename RType<_AP_W1, _AP_S1>::plus( + RType<_AP_W1, _AP_S1>::plus_s + ? 
int64_t(((uint64_t)VAL) + RHS.get_VAL()) + : uint64_t(((uint64_t)VAL) + RHS.get_VAL())); + typename RType<_AP_W1, _AP_S1>::plus Result = RHS; + Result += VAL; + return Result; + } + + template + INLINE typename RType<_AP_W1, _AP_S1>::minus operator-( + const ap_private<_AP_W1, _AP_S1>& RHS) const { + if (RType<_AP_W1, _AP_S1>::minus_w <= 64) + return typename RType<_AP_W1, _AP_S1>::minus( + int64_t(((uint64_t)VAL) - RHS.get_VAL())); + typename RType<_AP_W1, _AP_S1>::minus Result = *this; + Result -= RHS; + return Result; + } + + INLINE uint32_t countPopulation() const { + return ap_private_ops::CountPopulation_64(VAL); + } + INLINE uint32_t countLeadingZeros() const { + int remainder = BitWidth % 64; + int excessBits = (64 - remainder) % 64; + uint32_t Count = ap_private_ops::CountLeadingZeros_64(VAL); + if (Count) Count -= excessBits; + return AESL_std::min(Count, (uint32_t)_AP_W); + } + + /// HiBits - This function returns the high "numBits" bits of this ap_private. + INLINE ap_private<_AP_W, _AP_S> getHiBits(uint32_t numBits) const { + ap_private<_AP_W, _AP_S> ret(*this); + ret = (ret) >> (BitWidth - numBits); + return ret; + } + + /// LoBits - This function returns the low "numBits" bits of this ap_private. + INLINE ap_private<_AP_W, _AP_S> getLoBits(uint32_t numBits) const { + ap_private<_AP_W, _AP_S> ret(((uint64_t)VAL) << (BitWidth - numBits)); + ret = (ret) >> (BitWidth - numBits); + return ret; + // return ap_private(numBits, (VAL << (BitWidth - numBits))>> (BitWidth - + // numBits)); + } + + INLINE ap_private<_AP_W, _AP_S>& set(uint32_t bitPosition) { + VAL |= (1ULL << (bitPosition)); + clearUnusedBits(); + return *this; // clearUnusedBits(); + } + + INLINE void set() { + VAL = (ValType)~0ULL; + clearUnusedBits(); + } + + template + INLINE void set(const ap_private<_AP_W3, false>& val) { + operator=(ap_private<_AP_W3, _AP_S>(val)); + } + + INLINE void set(const ap_private& val) { operator=(val); } + + INLINE void clearUnusedBits(void) volatile +// just for clang compiler +#if defined(__clang__) && !defined(__CLANG_3_1__) + __attribute__((no_sanitize("undefined"))) +#endif + { + enum { excess_bits = (_AP_W % 64) ? 64 - _AP_W % 64 : 0 }; + VAL = (ValType)( + _AP_S + ? ((((int64_t)VAL) << (excess_bits)) >> (excess_bits)) + : (excess_bits ? (((uint64_t)VAL) << (excess_bits)) >> (excess_bits) + : (uint64_t)VAL)); + } + + INLINE void clearUnusedBitsToZero(void) { + enum { excess_bits = (_AP_W % 64) ? 64 - _AP_W % 64 : 0 }; + static uint64_t mask = ~0ULL >> (excess_bits); + VAL &= mask; + } + + INLINE ap_private udiv(const ap_private& RHS) const { + return ap_private((uint64_t)VAL / RHS.get_VAL()); + } + + /// Signed divide this ap_private by ap_private RHS. + /// @brief Signed division function for ap_private. + INLINE ap_private sdiv(const ap_private& RHS) const { + if (isNegative()) + if (RHS.isNegative()) + return ((uint64_t)(0 - (*this))) / (uint64_t)(0 - RHS); + else + return 0 - ((uint64_t)(0 - (*this)) / (uint64_t)(RHS)); + else if (RHS.isNegative()) + return 0 - (this->udiv((ap_private)(0 - RHS))); + return this->udiv(RHS); + } + + template + INLINE ap_private urem(const ap_private<_AP_W, _AP_S2>& RHS) const { + assert(RHS.get_VAL() != 0 && "Divide by 0"); + return ap_private(((uint64_t)VAL) % ((uint64_t)RHS.get_VAL())); + } + + /// Signed remainder operation on ap_private. + /// @brief Function for signed remainder operation. 
+  template <bool _AP_S2>
+  INLINE ap_private srem(const ap_private<_AP_W, _AP_S2>& RHS) const {
+    if (isNegative()) {
+      ap_private lhs = 0 - (*this);
+      if (RHS.isNegative()) {
+        ap_private rhs = 0 - RHS;
+        return 0 - (lhs.urem(rhs));
+      } else
+        return 0 - (lhs.urem(RHS));
+    } else if (RHS.isNegative()) {
+      ap_private rhs = 0 - RHS;
+      return this->urem(rhs);
+    }
+    return this->urem(RHS);
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool eq(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return (*this) == RHS;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool ne(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !((*this) == RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// the validity of the less-than relationship.
+  /// @returns true if *this < RHS when both are considered unsigned.
+  /// @brief Unsigned less than comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool ult(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    if (_AP_W1 <= 64) {
+      uint64_t lhsZext = ((uint64_t(VAL)) << (64 - _AP_W)) >> (64 - _AP_W);
+      uint64_t rhsZext =
+          ((uint64_t(RHS.get_VAL())) << (64 - _AP_W1)) >> (64 - _AP_W1);
+      return lhsZext < rhsZext;
+    } else
+      return RHS.uge(*this);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the less-than relationship.
+  /// @returns true if *this < RHS when both are considered signed.
+  /// @brief Signed less than comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool slt(const ap_private<_AP_W1, _AP_S1>& RHS) const
+// just for clang compiler
+#if defined(__clang__) && !defined(__CLANG_3_1__)
+      __attribute__((no_sanitize("undefined")))
+#endif
+  {
+    if (_AP_W1 <= 64) {
+      int64_t lhsSext = ((int64_t(VAL)) << (64 - _AP_W)) >> (64 - _AP_W);
+      int64_t rhsSext =
+          ((int64_t(RHS.get_VAL())) << (64 - _AP_W1)) >> (64 - _AP_W1);
+      return lhsSext < rhsSext;
+    } else
+      return RHS.sge(*this);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// validity of the less-or-equal relationship.
+  /// @returns true if *this <= RHS when both are considered unsigned.
+  /// @brief Unsigned less or equal comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool ule(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return ult(RHS) || eq(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the less-or-equal relationship.
+  /// @returns true if *this <= RHS when both are considered signed.
+  /// @brief Signed less or equal comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool sle(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return slt(RHS) || eq(RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// the validity of the greater-than relationship.
+  /// @returns true if *this > RHS when both are considered unsigned.
+  /// @brief Unsigned greater than comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool ugt(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !ult(RHS) && !eq(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// the validity of the greater-than relationship.
+  /// @returns true if *this > RHS when both are considered signed.
+  /// @brief Signed greater than comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool sgt(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !slt(RHS) && !eq(RHS);
+  }
+
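+  // Sketch of the comparison semantics above (added for illustration, not
+  // part of the original patch):
+  //   ap_private<4, true> a(-1), b(1); // a holds the bit pattern 1111
+  //   a.slt(b) -> true   (signed view:   -1 < 1)
+  //   a.ult(b) -> false  (unsigned view: 15 < 1)
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// validity of the greater-or-equal relationship.
+  /// @returns true if *this >= RHS when both are considered unsigned.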
+  /// @brief Unsigned greater or equal comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool uge(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !ult(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the greater-or-equal relationship.
+  /// @returns true if *this >= RHS when both are considered signed.
+  /// @brief Signed greater or equal comparison
+  template <int _AP_W1, bool _AP_S1>
+  INLINE bool sge(const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    return !slt(RHS);
+  }
+
+  INLINE ap_private abs() const {
+    if (isNegative()) return -(*this);
+    return *this;
+  }
+
+  INLINE ap_private<_AP_W, false> get() const {
+    ap_private<_AP_W, false> ret(*this);
+    return ret;
+  }
+
+  INLINE static uint32_t getBitsNeeded(const char* str, uint32_t slen,
+                                       uint8_t radix) {
+    return _AP_W;
+  }
+
+  INLINE uint32_t getActiveBits() const {
+    uint32_t bits = _AP_W - countLeadingZeros();
+    return bits ? bits : 1;
+  }
+
+  INLINE double roundToDouble(bool isSigned = false) const {
+    return isSigned ? double((int64_t)VAL) : double((uint64_t)VAL);
+  }
+
+  /* Reverse the contents of ap_private instance. I.e. LSB becomes MSB and vice
+   * versa. */
+  INLINE ap_private& reverse() {
+    for (int i = 0; i < _AP_W / 2; ++i) {
+      bool tmp = operator[](i);
+      if (operator[](_AP_W - 1 - i))
+        set(i);
+      else
+        clear(i);
+      if (tmp)
+        set(_AP_W - 1 - i);
+      else
+        clear(_AP_W - 1 - i);
+    }
+    clearUnusedBits();
+    return *this;
+  }
+
+  /* Return true if the value of ap_private instance is zero. */
+  INLINE bool iszero() const { return isMinValue(); }
+
+  INLINE bool to_bool() const { return !iszero(); }
+
+  /* x < 0 */
+  INLINE bool sign() const {
+    if (isNegative()) return true;
+    return false;
+  }
+
+  /* x[i] = !x[i] */
+  INLINE void invert(int i) {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    flip(i);
+  }
+
+  /* x[i] */
+  INLINE bool test(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return operator[](i);
+  }
+
+  // This is used for sc_lv and sc_bv, which is implemented by sc_uint
+  // Rotate an ap_private object n places to the left
+  INLINE void lrotate(int n) {
+    assert(n >= 0 && "Attempting to shift negative index");
+    assert(n < _AP_W && "Shift value larger than bit width");
+    operator=(shl(n) | lshr(_AP_W - n));
+  }
+
+  // This is used for sc_lv and sc_bv, which is implemented by sc_uint
+  // Rotate an ap_private object n places to the right
+  INLINE void rrotate(int n) {
+    assert(n >= 0 && "Attempting to shift negative index");
+    assert(n < _AP_W && "Shift value larger than bit width");
+    operator=(lshr(n) | shl(_AP_W - n));
+  }
+
+  // Set the ith bit into v
+  INLINE void set(int i, bool v) {
+    assert(i >= 0 && "Attempting to write bit with negative index");
+    assert(i < _AP_W && "Attempting to write bit beyond MSB");
+    v ? set(i) : clear(i);
+  }
+
+  // Set the ith bit into v
+  INLINE void set_bit(int i, bool v) {
+    assert(i >= 0 && "Attempting to write bit with negative index");
+    assert(i < _AP_W && "Attempting to write bit beyond MSB");
+    v ? set(i) : clear(i);
+  }
+
+  // Get the value of ith bit
+  INLINE bool get_bit(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return (((1ULL << i) & VAL) != 0);
+  }
+
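+  // Illustrative values for the helpers above (a sketch, not part of the
+  // original patch):
+  //   ap_private<8, false> x(0x16); // bits 00010110
+  //   x.countLeadingZeros() -> 3    (counted within the 8-bit width)
+  //   x.getActiveBits()     -> 5    (highest set bit is bit 4)
+
+  /// Toggle all bits.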
+  INLINE ap_private& flip() {
+    VAL = (ValType)((~0ULL ^ VAL) & mask);
+    clearUnusedBits();
+    return *this;
+  }
+
+  /// Toggles a given bit to its opposite value.
+  INLINE ap_private& flip(uint32_t bitPosition) {
+    assert(bitPosition < BitWidth && "Out of the bit-width range!");
+    set_bit(bitPosition, !get_bit(bitPosition));
+    return *this;
+  }
+
+  // complements every bit
+  INLINE void b_not() { flip(); }
+
+// Binary Arithmetic
+//-----------------------------------------------------------
+#define OP_BIN_AP(Sym, Rty, Fun) \
+  template <int _AP_W2, bool _AP_S2> \
+  INLINE typename RType<_AP_W2, _AP_S2>::Rty operator Sym( \
+      const ap_private<_AP_W2, _AP_S2>& op) const { \
+    typename RType<_AP_W2, _AP_S2>::Rty lhs(*this); \
+    typename RType<_AP_W2, _AP_S2>::Rty rhs(op); \
+    return lhs.Fun(rhs); \
+  }
+
+/// Bitwise and, or, xor
+// OP_BIN_AP(&,logic, And)
+// OP_BIN_AP(|,logic, Or)
+// OP_BIN_AP(^,logic, Xor)
+#undef OP_BIN_AP
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE typename RType<_AP_W2, _AP_S2>::div operator/(
+      const ap_private<_AP_W2, _AP_S2>& op) const {
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        lhs = *this;
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        rhs = op;
+    return typename RType<_AP_W2, _AP_S2>::div(
+        (_AP_S || _AP_S2) ? lhs.sdiv(rhs) : lhs.udiv(rhs));
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE typename RType<_AP_W2, _AP_S2>::mod operator%(
+      const ap_private<_AP_W2, _AP_S2>& op) const {
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        lhs = *this;
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        rhs = op;
+    typename RType<_AP_W2, _AP_S2>::mod res =
+        typename RType<_AP_W2, _AP_S2>::mod(_AP_S ? lhs.srem(rhs)
+                                                  : lhs.urem(rhs));
+    return res;
+  }
+
+#define OP_ASSIGN_AP_2(Sym) \
+  template <int _AP_W2, bool _AP_S2> \
+  INLINE ap_private<_AP_W, _AP_S>& operator Sym##=( \
+      const ap_private<_AP_W2, _AP_S2>& op) { \
+    *this = operator Sym(op); \
+    return *this; \
+  }
+
+  OP_ASSIGN_AP_2(/)
+  OP_ASSIGN_AP_2(%)
+#undef OP_ASSIGN_AP_2
+
+/// Bitwise assign: and, or, xor
+//-------------------------------------------------------------
+// OP_ASSIGN_AP(&)
+// OP_ASSIGN_AP(^)
+// OP_ASSIGN_AP(|)
+
+#define OP_LEFT_SHIFT_CTYPE(TYPE, SIGNED) \
+  INLINE ap_private operator<<(const TYPE op) const { \
+    if (op >= _AP_W) return ap_private(0); \
+    if (SIGNED && op < 0) return *this >> (0 - op); \
+    return shl(op); \
+  }
+
+  // OP_LEFT_SHIFT_CTYPE(bool, false)
+  OP_LEFT_SHIFT_CTYPE(char, CHAR_IS_SIGNED)
+  OP_LEFT_SHIFT_CTYPE(signed char, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned char, false)
+  OP_LEFT_SHIFT_CTYPE(short, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned short, false)
+  OP_LEFT_SHIFT_CTYPE(int, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned int, false)
+  OP_LEFT_SHIFT_CTYPE(long, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned long, false)
+  OP_LEFT_SHIFT_CTYPE(long long, true)
+  OP_LEFT_SHIFT_CTYPE(unsigned long long, false)
+#if 0
+  OP_LEFT_SHIFT_CTYPE(half, false)
+  OP_LEFT_SHIFT_CTYPE(float, false)
+  OP_LEFT_SHIFT_CTYPE(double, false)
+#endif
+
+#undef OP_LEFT_SHIFT_CTYPE
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private operator<<(const ap_private<_AP_W2, _AP_S2>& op2) const {
+    if (_AP_S2 == false) {
+      uint32_t sh = op2.to_uint();
+      return *this << sh;
+    } else {
+      int sh = op2.to_int();
+      return *this << sh;
+    }
+  }
+
+#define OP_RIGHT_SHIFT_CTYPE(TYPE, SIGNED) \
+  INLINE ap_private operator>>(const TYPE op) const { \
+    if (op >= _AP_W) { \
+      if (isNegative()) \
+        return ap_private(-1); \
+      else \
+        return ap_private(0); \
+    } \
+    if ((SIGNED) && op < 0) return *this << (0 - op);
\ + if (_AP_S) \ + return ashr(op); \ + else \ + return lshr(op); \ + } + + // OP_RIGHT_SHIFT_CTYPE(bool, false) + OP_RIGHT_SHIFT_CTYPE(char, CHAR_IS_SIGNED) + OP_RIGHT_SHIFT_CTYPE(signed char, true) + OP_RIGHT_SHIFT_CTYPE(unsigned char, false) + OP_RIGHT_SHIFT_CTYPE(short, true) + OP_RIGHT_SHIFT_CTYPE(unsigned short, false) + OP_RIGHT_SHIFT_CTYPE(int, true) + OP_RIGHT_SHIFT_CTYPE(unsigned int, false) + OP_RIGHT_SHIFT_CTYPE(long, true) + OP_RIGHT_SHIFT_CTYPE(unsigned long, false) + OP_RIGHT_SHIFT_CTYPE(unsigned long long, false) + OP_RIGHT_SHIFT_CTYPE(long long, true) +#if 0 + OP_RIGHT_SHIFT_CTYPE(half, false) + OP_RIGHT_SHIFT_CTYPE(float, false) + OP_RIGHT_SHIFT_CTYPE(double, false) +#endif + +#undef OP_RIGHT_SHIFT_CTYPE + + template + INLINE ap_private operator>>(const ap_private<_AP_W2, _AP_S2>& op2) const { + if (_AP_S2 == false) { + uint32_t sh = op2.to_uint(); + return *this >> sh; + } else { + int sh = op2.to_int(); + return *this >> sh; + } + } + + /// Shift assign + //----------------------------------------------------------------- + + //INLINE const ap_private& operator<<=(uint32_t shiftAmt) { + // VAL <<= shiftAmt; + // clearUnusedBits(); + // return *this; + //} + +#define OP_ASSIGN_AP(Sym) \ + template \ + INLINE ap_private& operator Sym##=(int op) { \ + *this = operator Sym(op); \ + clearUnusedBits(); \ + return *this; \ + } \ + INLINE ap_private& operator Sym##=(unsigned int op) { \ + *this = operator Sym(op); \ + clearUnusedBits(); \ + return *this; \ + } \ + template \ + INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \ + *this = operator Sym(op); \ + clearUnusedBits(); \ + return *this; \ + } + + OP_ASSIGN_AP(>>) + OP_ASSIGN_AP(<<) +#undef OP_ASSIGN_AP + + /// Comparisons + //----------------------------------------------------------------- + template + INLINE bool operator==(const ap_private<_AP_W1, _AP_S1>& op) const { + enum { _AP_MAX_W = AP_MAX(AP_MAX(_AP_W, _AP_W1), 32) }; + ap_private<_AP_MAX_W, false> lhs(*this); + ap_private<_AP_MAX_W, false> rhs(op); + if (_AP_MAX_W <= 64) { + return (uint64_t)lhs.get_VAL() == (uint64_t)rhs.get_VAL(); + } else + return lhs == rhs; + } + + template + INLINE bool operator!=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this == op); + } + + template + INLINE bool operator>(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) + }; + ap_private<_AP_MAX_W, _AP_S> lhs(*this); + ap_private<_AP_MAX_W, _AP_S2> rhs(op); + // this will follow gcc rule for comparison + // between different bitwidth and signness + if (_AP_S == _AP_S2) + return _AP_S ? lhs.sgt(rhs) : lhs.ugt(rhs); + else if (_AP_W < 32 && _AP_W2 < 32) + // different signness but both bitwidth is less than 32 + return lhs.sgt(rhs); + else + // different signness but bigger bitwidth + // is greater or equal to 32 + if (_AP_S) + if (_AP_W2 >= _AP_W) + return lhs.ugt(rhs); + else + return lhs.sgt(rhs); + else if (_AP_W >= _AP_W2) + return lhs.ugt(rhs); + else + return lhs.sgt(rhs); + } + + template + INLINE bool operator<=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this > op); + } + + template + INLINE bool operator<(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) + }; + ap_private<_AP_MAX_W, _AP_S> lhs(*this); + ap_private<_AP_MAX_W, _AP_S2> rhs(op); + if (_AP_S == _AP_S2) + return _AP_S ? 
lhs.slt(rhs) : lhs.ult(rhs); + else if (_AP_W < 32 && _AP_W2 < 32) + return lhs.slt(rhs); + else if (_AP_S) + if (_AP_W2 >= _AP_W) + return lhs.ult(rhs); + else + return lhs.slt(rhs); + else if (_AP_W >= _AP_W2) + return lhs.ult(rhs); + else + return lhs.slt(rhs); + } + + template + INLINE bool operator>=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this < op); + } + + /// Bit and Part Select + //-------------------------------------------------------------- + // FIXME now _private_range_ref refs to _AP_ROOT_TYPE(struct ssdm_int). + INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { + return _private_range_ref<_AP_W, _AP_S>( + const_cast*>(this), Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { + return _private_range_ref<_AP_W, _AP_S>( + (const_cast*>(this)), Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + INLINE _private_bit_ref<_AP_W, _AP_S> operator[](int index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index); + } + + template + INLINE _private_bit_ref<_AP_W, _AP_S> operator[]( + const ap_private<_AP_W2, _AP_S2>& index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); + } + + INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](int index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index); + } + + template + INLINE const _private_bit_ref<_AP_W, _AP_S> operator[]( + const ap_private<_AP_W2, _AP_S2>& index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index.to_int()); + } + + INLINE _private_bit_ref<_AP_W, _AP_S> bit(int index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index); + } + + template + INLINE _private_bit_ref<_AP_W, _AP_S> bit(const ap_private<_AP_W2, _AP_S2>& index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); + } + + INLINE const _private_bit_ref<_AP_W, _AP_S> bit(int index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index); + } + + template + INLINE const _private_bit_ref<_AP_W, _AP_S> bit( + const ap_private<_AP_W2, _AP_S2>& index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index.to_int()); + } + +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// concat(const ap_private<_AP_W2, _AP_S2>& a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// concat(ap_private<_AP_W2, _AP_S2>& a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(const ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(const ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> 
>( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(_private_range_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(_private_bit_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, +// a2); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) const { +// return ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast&>(*this), +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, +// a2); +// } +// +// template +// 
INLINE +// ap_concat_ref<_AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) const { +// return ap_concat_ref< +// _AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast&>(*this), +// const_cast&>( +// a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,( +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); +// } +// +// template +// INLINE ap_private operator&( +// const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { +// return *this & a2.get(); +// } +// +// template +// INLINE ap_private operator|( +// const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { +// return *this | a2.get(); +// } +// +// template +// INLINE ap_private operator^( +// const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { +// return *this ^ a2.get(); +// } + + // Reduce operation + //----------------------------------------------------------- + INLINE bool and_reduce() const { return (VAL & mask) == mask; } + + INLINE bool nand_reduce() const { return (VAL & mask) != mask; } + + INLINE bool or_reduce() const { return (bool)VAL; } + + INLINE bool nor_reduce() const { return VAL == 0; } + + INLINE bool xor_reduce() const { + unsigned int i = countPopulation(); + return (i % 2) ? true : false; + } + + INLINE bool xnor_reduce() const { + unsigned int i = countPopulation(); + return (i % 2) ? false : true; + } + + INLINE std::string to_string(uint8_t radix = 2, bool sign = false) const { + return toString(radix, radix == 10 ? _AP_S : sign); + } +}; // End of class ap_private <_AP_W, _AP_S, true> + +template +std::string ap_private<_AP_W, _AP_S, true>::toString(uint8_t radix, + bool wantSigned) const { + assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) && + "Radix should be 2, 8, 10, or 16!"); + static const char* digits[] = {"0", "1", "2", "3", "4", "5", "6", "7", + "8", "9", "a", "b", "c", "d", "e", "f"}; + std::string result; + if (radix != 10) { + // For the 2, 8 and 16 bit cases, we can just shift instead of divide + // because the number of bits per digit (1,3 and 4 respectively) divides + // equaly. We just shift until there value is zero. + + // First, check for a zero value and just short circuit the logic below. + if (*this == (uint64_t)(0)) { + // Always generate a radix indicator because fixed-point + // formats require it. + switch (radix) { + case 2: + result = "0b0"; + break; + case 8: + result = "0o0"; + break; + case 16: + result = "0x0"; + break; + default: + assert("invalid radix" && 0); + } + } else { + ap_private<_AP_W, false, true> tmp(*this); + size_t insert_at = 0; + bool leading_zero = true; + if (wantSigned && isNegative()) { + // They want to print the signed version and it is a negative value + // Flip the bits and add one to turn it into the equivalent positive + // value and put a '-' in the result. 
+ tmp.flip(); + tmp++; + result = "-"; + insert_at = 1; + leading_zero = false; + } + switch (radix) { + case 2: + result += "0b"; + break; + case 8: + result += "0o"; + break; + case 16: + result += "0x"; + break; + default: + assert("invalid radix" && 0); + } + insert_at += 2; + + // Just shift tmp right for each digit width until it becomes zero + uint32_t shift = (radix == 16 ? 4 : (radix == 8 ? 3 : 1)); + uint64_t mask = radix - 1; + ap_private<_AP_W, false, true> zero(0); + unsigned bits = 0; + bool msb = false; + while (tmp.ne(zero)) { + unsigned digit = (unsigned)(tmp.get_VAL() & mask); + result.insert(insert_at, digits[digit]); + tmp = tmp.lshr(shift); + bits++; + msb = (digit >> (shift - 1)) == 1; + } + bits *= shift; + if (bits < _AP_W && leading_zero && msb) + result.insert(insert_at, digits[0]); + } + return result; + } + + ap_private<_AP_W, false, true> tmp(*this); + ap_private<6, false, true> divisor(radix); + ap_private<_AP_W, _AP_S, true> zero(0); + size_t insert_at = 0; + if (wantSigned && isNegative()) { + // They want to print the signed version and it is a negative value + // Flip the bits and add one to turn it into the equivalent positive + // value and put a '-' in the result. + tmp.flip(); + tmp++; + result = "-"; + insert_at = 1; + } + if (tmp == ap_private<_AP_W, false, true>(0ULL)) + result = "0"; + else + while (tmp.ne(zero)) { + ap_private<_AP_W, false, true> APdigit = tmp % divisor; + ap_private<_AP_W, false, true> tmp2 = tmp / divisor; + uint32_t digit = (uint32_t)(APdigit.getZExtValue()); + assert(digit < radix && "divide failed"); + result.insert(insert_at, digits[digit]); + tmp = tmp2; + } + return result; + +} // End of ap_private<_AP_W, _AP_S, true>::toString() + +// bitwidth > 64 +template +class ap_private<_AP_W, _AP_S, false> { + // SFINAE pattern. Only consider this class when _AP_W > 64 + const static bool valid = ap_private_enable_if<(_AP_W > 64)>::isValid; + +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + public: + enum { BitWidth = _AP_W, _AP_N = (_AP_W + 63) / 64 }; + static const int width = _AP_W; + + private: + /// This constructor is used only internally for speed of construction of + /// temporaries. It is unsafe for general use so it is not public. + + /* Constructors */ + /// Note that numWords can be smaller or larger than the corresponding bit + /// width but any extraneous bits will be dropped. + /// @param numWords the number of words in bigVal + /// @param bigVal a sequence of words to form the initial value of the + /// ap_private + /// @brief Construct an ap_private, initialized as bigVal[]. + INLINE ap_private(uint32_t numWords, const uint64_t bigVal[]) { + set_canary(); + assert(bigVal && "Null pointer detected!"); + { + // Get memory, cleared to 0 + memset(pVal, 0, _AP_N * sizeof(uint64_t)); + + // Calculate the number of words to copy + uint32_t words = AESL_std::min(numWords, _AP_N); + // Copy the words from bigVal to pVal + memcpy(pVal, bigVal, words * APINT_WORD_SIZE); + if (words >= _AP_W) clearUnusedBits(); + // Make sure unused high bits are cleared + } + check_canary(); + } + + /// This constructor interprets Val as a string in the given radix. The + /// interpretation stops when the first charater that is not suitable for the + /// radix is encountered. Acceptable radix values are 2, 8, 10 and 16. It is + /// an error for the value implied by the string to require more bits than + /// numBits. 
+ /// @param val the string to be interpreted + /// @param radix the radix of Val to use for the intepretation + /// @brief Construct an ap_private from a string representation. + INLINE ap_private(const std::string& val, uint8_t radix = 2) { + set_canary(); + assert(!val.empty() && "The input string is empty."); + const char* c_str = val.c_str(); + fromString(c_str, val.size(), radix); + check_canary(); + } + + /// This constructor interprets the slen characters starting at StrStart as + /// a string in the given radix. The interpretation stops when the first + /// character that is not suitable for the radix is encountered. Acceptable + /// radix values are 2, 8, 10 and 16. It is an error for the value implied by + /// the string to require more bits than numBits. + /// @param strStart the start of the string to be interpreted + /// @param slen the maximum number of characters to interpret + /// @param radix the radix to use for the conversion + /// @brief Construct an ap_private from a string representation. + /// This method does not consider whether it is negative or not. + INLINE ap_private(const char strStart[], uint32_t slen, uint8_t radix) { + set_canary(); + fromString(strStart, slen, radix); + check_canary(); + } + + INLINE void report() { + _AP_ERROR(_AP_W > MAX_MODE(AP_INT_MAX_W) * 1024, + "ap_%sint<%d>: Bitwidth exceeds the " + "default max value %d. Please use macro " + "AP_INT_MAX_W to set a larger max value.", + _AP_S ? "" : "u", _AP_W, MAX_MODE(AP_INT_MAX_W) * 1024); + } + /// This union is used to store the integer value. When the + /// integer bit-width <= 64, it uses VAL, otherwise it uses pVal. + + /// This enum is used to hold the constants we needed for ap_private. + // uint64_t VAL; ///< Used to store the <= 64 bits integer value. + uint64_t pVal[_AP_N]; ///< Used to store the >64 bits integer value. +#ifdef AP_CANARY + uint64_t CANARY; + INLINE void check_canary() { assert(CANARY == (uint64_t)0xDEADBEEFDEADBEEF); } + INLINE void set_canary() { CANARY = (uint64_t)0xDEADBEEFDEADBEEF; } +#else + INLINE void check_canary() {} + INLINE void set_canary() {} +#endif + + public: + typedef typename valtype<8, _AP_S>::Type ValType; + typedef ap_private<_AP_W, _AP_S> Type; + // FIXME remove friend type? 
+  template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+            ap_o_mode _AP_O2, int _AP_N2>
+  friend struct ap_fixed_base;
+  /// return type of variety of operations
+  //----------------------------------------------------------
+  template <int _AP_W2, bool _AP_S2>
+  struct RType {
+    enum {
+      mult_w = _AP_W + _AP_W2,
+      mult_s = _AP_S || _AP_S2,
+      plus_w =
+          AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1,
+      plus_s = _AP_S || _AP_S2,
+      minus_w =
+          AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)) + 1,
+      minus_s = true,
+      div_w = _AP_W + _AP_S2,
+      div_s = _AP_S || _AP_S2,
+      mod_w = AP_MIN(_AP_W, _AP_W2 + (!_AP_S2 && _AP_S)),
+      mod_s = _AP_S,
+      logic_w = AP_MAX(_AP_W + (_AP_S2 && !_AP_S), _AP_W2 + (_AP_S && !_AP_S2)),
+      logic_s = _AP_S || _AP_S2
+    };
+    typedef ap_private<mult_w, mult_s> mult;
+    typedef ap_private<plus_w, plus_s> plus;
+    typedef ap_private<minus_w, minus_s> minus;
+    typedef ap_private<logic_w, logic_s> logic;
+    typedef ap_private<div_w, div_s> div;
+    typedef ap_private<mod_w, mod_s> mod;
+    typedef ap_private<_AP_W, _AP_S> arg1;
+    typedef bool reduce;
+  };
+
+  INLINE uint64_t& get_VAL(void) { return pVal[0]; }
+  INLINE uint64_t get_VAL(void) const { return pVal[0]; }
+  INLINE uint64_t get_VAL(void) const volatile { return pVal[0]; }
+  INLINE void set_VAL(uint64_t value) { pVal[0] = value; }
+  INLINE uint64_t& get_pVal(int index) { return pVal[index]; }
+  INLINE uint64_t* get_pVal() { return pVal; }
+  INLINE const uint64_t* get_pVal() const { return pVal; }
+  INLINE uint64_t get_pVal(int index) const { return pVal[index]; }
+  INLINE uint64_t* get_pVal() const volatile { return pVal; }
+  INLINE uint64_t get_pVal(int index) const volatile { return pVal[index]; }
+  INLINE void set_pVal(int i, uint64_t value) { pVal[i] = value; }
+
+  /// This enum is used to hold the constants we needed for ap_private.
+  enum {
+    APINT_BITS_PER_WORD = sizeof(uint64_t) * 8, ///< Bits in a word
+    APINT_WORD_SIZE = sizeof(uint64_t)          ///< Byte size of a word
+  };
+
+  enum {
+    excess_bits = (_AP_W % APINT_BITS_PER_WORD)
+                      ? APINT_BITS_PER_WORD - (_AP_W % APINT_BITS_PER_WORD)
+                      : 0
+  };
+  static const uint64_t mask = ((uint64_t)~0ULL >> (excess_bits));
+
+ public:
+  // NOTE changed to explicit to be consistent with ap_private
+  explicit INLINE ap_private(const char* val) {
+    set_canary();
+    unsigned char radix = 10;
+    std::string str = ap_private_ops::parseString(val, radix); // determine radix.
+    std::string::size_type pos = str.find('.');
+    // trunc all fraction part, keeping only the integer part
+    if (pos != std::string::npos) str = str.substr(0, pos);
+    ap_private ap_private_val(str, radix);
+    operator=(ap_private_val);
+    report();
+    check_canary();
+  }
+
+  INLINE ap_private(const char* val, unsigned char rd) {
+    set_canary();
+    unsigned char radix = rd;
+    std::string str = ap_private_ops::parseString(val, radix); // determine radix.
+    std::string::size_type pos = str.find('.');
+    // trunc all fraction part, keeping only the integer part
+    if (pos != std::string::npos) str = str.substr(0, pos);
+    ap_private ap_private_val(str, radix);
+    operator=(ap_private_val);
+    report();
+    check_canary();
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private(const _private_range_ref<_AP_W2, _AP_S2>& ref) {
+    set_canary();
+    *this = ref.get();
+    report();
+    check_canary();
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE ap_private(const _private_bit_ref<_AP_W2, _AP_S2>& ref) {
+    set_canary();
+    *this = ((uint64_t)(bool)ref);
+    report();
+    check_canary();
+  }
+
+// template <int _AP_W2, typename _AP_T2, int _AP_W3, typename _AP_T3>
+// INLINE ap_private(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& ref) {
+//   set_canary();
+//   *this = ref.get();
+//   report();
+//   check_canary();
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE ap_private(
+//     const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) {
+//   set_canary();
+//   *this = ((val.operator ap_private<_AP_W2, false>()));
+//   report();
+//   check_canary();
+// }
+//
+// template <int _AP_W2, int _AP_I2, bool _AP_S2, ap_q_mode _AP_Q2,
+//           ap_o_mode _AP_O2, int _AP_N2>
+// INLINE ap_private(
+//     const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) {
+//   set_canary();
+//   *this = (uint64_t)(bool)val;
+//   report();
+//   check_canary();
+// }
+
+  /// Simply makes *this a copy of that.
+  /// @brief Copy Constructor.
+  INLINE ap_private(const ap_private& that) {
+    set_canary();
+    memcpy(pVal, that.get_pVal(), _AP_N * APINT_WORD_SIZE);
+    clearUnusedBits();
+    check_canary();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private(const ap_private<_AP_W1, _AP_S1, false>& that) {
+    set_canary();
+    operator=(that);
+    check_canary();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, false>& that) {
+    set_canary();
+    operator=(const_cast<const ap_private<_AP_W1, _AP_S1, false>&>(that));
+    check_canary();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private(const ap_private<_AP_W1, _AP_S1, true>& that) {
+    set_canary();
+    static const uint64_t that_sign_ext_mask =
+        (_AP_W1 == APINT_BITS_PER_WORD)
+            ? 0
+            : ~0ULL >> (_AP_W1 % APINT_BITS_PER_WORD)
+                          << (_AP_W1 % APINT_BITS_PER_WORD);
+    if (that.isNegative()) {
+      pVal[0] = that.get_VAL() | that_sign_ext_mask;
+      memset(pVal + 1, ~0, sizeof(uint64_t) * (_AP_N - 1));
+    } else {
+      pVal[0] = that.get_VAL();
+      memset(pVal + 1, 0, sizeof(uint64_t) * (_AP_N - 1));
+    }
+    clearUnusedBits();
+    check_canary();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private(const volatile ap_private<_AP_W1, _AP_S1, true>& that) {
+    set_canary();
+    operator=(const_cast<const ap_private<_AP_W1, _AP_S1, true>&>(that));
+    check_canary();
+  }
+
+  /// @brief Destructor.
+  // virtual ~ap_private() {}
+  INLINE ~ap_private() { check_canary(); }
+
+  /// @name Constructors
+  /// @{
+
+  /// Default constructor that creates an uninitialized ap_private. This is
+  /// useful for object deserialization (pair this with the static method
+  /// Read).
+  INLINE ap_private() {
+    set_canary();
+    clearUnusedBits();
+    check_canary();
+  }
+
+  INLINE ap_private(uint64_t* val, uint32_t bits = _AP_W) { assert(0); }
+  INLINE ap_private(const uint64_t* const val, uint32_t bits) { assert(0); }
+
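+// Illustrative note (a sketch, not part of the original patch): this wide
+// specialization keeps the value in pVal[]; the constructors generated below
+// sign-extend a negative initializer across every word, e.g.
+//   ap_private<128, true> w(-2); // pVal[0] == 0xfffffffffffffffe,
+//                                // pVal[1] == 0xffffffffffffffff
+
+/// If isSigned is true then val is treated as if it were a signed value
+/// (i.e. as an int64_t) and the appropriate sign extension to the bit width
+/// will be done. Otherwise, no sign extension occurs (high order bits beyond
+/// the range of val are zero filled).
+/// @param numBits the bit width of the constructed ap_private
+/// @param val the initial value of the ap_private
+/// @param isSigned how to treat signedness of val
+/// @brief Create a new ap_private of numBits width, initialized as val.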
+#define CTOR(TYPE, SIGNED) \ + INLINE ap_private(TYPE val, bool isSigned = SIGNED) { \ + set_canary(); \ + pVal[0] = (ValType)val; \ + if (isSigned && int64_t(pVal[0]) < 0) { \ + memset(pVal + 1, ~0, sizeof(uint64_t) * (_AP_N - 1)); \ + } else { \ + memset(pVal + 1, 0, sizeof(uint64_t) * (_AP_N - 1)); \ + } \ + clearUnusedBits(); \ + check_canary(); \ + } + + CTOR(bool, false) + CTOR(char, CHAR_IS_SIGNED) + CTOR(signed char, true) + CTOR(unsigned char, false) + CTOR(short, true) + CTOR(unsigned short, false) + CTOR(int, true) + CTOR(unsigned int, false) + CTOR(long, true) + CTOR(unsigned long, false) + CTOR(ap_slong, true) + CTOR(ap_ulong, false) +#if 0 + CTOR(half, false) + CTOR(float, false) + CTOR(double, false) +#endif +#undef CTOR + + /// @returns true if the number of bits <= 64, false otherwise. + /// @brief Determine if this ap_private just has one word to store value. + INLINE bool isSingleWord() const { return false; } + + /// @returns the word position for the specified bit position. + /// @brief Determine which word a bit is in. + static INLINE uint32_t whichWord(uint32_t bitPosition) { + // return bitPosition / APINT_BITS_PER_WORD; + return (bitPosition) >> 6; + } + + /// @returns the bit position in a word for the specified bit position + /// in the ap_private. + /// @brief Determine which bit in a word a bit is in. + static INLINE uint32_t whichBit(uint32_t bitPosition) { + // return bitPosition % APINT_BITS_PER_WORD; + return bitPosition & 0x3f; + } + + /// bit at a specific bit position. This is used to mask the bit in the + /// corresponding word. + /// @returns a uint64_t with only bit at "whichBit(bitPosition)" set + /// @brief Get a single bit mask. + static INLINE uint64_t maskBit(uint32_t bitPosition) { + return 1ULL << (whichBit(bitPosition)); + } + + /// @returns the corresponding word for the specified bit position. + /// @brief Get the word corresponding to a bit position + INLINE uint64_t getWord(uint32_t bitPosition) const { + return pVal[whichWord(bitPosition)]; + } + + /// This method is used internally to clear the to "N" bits in the high order + /// word that are not used by the ap_private. This is needed after the most + /// significant word is assigned a value to ensure that those bits are + /// zero'd out. + /// @brief Clear unused high order bits + INLINE void clearUnusedBits(void) volatile +// just for clang compiler +#if defined(__clang__) && !defined(__CLANG_3_1__) + __attribute__((no_sanitize("undefined"))) +#endif + { + pVal[_AP_N - 1] = + _AP_S ? ((((int64_t)pVal[_AP_N - 1]) << (excess_bits)) >> excess_bits) + : (excess_bits + ? ((pVal[_AP_N - 1]) << (excess_bits)) >> (excess_bits) + : pVal[_AP_N - 1]); + } + + INLINE void clearUnusedBitsToZero(void) { pVal[_AP_N - 1] &= mask; } + + INLINE void clearUnusedBitsToOne(void) { pVal[_AP_N - 1] |= mask; } + + /// This is used by the constructors that take string arguments. 
+ /// @brief Convert a char array into an ap_private + INLINE void fromString(const char* str, uint32_t slen, uint8_t radix) { + enum { numbits = _AP_W }; + bool isNeg = str[0] == '-'; + if (isNeg) { + str++; + slen--; + } + + if (str[0] == '0' && (str[1] == 'b' || str[1] == 'B')) { + //if(radix == 0) radix = 2; + _AP_WARNING(radix != 2, "%s seems to have base %d, but %d given.", str, 2, radix); + str += 2; + slen -=2; + } else if (str[0] == '0' && (str[1] == 'o' || str[1] == 'O')) { + //if (radix == 0) radix = 8; + _AP_WARNING(radix != 8, "%s seems to have base %d, but %d given.", str, 8, radix); + str += 2; + slen -=2; + } else if (str[0] == '0' && (str[1] == 'x' || str[1] == 'X')) { + //if (radix == 0) radix = 16; + _AP_WARNING(radix != 16, "%s seems to have base %d, but %d given.", str, 16, radix); + str += 2; + slen -=2; + } else if (str[0] == '0' && (str[1] == 'd' || str[1] == 'D')) { + //if (radix == 0) radix = 10; + _AP_WARNING(radix != 10, "%s seems to have base %d, but %d given.", str, 10, radix); + str += 2; + slen -=2; + } else if (radix == 0) { + //radix = 2; // XXX default value + } + + // Check our assumptions here + assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) && + "Radix should be 2, 8, 10, or 16!"); + assert(str && "String is null?"); + + // skip any leading zero + while (*str == '0' && *(str + 1) != '\0') { + str++; + slen--; + } + assert((slen <= numbits || radix != 2) && "Insufficient bit width"); + assert(((slen - 1) * 3 <= numbits || radix != 8) && + "Insufficient bit width"); + assert(((slen - 1) * 4 <= numbits || radix != 16) && + "Insufficient bit width"); + assert((((slen - 1) * 64) / 22 <= numbits || radix != 10) && + "Insufficient bit width"); + + // clear bits + memset(pVal, 0, _AP_N * sizeof(uint64_t)); + + // Figure out if we can shift instead of multiply + uint32_t shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0); + + // Set up an ap_private for the digit to add outside the loop so we don't + // constantly construct/destruct it. + uint64_t bigVal[_AP_N]; + memset(bigVal, 0, _AP_N * sizeof(uint64_t)); + ap_private<_AP_W, _AP_S> apdigit(getBitWidth(), bigVal); + ap_private<_AP_W, _AP_S> apradix(radix); + + // Enter digit traversal loop + for (unsigned i = 0; i < slen; i++) { + // Get a digit + uint32_t digit = 0; + char cdigit = str[i]; + if (radix == 16) { +#define isxdigit(c) \ + (((c) >= '0' && (c) <= '9') || ((c) >= 'a' && (c) <= 'f') || \ + ((c) >= 'A' && (c) <= 'F')) +#define isdigit(c) ((c) >= '0' && (c) <= '9') + if (!isxdigit(cdigit)) assert(0 && "Invalid hex digit in string"); + if (isdigit(cdigit)) + digit = cdigit - '0'; + else if (cdigit >= 'a') + digit = cdigit - 'a' + 10; + else if (cdigit >= 'A') + digit = cdigit - 'A' + 10; + else + assert(0 && "huh? 
we shouldn't get here");
+      } else if (isdigit(cdigit)) {
+        digit = cdigit - '0';
+      } else if (cdigit != '\0') {
+        assert(0 && "Invalid character in digit string");
+      }
+#undef isxdigit
+#undef isdigit
+      // Shift or multiply the value by the radix
+      if (shift)
+        *this <<= shift;
+      else
+        *this *= apradix;
+
+      // Add in the digit we just interpreted
+      apdigit.set_VAL(digit);
+      *this += apdigit;
+    }
+    // If it's negative, put it in two's complement form
+    if (isNeg) {
+      (*this)--;
+      this->flip();
+    }
+    clearUnusedBits();
+  }
+
+  INLINE ap_private read() volatile { return *this; }
+
+  INLINE void write(const ap_private& op2) volatile { *this = (op2); }
+
+  INLINE operator ValType() const { return get_VAL(); }
+
+  INLINE int to_uchar() const { return (unsigned char)get_VAL(); }
+
+  INLINE int to_char() const { return (signed char)get_VAL(); }
+
+  INLINE int to_ushort() const { return (unsigned short)get_VAL(); }
+
+  INLINE int to_short() const { return (short)get_VAL(); }
+
+  INLINE int to_int() const { return (int)get_VAL(); }
+
+  INLINE unsigned to_uint() const { return (unsigned)get_VAL(); }
+
+  INLINE long to_long() const { return (long)get_VAL(); }
+
+  INLINE unsigned long to_ulong() const { return (unsigned long)get_VAL(); }
+
+  INLINE ap_slong to_int64() const { return (ap_slong)get_VAL(); }
+
+  INLINE ap_ulong to_uint64() const { return (ap_ulong)get_VAL(); }
+
+  INLINE double to_double() const {
+    if (isNegative())
+      return roundToDouble(true);
+    else
+      return roundToDouble(false);
+  }
+
+  INLINE unsigned length() const { return _AP_W; }
+
+  /* Reverse the contents of ap_private instance. I.e. LSB becomes MSB and vice
+   * versa. */
+  INLINE ap_private& reverse() {
+    for (int i = 0; i < _AP_W / 2; ++i) {
+      bool tmp = operator[](i);
+      if (operator[](_AP_W - 1 - i))
+        set(i);
+      else
+        clear(i);
+      if (tmp)
+        set(_AP_W - 1 - i);
+      else
+        clear(_AP_W - 1 - i);
+    }
+    clearUnusedBits();
+    return *this;
+  }
+
+  /* Return true if the value of ap_private instance is zero. */
+  INLINE bool iszero() const { return isMinValue(); }
+
+  INLINE bool to_bool() const { return !iszero(); }
+
+  /* x < 0 */
+  INLINE bool sign() const {
+    if (isNegative()) return true;
+    return false;
+  }
+
+  /* x[i] = !x[i] */
+  INLINE void invert(int i) {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    flip(i);
+  }
+
+  /* x[i] */
+  INLINE bool test(int i) const {
+    assert(i >= 0 && "Attempting to read bit with negative index");
+    assert(i < _AP_W && "Attempting to read bit beyond MSB");
+    return operator[](i);
+  }
+
+  // Set the ith bit into v
+  INLINE void set(int i, bool v) {
+    assert(i >= 0 && "Attempting to write bit with negative index");
+    assert(i < _AP_W && "Attempting to write bit beyond MSB");
+    v ? set(i) : clear(i);
+  }
+
+  // Set the ith bit into v
+  INLINE void set_bit(int i, bool v) {
+    assert(i >= 0 && "Attempting to write bit with negative index");
+    assert(i < _AP_W && "Attempting to write bit beyond MSB");
+    v ? set(i) : clear(i);
+  }
+
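+  // Sketch (illustrative only, not part of the original patch): bit
+  // operations on this wide specialization cross word boundaries
+  // transparently, e.g.
+  //   ap_private<96, false> v(1);
+  //   v.set_bit(95, true); // bits 0 and 95 set, in pVal[0] and pVal[1]
+  //   v.lrotate(1);        // now bits 1 and 0 are set
+
+  // FIXME different argument for different action?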
+ INLINE ap_private& set(uint32_t bitPosition) { + pVal[whichWord(bitPosition)] |= maskBit(bitPosition); + clearUnusedBits(); + return *this; + } + + INLINE void set() { + for (int i = 0; i < _AP_N; ++i) pVal[i] = ~0ULL; + clearUnusedBits(); + } + + // Get the value of ith bit + INLINE bool get(int i) const { + assert(i >= 0 && "Attempting to read bit with negative index"); + assert(i < _AP_W && "Attempting to read bit beyond MSB"); + return ((maskBit(i) & (pVal[whichWord(i)])) != 0); + } + + // Get the value of ith bit + INLINE bool get_bit(int i) const { + assert(i >= 0 && "Attempting to read bit with negative index"); + assert(i < _AP_W && "Attempting to read bit beyond MSB"); + return ((maskBit(i) & (pVal[whichWord(i)])) != 0); + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_private object n places to the left + INLINE void lrotate(int n) { + assert(n >= 0 && "Attempting to shift negative index"); + assert(n < _AP_W && "Shift value larger than bit width"); + operator=(shl(n) | lshr(_AP_W - n)); + } + + // This is used for sc_lv and sc_bv, which is implemented by sc_uint + // Rotate an ap_private object n places to the right + INLINE void rrotate(int n) { + assert(n >= 0 && "Attempting to shift negative index"); + assert(n < _AP_W && "Shift value larger than bit width"); + operator=(lshr(n) | shl(_AP_W - n)); + } + + /// Set the given bit to 0 whose position is given as "bitPosition". + /// @brief Set a given bit to 0. + INLINE ap_private& clear(uint32_t bitPosition) { + pVal[whichWord(bitPosition)] &= ~maskBit(bitPosition); + clearUnusedBits(); + return *this; + } + + /// @brief Set every bit to 0. + INLINE void clear() { memset(pVal, 0, _AP_N * APINT_WORD_SIZE); } + + /// @brief Toggle every bit to its opposite value. + ap_private& flip() { + for (int i = 0; i < _AP_N; ++i) pVal[i] ^= ~0ULL; + clearUnusedBits(); + return *this; + } + + /// @brief Toggles a given bit to its opposite value. + INLINE ap_private& flip(uint32_t bitPosition) { + assert(bitPosition < BitWidth && "Out of the bit-width range!"); + set_bit(bitPosition, !get_bit(bitPosition)); + return *this; + } + + // complements every bit + INLINE void b_not() { flip(); } + + INLINE ap_private getLoBits(uint32_t numBits) const { + return ap_private_ops::lshr(ap_private_ops::shl(*this, _AP_W - numBits), + _AP_W - numBits); + } + + INLINE ap_private getHiBits(uint32_t numBits) const { + return ap_private_ops::lshr(*this, _AP_W - numBits); + } + + // Binary Arithmetic + //----------------------------------------------------------- + +// template +// INLINE ap_private operator&( +// const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { +// return *this & a2.get(); +// } +// +// template +// INLINE ap_private operator|( +// const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { +// return *this | a2.get(); +// } +// +// template +// INLINE ap_private operator^( +// const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3>& a2) { +// return *this ^ a2.get(); +// } + +/// Arithmetic assign +//------------------------------------------------------------- + +#define OP_BIN_LOGIC_ASSIGN_AP(Sym) \ + template \ + INLINE ap_private& operator Sym(const ap_private<_AP_W1, _AP_S1>& RHS) { \ + const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; \ + uint32_t numWords = AESL_std::min((int)_AP_N, _AP_N1); \ + uint32_t i; \ + if (_AP_W != _AP_W1) \ + fprintf(stderr, \ + "Warning! 
Bitsize mismatch for ap_[u]int " #Sym " ap_[u]int.\n"); \
+    for (i = 0; i < numWords; ++i) pVal[i] Sym RHS.get_pVal(i); \
+    if (_AP_N1 < _AP_N) { \
+      uint64_t ext = RHS.isNegative() ? ~0ULL : 0; \
+      for (; i < _AP_N; i++) pVal[i] Sym ext; \
+    } \
+    clearUnusedBits(); \
+    return *this; \
+  }
+
+  OP_BIN_LOGIC_ASSIGN_AP(&=);
+  OP_BIN_LOGIC_ASSIGN_AP(|=);
+  OP_BIN_LOGIC_ASSIGN_AP(^=);
+#undef OP_BIN_LOGIC_ASSIGN_AP
+
+  /// Adds the RHS APInt to this ap_private.
+  /// @returns this, after addition of RHS.
+  /// @brief Addition assignment operator.
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator+=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
+    uint64_t RHSpVal[_AP_N1];
+    for (int i = 0; i < _AP_N1; ++i) RHSpVal[i] = RHS.get_pVal(i);
+    ap_private_ops::add(pVal, pVal, RHSpVal, _AP_N, _AP_N, _AP_N1, _AP_S,
+                        _AP_S1);
+    clearUnusedBits();
+    return *this;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator-=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
+    uint64_t RHSpVal[_AP_N1];
+    for (int i = 0; i < _AP_N1; ++i) RHSpVal[i] = RHS.get_pVal(i);
+    ap_private_ops::sub(pVal, pVal, RHSpVal, _AP_N, _AP_N, _AP_N1, _AP_S,
+                        _AP_S1);
+    clearUnusedBits();
+    return *this;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE ap_private& operator*=(const ap_private<_AP_W1, _AP_S1>& RHS) {
+    // Get some bit facts about LHS and check for zero
+    uint32_t lhsBits = getActiveBits();
+    uint32_t lhsWords = !lhsBits ? 0 : whichWord(lhsBits - 1) + 1;
+    if (!lhsWords) {
+      // 0 * X ===> 0
+      return *this;
+    }
+
+    ap_private dupRHS = RHS;
+    // Get some bit facts about RHS and check for zero
+    uint32_t rhsBits = dupRHS.getActiveBits();
+    uint32_t rhsWords = !rhsBits ? 0 : whichWord(rhsBits - 1) + 1;
+    if (!rhsWords) {
+      // X * 0 ===> 0
+      clear();
+      return *this;
+    }
+
+    // Allocate space for the result
+    uint32_t destWords = rhsWords + lhsWords;
+    uint64_t* dest = (uint64_t*)malloc(destWords * sizeof(uint64_t));
+
+    // Perform the long multiply
+    ap_private_ops::mul(dest, pVal, lhsWords, dupRHS.get_pVal(), rhsWords,
+                        destWords);
+
+    // Copy result back into *this
+    clear();
+    uint32_t wordsToCopy = destWords >= _AP_N ? _AP_N : destWords;
+
+    memcpy(pVal, dest, wordsToCopy * APINT_WORD_SIZE);
+
+    uint64_t ext = (isNegative() ^ RHS.isNegative()) ? ~0ULL : 0ULL;
+    for (int i = wordsToCopy; i < _AP_N; i++) pVal[i] = ext;
+    clearUnusedBits();
+    // delete dest array and return
+    free(dest);
+    return *this;
+  }
+
+#define OP_ASSIGN_AP(Sym) \
+  template <int _AP_W2, bool _AP_S2> \
+  INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \
+    *this = operator Sym(op); \
+    return *this; \
+  }
+
+  OP_ASSIGN_AP(/)
+  OP_ASSIGN_AP(%)
+#undef OP_ASSIGN_AP
+
+#define OP_BIN_LOGIC_AP(Sym) \
+  template <int _AP_W1, bool _AP_S1> \
+  INLINE typename RType<_AP_W1, _AP_S1>::logic operator Sym( \
+      const ap_private<_AP_W1, _AP_S1>& RHS) const { \
+    enum { \
+      numWords = (RType<_AP_W1, _AP_S1>::logic_w + APINT_BITS_PER_WORD - 1) / \
+                 APINT_BITS_PER_WORD \
+    }; \
+    typename RType<_AP_W1, _AP_S1>::logic Result; \
+    uint32_t i; \
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N; \
+    uint32_t min_N = std::min((int)_AP_N, _AP_N1); \
+    uint32_t max_N = std::max((int)_AP_N, _AP_N1); \
+    for (i = 0; i < min_N; ++i) \
+      Result.set_pVal(i, pVal[i] Sym RHS.get_pVal(i)); \
+    if (numWords > i) { \
+      uint64_t ext = ((_AP_N < _AP_N1 && isNegative()) || \
+                      (_AP_N1 < _AP_N && RHS.isNegative())) \
+                         ? ~0ULL \
+                         : 0; \
+      if (_AP_N > _AP_N1) \
+        for (; i < max_N; i++) Result.set_pVal(i, pVal[i] Sym ext); \
+      else \
+        for (; i < max_N; i++) Result.set_pVal(i, RHS.get_pVal(i) Sym ext); \
+      if (numWords > i) { \
+        uint64_t ext2 = ((_AP_N > _AP_N1 && isNegative()) || \
+                         (_AP_N1 > _AP_N && RHS.isNegative())) \
+                            ? ~0ULL \
+                            : 0; \
+        Result.set_pVal(i, ext Sym ext2); \
+      } \
+    } \
+    Result.clearUnusedBits(); \
+    return Result; \
+  }
+
+  OP_BIN_LOGIC_AP(|);
+  OP_BIN_LOGIC_AP(&);
+  OP_BIN_LOGIC_AP(^);
+
+#undef OP_BIN_LOGIC_AP
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE typename RType<_AP_W1, _AP_S1>::plus operator+(
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    typename RType<_AP_W1, _AP_S1>::plus Result, lhs(*this), rhs(RHS);
+    const int Result_AP_N = (RType<_AP_W1, _AP_S1>::plus_w + 63) / 64;
+    ap_private_ops::add(Result.get_pVal(), lhs.get_pVal(), rhs.get_pVal(),
+                        Result_AP_N, Result_AP_N, Result_AP_N, _AP_S, _AP_S1);
+    Result.clearUnusedBits();
+    return Result;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE typename RType<_AP_W1, _AP_S1>::minus operator-(
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    typename RType<_AP_W1, _AP_S1>::minus Result, lhs(*this), rhs(RHS);
+    const int Result_AP_N = (RType<_AP_W1, _AP_S1>::minus_w + 63) / 64;
+    ap_private_ops::sub(Result.get_pVal(), lhs.get_pVal(), rhs.get_pVal(),
+                        Result_AP_N, Result_AP_N, Result_AP_N, _AP_S, _AP_S1);
+    Result.clearUnusedBits();
+    return Result;
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE typename RType<_AP_W1, _AP_S1>::mult operator*(
+      const ap_private<_AP_W1, _AP_S1>& RHS) const {
+    typename RType<_AP_W1, _AP_S1>::mult temp = *this;
+    temp *= RHS;
+    return temp;
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE typename RType<_AP_W2, _AP_S2>::div operator/(
+      const ap_private<_AP_W2, _AP_S2>& op) const {
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        lhs = *this;
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        rhs = op;
+    return typename RType<_AP_W2, _AP_S2>::div(
+        (_AP_S || _AP_S2) ? lhs.sdiv(rhs) : lhs.udiv(rhs));
+  }
+
+  template <int _AP_W2, bool _AP_S2>
+  INLINE typename RType<_AP_W2, _AP_S2>::mod operator%(
+      const ap_private<_AP_W2, _AP_S2>& op) const {
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        lhs = *this;
+    ap_private<AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)),
+               (_AP_W > _AP_W2 ? _AP_S
+                               : (_AP_W2 > _AP_W ? _AP_S2 : _AP_S || _AP_S2))>
+        rhs = op;
+    typename RType<_AP_W2, _AP_S2>::mod res =
+        typename RType<_AP_W2, _AP_S2>::mod(_AP_S ?
lhs.srem(rhs) + : lhs.urem(rhs)); + return res; + } + +#define OP_LEFT_SHIFT_CTYPE(TYPE, SIGNED) \ + INLINE ap_private operator<<(const TYPE op) const { \ + if (op >= _AP_W) return ap_private(0); \ + if (SIGNED && op < 0) return *this >> (0 - op); \ + return shl(op); \ + } + + OP_LEFT_SHIFT_CTYPE(int, true) + // OP_LEFT_SHIFT_CTYPE(bool, false) + OP_LEFT_SHIFT_CTYPE(signed char, true) + OP_LEFT_SHIFT_CTYPE(unsigned char, false) + OP_LEFT_SHIFT_CTYPE(short, true) + OP_LEFT_SHIFT_CTYPE(unsigned short, false) + OP_LEFT_SHIFT_CTYPE(unsigned int, false) + OP_LEFT_SHIFT_CTYPE(long, true) + OP_LEFT_SHIFT_CTYPE(unsigned long, false) + OP_LEFT_SHIFT_CTYPE(unsigned long long, false) + OP_LEFT_SHIFT_CTYPE(long long, true) +#if 0 + OP_LEFT_SHIFT_CTYPE(half, false) + OP_LEFT_SHIFT_CTYPE(float, false) + OP_LEFT_SHIFT_CTYPE(double, false) +#endif +#undef OP_LEFT_SHIFT_CTYPE + + template + INLINE ap_private operator<<(const ap_private<_AP_W2, _AP_S2>& op2) const { + if (_AP_S2 == false) { + uint32_t sh = op2.to_uint(); + return *this << sh; + } else { + int sh = op2.to_int(); + return *this << sh; + } + } + +#define OP_RIGHT_SHIFT_CTYPE(TYPE, SIGNED) \ + INLINE ap_private operator>>(const TYPE op) const { \ + if (op >= _AP_W) { \ + if (isNegative()) \ + return ap_private(-1); \ + else \ + return ap_private(0); \ + } \ + if ((SIGNED) && op < 0) return *this << (0 - op); \ + if (_AP_S) \ + return ashr(op); \ + else \ + return lshr(op); \ + } + + // OP_RIGHT_SHIFT_CTYPE(bool, false) + OP_RIGHT_SHIFT_CTYPE(char, CHAR_IS_SIGNED) + OP_RIGHT_SHIFT_CTYPE(signed char, true) + OP_RIGHT_SHIFT_CTYPE(unsigned char, false) + OP_RIGHT_SHIFT_CTYPE(short, true) + OP_RIGHT_SHIFT_CTYPE(unsigned short, false) + OP_RIGHT_SHIFT_CTYPE(int, true) + OP_RIGHT_SHIFT_CTYPE(unsigned int, false) + OP_RIGHT_SHIFT_CTYPE(long, true) + OP_RIGHT_SHIFT_CTYPE(unsigned long, false) + OP_RIGHT_SHIFT_CTYPE(unsigned long long, false) + OP_RIGHT_SHIFT_CTYPE(long long, true) +#if 0 + OP_RIGHT_SHIFT_CTYPE(half, false) + OP_RIGHT_SHIFT_CTYPE(float, false) + OP_RIGHT_SHIFT_CTYPE(double, false) +#endif +#undef OP_RIGHT_SHIFT_CTYPE + + template + INLINE ap_private operator>>(const ap_private<_AP_W2, _AP_S2>& op2) const { + if (_AP_S2 == false) { + uint32_t sh = op2.to_uint(); + return *this >> sh; + } else { + int sh = op2.to_int(); + return *this >> sh; + } + } + + /// Shift assign + //------------------------------------------------------------------ + // TODO call clearUnusedBits ? +#define OP_ASSIGN_AP(Sym) \ + template \ + INLINE ap_private& operator Sym##=(int op) { \ + *this = operator Sym(op); \ + return *this; \ + } \ + INLINE ap_private& operator Sym##=(unsigned int op) { \ + *this = operator Sym(op); \ + return *this; \ + } \ + template \ + INLINE ap_private& operator Sym##=(const ap_private<_AP_W2, _AP_S2>& op) { \ + *this = operator Sym(op); \ + return *this; \ + } + OP_ASSIGN_AP(>>) + OP_ASSIGN_AP(<<) +#undef OP_ASSIGN_AP + + /// Comparisons + //----------------------------------------------------------------- + INLINE bool operator==(const ap_private& RHS) const { + // Get some facts about the number of bits used in the two operands. + uint32_t n1 = getActiveBits(); + uint32_t n2 = RHS.getActiveBits(); + + // If the number of bits isn't the same, they aren't equal + if (n1 != n2) return false; + + // If the number of bits fits in a word, we only need to compare the low + // word. 
+ if (n1 <= APINT_BITS_PER_WORD) return pVal[0] == RHS.get_pVal(0); + + // Otherwise, compare everything + for (int i = whichWord(n1 - 1); i >= 0; --i) + if (pVal[i] != RHS.get_pVal(i)) return false; + return true; + } + + template + INLINE bool operator==(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W, _AP_W2), + }; + ap_private<_AP_MAX_W, false> lhs(*this); + ap_private<_AP_MAX_W, false> rhs(op); + return lhs == rhs; + } + + INLINE bool operator==(uint64_t Val) const { + uint32_t n = getActiveBits(); + if (n <= APINT_BITS_PER_WORD) + return pVal[0] == Val; + else + return false; + } + + template + INLINE bool operator!=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this == op); + } + + template + INLINE bool operator!=(const ap_private<_AP_W, _AP_S1>& RHS) const { + return !((*this) == RHS); + } + + INLINE bool operator!=(uint64_t Val) const { return !((*this) == Val); } + + template + INLINE bool operator<=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this > op); + } + + INLINE bool operator<(const ap_private& op) const { + return _AP_S ? slt(op) : ult(op); + } + + template + INLINE bool operator<(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) + }; + ap_private<_AP_MAX_W, _AP_S> lhs(*this); + ap_private<_AP_MAX_W, _AP_S2> rhs(op); + if (_AP_S == _AP_S2) + return _AP_S ? lhs.slt(rhs) : lhs.ult(rhs); + else if (_AP_S) + if (_AP_W2 >= _AP_W) + return lhs.ult(rhs); + else + return lhs.slt(rhs); + else if (_AP_W >= _AP_W2) + return lhs.ult(rhs); + else + return lhs.slt(rhs); + } + + template + INLINE bool operator>=(const ap_private<_AP_W2, _AP_S2>& op) const { + return !(*this < op); + } + + INLINE bool operator>(const ap_private& op) const { + return _AP_S ? sgt(op) : ugt(op); + } + + template + INLINE bool operator>(const ap_private<_AP_W2, _AP_S2>& op) const { + enum { + _AP_MAX_W = AP_MAX(_AP_W + (_AP_S || _AP_S2), _AP_W2 + (_AP_S || _AP_S2)) + }; + ap_private<_AP_MAX_W, _AP_S> lhs(*this); + ap_private<_AP_MAX_W, _AP_S2> rhs(op); + if (_AP_S == _AP_S2) + return _AP_S ? 
lhs.sgt(rhs) : lhs.ugt(rhs); + else if (_AP_S) + if (_AP_W2 >= _AP_W) + return lhs.ugt(rhs); + else + return lhs.sgt(rhs); + else if (_AP_W >= _AP_W2) + return lhs.ugt(rhs); + else + return lhs.sgt(rhs); + } + + /// Bit and Part Select + //-------------------------------------------------------------- + INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) { + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> operator()(int Hi, int Lo) const { + return _private_range_ref<_AP_W, _AP_S>( + const_cast*>(this), Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) const { + return _private_range_ref<_AP_W, _AP_S>( + (const_cast*>(this)), Hi, Lo); + } + + INLINE _private_range_ref<_AP_W, _AP_S> range(int Hi, int Lo) { + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> range( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> operator()( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return _private_range_ref<_AP_W, _AP_S>(this, Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> range( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return _private_range_ref<_AP_W, _AP_S>(const_cast(this), Hi, Lo); + } + + template + INLINE _private_range_ref<_AP_W, _AP_S> operator()( + const ap_private<_AP_W2, _AP_S2>& HiIdx, + const ap_private<_AP_W3, _AP_S3>& LoIdx) const { + int Hi = HiIdx.to_int(); + int Lo = LoIdx.to_int(); + return this->range(Hi, Lo); + } + + INLINE _private_bit_ref<_AP_W, _AP_S> operator[](int index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index); + } + + template + INLINE _private_bit_ref<_AP_W, _AP_S> operator[]( + const ap_private<_AP_W2, _AP_S2>& index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); + } + + template + INLINE const _private_bit_ref<_AP_W, _AP_S> operator[]( + const ap_private<_AP_W2, _AP_S2>& index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index.to_int()); + } + + INLINE const _private_bit_ref<_AP_W, _AP_S> operator[](int index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index); + } + + INLINE _private_bit_ref<_AP_W, _AP_S> bit(int index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index); + } + + template + INLINE _private_bit_ref<_AP_W, _AP_S> bit(const ap_private<_AP_W2, _AP_S2>& index) { + return _private_bit_ref<_AP_W, _AP_S>(*this, index.to_int()); + } + + INLINE const _private_bit_ref<_AP_W, _AP_S> bit(int index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index); + } + + template + INLINE const _private_bit_ref<_AP_W, _AP_S> bit( + const ap_private<_AP_W2, _AP_S2>& index) const { + return _private_bit_ref<_AP_W, _AP_S>( + const_cast&>(*this), index.to_int()); + } + +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// concat(ap_private<_AP_W2, _AP_S2>& a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE 
ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// concat(const ap_private<_AP_W2, _AP_S2>& a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(const ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private, _AP_W2, ap_private<_AP_W2, _AP_S2> > +// operator,(const ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(_private_range_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// const_cast&>(*this), +// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(_private_bit_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// const_cast&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { +// return ap_concat_ref<_AP_W, ap_private<_AP_W, _AP_S>, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >(*this, +// a2); +// } +// +// template +// INLINE 
ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) const { +// return ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast&>(*this), +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, ap_private, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, +// a2); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) const { +// return ap_concat_ref< +// _AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast&>(*this), +// const_cast&>( +// a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,( +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, ap_private, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >(*this, a2); +// } + + INLINE ap_private<_AP_W, false> get() const { + ap_private<_AP_W, false> ret(*this); + return ret; + } + + template + INLINE void set(const ap_private<_AP_W3, false>& val) { + operator=(ap_private<_AP_W3, _AP_S>(val)); + } + + /// + /// @name Value Tests + /// + /// This tests the high bit of this ap_private to determine if it is set. + /// @returns true if this ap_private is negative, false otherwise + /// @brief Determine sign of this ap_private. + INLINE bool isNegative() const { + // just for get rid of warnings + enum { shift = (_AP_W - APINT_BITS_PER_WORD * (_AP_N - 1) - 1) }; + static const uint64_t mask = 1ULL << (shift); + return _AP_S && (pVal[_AP_N - 1] & mask); + } + + /// This tests the high bit of the ap_private to determine if it is unset. + /// @brief Determine if this ap_private Value is positive (not negative). + INLINE bool isPositive() const { return !isNegative(); } + + /// This tests if the value of this ap_private is strictly positive (> 0). + /// @returns true if this ap_private is Positive and not zero. + /// @brief Determine if this ap_private Value is strictly positive. + INLINE bool isStrictlyPositive() const { + return isPositive() && (*this) != 0; + } + + /// This checks to see if the value has all bits of the ap_private are set or + /// not. + /// @brief Determine if all bits are set + INLINE bool isAllOnesValue() const { return countPopulation() == _AP_W; } + + /// This checks to see if the value of this ap_private is the maximum unsigned + /// value for the ap_private's bit width. + /// @brief Determine if this is the largest unsigned value. + INLINE bool isMaxValue() const { return countPopulation() == _AP_W; } + + /// This checks to see if the value of this ap_private is the maximum signed + /// value for the ap_private's bit width. + /// @brief Determine if this is the largest signed value. 
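+  /// e.g., for a 128-bit ap_private this is the pattern with the sign bit
+  /// clear and the remaining 127 bits set, i.e. 2^127 - 1.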
+ INLINE bool isMaxSignedValue() const { + return !isNegative() && countPopulation() == _AP_W - 1; + } + + /// This checks to see if the value of this ap_private is the minimum unsigned + /// value for the ap_private's bit width. + /// @brief Determine if this is the smallest unsigned value. + INLINE bool isMinValue() const { return countPopulation() == 0; } + + /// This checks to see if the value of this ap_private is the minimum signed + /// value for the ap_private's bit width. + /// @brief Determine if this is the smallest signed value. + INLINE bool isMinSignedValue() const { + return isNegative() && countPopulation() == 1; + } + + /// This function returns a pointer to the internal storage of the ap_private. + /// This is useful for writing out the ap_private in binary form without any + /// conversions. + INLINE const uint64_t* getRawData() const { return &pVal[0]; } + + // Square Root - this method computes and returns the square root of "this". + // Three mechanisms are used for computation. For small values (<= 5 bits), + // a table lookup is done. This gets some performance for common cases. For + // values using less than 52 bits, the value is converted to double and then + // the libc sqrt function is called. The result is rounded and then converted + // back to a uint64_t which is then used to construct the result. Finally, + // the Babylonian method for computing square roots is used. + INLINE ap_private sqrt() const { + // Determine the magnitude of the value. + uint32_t magnitude = getActiveBits(); + + // Use a fast table for some small values. This also gets rid of some + // rounding errors in libc sqrt for small values. + if (magnitude <= 5) { + static const uint8_t results[32] = { + /* 0 */ 0, + /* 1- 2 */ 1, 1, + /* 3- 6 */ 2, 2, 2, 2, + /* 7-12 */ 3, 3, 3, 3, 3, 3, + /* 13-20 */ 4, 4, 4, 4, 4, 4, 4, 4, + /* 21-30 */ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + /* 31 */ 6}; + return ap_private<_AP_W, _AP_S>(/*BitWidth,*/ results[get_VAL()]); + } + + // If the magnitude of the value fits in less than 52 bits (the precision of + // an IEEE double precision floating point value), then we can use the + // libc sqrt function which will probably use a hardware sqrt computation. + // This should be faster than the algorithm below. + if (magnitude < 52) { +#ifdef _MSC_VER + // Amazingly, VC++ doesn't have round(). + return ap_private<_AP_W, _AP_S>(/*BitWidth,*/ + uint64_t(::sqrt(double(get_VAL()))) + + 0.5); +#else + return ap_private<_AP_W, _AP_S>(/*BitWidth,*/ + uint64_t( + ::round(::sqrt(double(get_VAL()))))); +#endif + } + + // Okay, all the short cuts are exhausted. We must compute it. The following + // is a classical Babylonian method for computing the square root. This code + // was adapted to APINt from a wikipedia article on such computations. + // See http://www.wikipedia.org/ and go to the page named + // Calculate_an_integer_square_root. + uint32_t nbits = BitWidth, i = 4; + ap_private<_AP_W, _AP_S> testy(16); + ap_private<_AP_W, _AP_S> x_old(/*BitWidth,*/ 1); + ap_private<_AP_W, _AP_S> x_new(0); + ap_private<_AP_W, _AP_S> two(/*BitWidth,*/ 2); + + // Select a good starting value using binary logarithms. 
+ for (;; i += 2, testy = testy.shl(2)) + if (i >= nbits || this->ule(testy)) { + x_old = x_old.shl(i / 2); + break; + } + + // Use the Babylonian method to arrive at the integer square root: + for (;;) { + x_new = (this->udiv(x_old) + x_old).udiv(two); + if (x_old.ule(x_new)) break; + x_old = x_new; + } + + // Make sure we return the closest approximation + // NOTE: The rounding calculation below is correct. It will produce an + // off-by-one discrepancy with results from pari/gp. That discrepancy has + // been + // determined to be a rounding issue with pari/gp as it begins to use a + // floating point representation after 192 bits. There are no discrepancies + // between this algorithm and pari/gp for bit widths < 192 bits. + ap_private<_AP_W, _AP_S> square(x_old * x_old); + ap_private<_AP_W, _AP_S> nextSquare((x_old + 1) * (x_old + 1)); + if (this->ult(square)) + return x_old; + else if (this->ule(nextSquare)) { + ap_private<_AP_W, _AP_S> midpoint((nextSquare - square).udiv(two)); + ap_private<_AP_W, _AP_S> offset(*this - square); + if (offset.ult(midpoint)) + return x_old; + else + return x_old + 1; + } else + assert(0 && "Error in ap_private<_AP_W, _AP_S>::sqrt computation"); + return x_old + 1; + } + + /// + /// @Assignment Operators + /// + /// @returns *this after assignment of RHS. + /// @brief Copy assignment operator. + INLINE ap_private& operator=(const ap_private& RHS) { + if (this != &RHS) memcpy(pVal, RHS.get_pVal(), _AP_N * APINT_WORD_SIZE); + clearUnusedBits(); + return *this; + } + INLINE ap_private& operator=(const volatile ap_private& RHS) { + if (this != &RHS) + for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); + clearUnusedBits(); + return *this; + } + INLINE void operator=(const ap_private& RHS) volatile { + if (this != &RHS) + for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); + clearUnusedBits(); + } + INLINE void operator=(const volatile ap_private& RHS) volatile { + if (this != &RHS) + for (int i = 0; i < _AP_N; ++i) pVal[i] = RHS.get_pVal(i); + clearUnusedBits(); + } + + template + INLINE ap_private& operator=(const ap_private<_AP_W1, _AP_S1>& RHS) { + if (_AP_S1) + cpSextOrTrunc(RHS); + else + cpZextOrTrunc(RHS); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const volatile ap_private<_AP_W1, _AP_S1>& RHS) { + if (_AP_S1) + cpSextOrTrunc(RHS); + else + cpZextOrTrunc(RHS); + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + *this = ap_private<_AP_W2, false>(op2); + return *this; + } + +#if 0 + template + INLINE ap_private& operator=(const ap_private<_AP_W1, _AP_S1, true>& RHS) { + static const uint64_t that_sign_ext_mask = (_AP_W1==APINT_BITS_PER_WORD)?0:~0ULL>>(_AP_W1%APINT_BITS_PER_WORD)<<(_AP_W1%APINT_BITS_PER_WORD); + if (RHS.isNegative()) { + pVal[0] = RHS.get_VAL() | that_sign_ext_mask; + memset(pVal+1,~0, APINT_WORD_SIZE*(_AP_N-1)); + } else { + pVal[0] = RHS.get_VAL(); + memset(pVal+1, 0, APINT_WORD_SIZE*(_AP_N-1)); + } + clearUnusedBits(); + return *this; + } + + template + INLINE ap_private& operator=(const volatile ap_private<_AP_W1, _AP_S1, true>& RHS) { + static const uint64_t that_sign_ext_mask = (_AP_W1==APINT_BITS_PER_WORD)?0:~0ULL>>(_AP_W1%APINT_BITS_PER_WORD)<<(_AP_W1%APINT_BITS_PER_WORD); + if (RHS.isNegative()) { + pVal[0] = RHS.get_VAL() | that_sign_ext_mask; + memset(pVal+1,~0, APINT_WORD_SIZE*(_AP_N-1)); + } else { + pVal[0] = RHS.get_VAL(); + memset(pVal+1, 0, APINT_WORD_SIZE*(_AP_N-1)); + } + 
clearUnusedBits(); + return *this; + } +#endif + +/// from all c types. +#define ASSIGN_OP_FROM_INT(C_TYPE, _AP_W2, _AP_S2) \ + INLINE ap_private& operator=(const C_TYPE rhs) { \ + ap_private<(_AP_W2), (_AP_S2)> tmp = rhs; \ + operator=(tmp); \ + return *this; \ + } + + ASSIGN_OP_FROM_INT(bool, 1, false) + ASSIGN_OP_FROM_INT(char, 8, CHAR_IS_SIGNED) + ASSIGN_OP_FROM_INT(signed char, 8, true) + ASSIGN_OP_FROM_INT(unsigned char, 8, false) + ASSIGN_OP_FROM_INT(short, sizeof(short) * 8, true) + ASSIGN_OP_FROM_INT(unsigned short, sizeof(unsigned short) * 8, false) + ASSIGN_OP_FROM_INT(int, sizeof(int) * 8, true) + ASSIGN_OP_FROM_INT(unsigned int, sizeof(unsigned int) * 8, false) + ASSIGN_OP_FROM_INT(long, sizeof(long) * 8, true) + ASSIGN_OP_FROM_INT(unsigned long, sizeof(unsigned long) * 8, false) + ASSIGN_OP_FROM_INT(ap_slong, sizeof(ap_slong) * 8, true) + ASSIGN_OP_FROM_INT(ap_ulong, sizeof(ap_ulong) * 8, false) +#undef ASSIGN_OP_FROM_INT + + /// from c string. + // XXX this is a must, to prevent pointer being converted to bool. + INLINE ap_private& operator=(const char* s) { + ap_private tmp(s); // XXX direct initialization, as ctor is explicit. + operator=(tmp); + return *this; + } + + /// + /// @name Unary Operators + /// + /// @returns a new ap_private value representing *this incremented by one + /// @brief Postfix increment operator. + INLINE const ap_private operator++(int) { + ap_private API(*this); + ++(*this); + return API; + } + + /// @returns *this incremented by one + /// @brief Prefix increment operator. + INLINE ap_private& operator++() { + ap_private_ops::add_1(pVal, pVal, _AP_N, 1); + clearUnusedBits(); + return *this; + } + + /// @returns a new ap_private representing *this decremented by one. + /// @brief Postfix decrement operator. + INLINE const ap_private operator--(int) { + ap_private API(*this); + --(*this); + return API; + } + + /// @returns *this decremented by one. + /// @brief Prefix decrement operator. + INLINE ap_private& operator--() { + ap_private_ops::sub_1(pVal, _AP_N, 1); + clearUnusedBits(); + return *this; + } + + /// Performs a bitwise complement operation on this ap_private. + /// @returns an ap_private that is the bitwise complement of *this + /// @brief Unary bitwise complement operator. + INLINE ap_private<_AP_W + !_AP_S, true> operator~() const { + ap_private<_AP_W + !_AP_S, true> Result(*this); + Result.flip(); + return Result; + } + + /// Negates *this using two's complement logic. + /// @returns An ap_private value representing the negation of *this. + /// @brief Unary negation operator + INLINE typename RType<1, false>::minus operator-() const { + return ap_private<1, false>(0) - (*this); + } + + /// Performs logical negation operation on this ap_private. + /// @returns true if *this is zero, false otherwise. + /// @brief Logical negation operator. 
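+  /// e.g., for ap_private<128, false> x(0), !x yields true; after x.set(5)
+  /// it yields false.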
+ INLINE bool operator!() const { + for (int i = 0; i < _AP_N; ++i) + if (pVal[i]) return false; + return true; + } + + template + INLINE ap_private<_AP_W, _AP_S || _AP_S1> And( + const ap_private<_AP_W, _AP_S1>& RHS) const { + return this->operator&(RHS); + } + template + INLINE ap_private Or(const ap_private<_AP_W, _AP_S1>& RHS) const { + return this->operator|(RHS); + } + template + INLINE ap_private Xor(const ap_private<_AP_W, _AP_S1>& RHS) const { + return this->operator^(RHS); + } + + INLINE ap_private Mul(const ap_private& RHS) const { + ap_private Result(*this); + Result *= RHS; + return Result; + } + + INLINE ap_private Add(const ap_private& RHS) const { + ap_private Result(0); + ap_private_ops::add(Result.get_pVal(), pVal, RHS.get_pVal(), _AP_N, _AP_N, + _AP_N, _AP_S, _AP_S); + Result.clearUnusedBits(); + return Result; + } + + INLINE ap_private Sub(const ap_private& RHS) const { + ap_private Result(0); + ap_private_ops::sub(Result.get_pVal(), pVal, RHS.get_pVal(), _AP_N, _AP_N, + _AP_N, _AP_S, _AP_S); + Result.clearUnusedBits(); + return Result; + } + + /// Arithmetic right-shift this ap_private by shiftAmt. + /// @brief Arithmetic right-shift function. + INLINE ap_private ashr(uint32_t shiftAmt) const { + assert(shiftAmt <= BitWidth && "Invalid shift amount, too big"); + // Handle a degenerate case + if (shiftAmt == 0) return ap_private(*this); + + // If all the bits were shifted out, the result is, technically, undefined. + // We return -1 if it was negative, 0 otherwise. We check this early to + // avoid + // issues in the algorithm below. + if (shiftAmt == BitWidth) { + if (isNegative()) + return ap_private(-1); + else + return ap_private(0); + } + + // Create some space for the result. + ap_private Retval(0); + uint64_t* val = Retval.get_pVal(); + + // Compute some values needed by the following shift algorithms + uint32_t wordShift = + shiftAmt % APINT_BITS_PER_WORD; // bits to shift per word + uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; // word offset for shift + uint32_t breakWord = _AP_N - 1 - offset; // last word affected + uint32_t bitsInWord = whichBit(BitWidth); // how many bits in last word? + if (bitsInWord == 0) bitsInWord = APINT_BITS_PER_WORD; + + // If we are shifting whole words, just move whole words + if (wordShift == 0) { + // Move the words containing significant bits + for (uint32_t i = 0; i <= breakWord; ++i) + val[i] = pVal[i + offset]; // move whole word + + // Adjust the top significant word for sign bit fill, if negative + if (isNegative()) + if (bitsInWord < APINT_BITS_PER_WORD) + val[breakWord] |= ~0ULL << (bitsInWord); // set high bits + } else { + // Shift the low order words + for (uint32_t i = 0; i < breakWord; ++i) { + // This combines the shifted corresponding word with the low bits from + // the next word (shifted into this word's high bits). + val[i] = ((pVal[i + offset]) >> (wordShift)); + val[i] |= ((pVal[i + offset + 1]) << (APINT_BITS_PER_WORD - wordShift)); + } + + // Shift the break word. In this case there are no bits from the next word + // to include in this word. + val[breakWord] = (pVal[breakWord + offset]) >> (wordShift); + + // Deal with sign extenstion in the break word, and possibly the word + // before + // it. 
+ if (isNegative()) { + if (wordShift > bitsInWord) { + if (breakWord > 0) + val[breakWord - 1] |= + ~0ULL << (APINT_BITS_PER_WORD - (wordShift - bitsInWord)); + val[breakWord] |= ~0ULL; + } else + val[breakWord] |= (~0ULL << (bitsInWord - wordShift)); + } + } + + // Remaining words are 0 or -1, just assign them. + uint64_t fillValue = (isNegative() ? ~0ULL : 0); + for (int i = breakWord + 1; i < _AP_N; ++i) val[i] = fillValue; + Retval.clearUnusedBits(); + return Retval; + } + + /// Logical right-shift this ap_private by shiftAmt. + /// @brief Logical right-shift function. + INLINE ap_private lshr(uint32_t shiftAmt) const { + // If all the bits were shifted out, the result is 0. This avoids issues + // with shifting by the size of the integer type, which produces undefined + // results. We define these "undefined results" to always be 0. + if (shiftAmt == BitWidth) return ap_private(0); + + // If none of the bits are shifted out, the result is *this. This avoids + // issues with shifting byt he size of the integer type, which produces + // undefined results in the code below. This is also an optimization. + if (shiftAmt == 0) return ap_private(*this); + + // Create some space for the result. + ap_private Retval(0); + uint64_t* val = Retval.get_pVal(); + + // If we are shifting less than a word, compute the shift with a simple + // carry + if (shiftAmt < APINT_BITS_PER_WORD) { + uint64_t carry = 0; + for (int i = _AP_N - 1; i >= 0; --i) { + val[i] = ((pVal[i]) >> (shiftAmt)) | carry; + carry = (pVal[i]) << (APINT_BITS_PER_WORD - shiftAmt); + } + Retval.clearUnusedBits(); + return Retval; + } + + // Compute some values needed by the remaining shift algorithms + uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD; + uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; + + // If we are shifting whole words, just move whole words + if (wordShift == 0) { + for (uint32_t i = 0; i < _AP_N - offset; ++i) val[i] = pVal[i + offset]; + for (uint32_t i = _AP_N - offset; i < _AP_N; i++) val[i] = 0; + Retval.clearUnusedBits(); + return Retval; + } + + // Shift the low order words + uint32_t breakWord = _AP_N - offset - 1; + for (uint32_t i = 0; i < breakWord; ++i) + val[i] = ((pVal[i + offset]) >> (wordShift)) | + ((pVal[i + offset + 1]) << (APINT_BITS_PER_WORD - wordShift)); + // Shift the break word. + val[breakWord] = (pVal[breakWord + offset]) >> (wordShift); + + // Remaining words are 0 + for (int i = breakWord + 1; i < _AP_N; ++i) val[i] = 0; + Retval.clearUnusedBits(); + return Retval; + } + + /// Left-shift this ap_private by shiftAmt. + /// @brief Left-shift function. + INLINE ap_private shl(uint32_t shiftAmt) const { + assert(shiftAmt <= BitWidth && "Invalid shift amount, too big"); + // If all the bits were shifted out, the result is 0. This avoids issues + // with shifting by the size of the integer type, which produces undefined + // results. We define these "undefined results" to always be 0. + if (shiftAmt == BitWidth) return ap_private(0); + + // If none of the bits are shifted out, the result is *this. This avoids a + // lshr by the words size in the loop below which can produce incorrect + // results. It also avoids the expensive computation below for a common + // case. + if (shiftAmt == 0) return ap_private(*this); + + // Create some space for the result. 
+ ap_private Retval(0); + uint64_t* val = Retval.get_pVal(); + // If we are shifting less than a word, do it the easy way + if (shiftAmt < APINT_BITS_PER_WORD) { + uint64_t carry = 0; + for (int i = 0; i < _AP_N; i++) { + val[i] = ((pVal[i]) << (shiftAmt)) | carry; + carry = (pVal[i]) >> (APINT_BITS_PER_WORD - shiftAmt); + } + Retval.clearUnusedBits(); + return Retval; + } + + // Compute some values needed by the remaining shift algorithms + uint32_t wordShift = shiftAmt % APINT_BITS_PER_WORD; + uint32_t offset = shiftAmt / APINT_BITS_PER_WORD; + + // If we are shifting whole words, just move whole words + if (wordShift == 0) { + for (uint32_t i = 0; i < offset; i++) val[i] = 0; + for (int i = offset; i < _AP_N; i++) val[i] = pVal[i - offset]; + Retval.clearUnusedBits(); + return Retval; + } + + // Copy whole words from this to Result. + uint32_t i = _AP_N - 1; + for (; i > offset; --i) + val[i] = (pVal[i - offset]) << (wordShift) | + (pVal[i - offset - 1]) >> (APINT_BITS_PER_WORD - wordShift); + val[offset] = (pVal[0]) << (wordShift); + for (i = 0; i < offset; ++i) val[i] = 0; + Retval.clearUnusedBits(); + return Retval; + } + + INLINE ap_private rotl(uint32_t rotateAmt) const { + if (rotateAmt == 0) return ap_private(*this); + // Don't get too fancy, just use existing shift/or facilities + ap_private hi(*this); + ap_private lo(*this); + hi.shl(rotateAmt); + lo.lshr(BitWidth - rotateAmt); + return hi | lo; + } + + INLINE ap_private rotr(uint32_t rotateAmt) const { + if (rotateAmt == 0) return ap_private(*this); + // Don't get too fancy, just use existing shift/or facilities + ap_private hi(*this); + ap_private lo(*this); + lo.lshr(rotateAmt); + hi.shl(BitWidth - rotateAmt); + return hi | lo; + } + + /// Perform an unsigned divide operation on this ap_private by RHS. Both this + /// and + /// RHS are treated as unsigned quantities for purposes of this division. + /// @returns a new ap_private value containing the division result + /// @brief Unsigned division operation. + INLINE ap_private udiv(const ap_private& RHS) const { + // Get some facts about the LHS and RHS number of bits and words + uint32_t rhsBits = RHS.getActiveBits(); + uint32_t rhsWords = !rhsBits ? 0 : (whichWord(rhsBits - 1) + 1); + assert(rhsWords && "Divided by zero???"); + uint32_t lhsBits = this->getActiveBits(); + uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + + // Deal with some degenerate cases + if (!lhsWords) + // 0 / X ===> 0 + return ap_private(0); + else if (lhsWords < rhsWords || this->ult(RHS)) { + // X / Y ===> 0, iff X < Y + return ap_private(0); + } else if (*this == RHS) { + // X / X ===> 1 + return ap_private(1); + } else if (lhsWords == 1 && rhsWords == 1) { + // All high words are zero, just use native divide + return ap_private(this->pVal[0] / RHS.get_pVal(0)); + } + + // We have to compute it the hard way. Invoke the Knuth divide algorithm. + ap_private Quotient(0); // to hold result. + ap_private_ops::divide(*this, lhsWords, RHS, rhsWords, &Quotient, + (ap_private*)0); + return Quotient; + } + + /// Signed divide this ap_private by ap_private RHS. + /// @brief Signed division function for ap_private. + INLINE ap_private sdiv(const ap_private& RHS) const { + if (isNegative()) + if (RHS.isNegative()) + return (-(*this)).udiv(-RHS); + else + return -((-(*this)).udiv(RHS)); + else if (RHS.isNegative()) + return -(this->udiv((ap_private)(-RHS))); + return this->udiv(RHS); + } + + /// Perform an unsigned remainder operation on this ap_private with RHS being + /// the + /// divisor. 
Both this and RHS are treated as unsigned quantities for purposes + /// of this operation. Note that this is a true remainder operation and not + /// a modulo operation because the sign follows the sign of the dividend + /// which is *this. + /// @returns a new ap_private value containing the remainder result + /// @brief Unsigned remainder operation. + INLINE ap_private urem(const ap_private& RHS) const { + // Get some facts about the LHS + uint32_t lhsBits = getActiveBits(); + uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + + // Get some facts about the RHS + uint32_t rhsBits = RHS.getActiveBits(); + uint32_t rhsWords = !rhsBits ? 0 : (whichWord(rhsBits - 1) + 1); + assert(rhsWords && "Performing remainder operation by zero ???"); + + // Check the degenerate cases + if (lhsWords == 0) { + // 0 % Y ===> 0 + return ap_private(0); + } else if (lhsWords < rhsWords || this->ult(RHS)) { + // X % Y ===> X, iff X < Y + return *this; + } else if (*this == RHS) { + // X % X == 0; + return ap_private(0); + } else if (lhsWords == 1) { + // All high words are zero, just use native remainder + return ap_private(pVal[0] % RHS.get_pVal(0)); + } + + // We have to compute it the hard way. Invoke the Knuth divide algorithm. + ap_private Remainder(0); + ap_private_ops::divide(*this, lhsWords, RHS, rhsWords, (ap_private*)(0), + &Remainder); + return Remainder; + } + + INLINE ap_private urem(uint64_t RHS) const { + // Get some facts about the LHS + uint32_t lhsBits = getActiveBits(); + uint32_t lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + // Get some facts about the RHS + uint32_t rhsWords = 1; //! rhsBits ? 0 : (ap_private<_AP_W, + //! _AP_S>::whichWord(rhsBits - 1) + 1); + assert(rhsWords && "Performing remainder operation by zero ???"); + // Check the degenerate cases + if (lhsWords == 0) { + // 0 % Y ===> 0 + return ap_private(0); + } else if (lhsWords < rhsWords || this->ult(RHS)) { + // X % Y ===> X, iff X < Y + return *this; + } else if (*this == RHS) { + // X % X == 0; + return ap_private(0); + } else if (lhsWords == 1) { + // All high words are zero, just use native remainder + return ap_private(pVal[0] % RHS); + } + + // We have to compute it the hard way. Invoke the Knuth divide algorithm. + ap_private Remainder(0); + divide(*this, lhsWords, RHS, (ap_private*)(0), &Remainder); + return Remainder; + } + + /// Signed remainder operation on ap_private. + /// @brief Function for signed remainder operation. + INLINE ap_private srem(const ap_private& RHS) const { + if (isNegative()) { + ap_private lhs = -(*this); + if (RHS.isNegative()) { + ap_private rhs = -RHS; + return -(lhs.urem(rhs)); + } else + return -(lhs.urem(RHS)); + } else if (RHS.isNegative()) { + ap_private rhs = -RHS; + return this->urem(rhs); + } + return this->urem(RHS); + } + + /// Signed remainder operation on ap_private. + /// @brief Function for signed remainder operation. + INLINE ap_private srem(int64_t RHS) const { + if (isNegative()) + if (RHS < 0) + return -((-(*this)).urem(-RHS)); + else + return -((-(*this)).urem(RHS)); + else if (RHS < 0) + return this->urem(-RHS); + return this->urem(RHS); + } + + /// Compares this ap_private with RHS for the validity of the equality + /// relationship. + /// @returns true if *this == Val + /// @brief Equality comparison. + template + INLINE bool eq(const ap_private<_AP_W, _AP_S1>& RHS) const { + return (*this) == RHS; + } + + /// Compares this ap_private with RHS for the validity of the inequality + /// relationship. 
+  /// @returns true if *this != Val
+  /// @brief Inequality comparison
+  template <bool _AP_S1>
+  INLINE bool ne(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return !((*this) == RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// the validity of the less-than relationship.
+  /// @returns true if *this < RHS when both are considered unsigned.
+  /// @brief Unsigned less than comparison
+  template <bool _AP_S1>
+  INLINE bool ult(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    // Get active bit length of both operands
+    uint32_t n1 = getActiveBits();
+    uint32_t n2 = RHS.getActiveBits();
+
+    // If magnitude of LHS is less than RHS, return true.
+    if (n1 < n2) return true;
+
+    // If magnitude of RHS is greater than LHS, return false.
+    if (n2 < n1) return false;
+
+    // If they both fit in a word, just compare the low order word
+    if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD)
+      return pVal[0] < RHS.get_pVal(0);
+
+    // Otherwise, compare all words
+    uint32_t topWord = whichWord(AESL_std::max(n1, n2) - 1);
+    for (int i = topWord; i >= 0; --i) {
+      if (pVal[i] > RHS.get_pVal(i)) return false;
+      if (pVal[i] < RHS.get_pVal(i)) return true;
+    }
+    return false;
+  }
+
+  INLINE bool ult(uint64_t RHS) const {
+    // Get active bit length of both operands
+    uint32_t n1 = getActiveBits();
+    uint32_t n2 =
+        64 - ap_private_ops::CountLeadingZeros_64(RHS); // RHS.getActiveBits();
+
+    // If magnitude of LHS is less than RHS, return true.
+    if (n1 < n2) return true;
+
+    // If magnitude of RHS is greater than LHS, return false.
+    if (n2 < n1) return false;
+
+    // If they both fit in a word, just compare the low order word
+    if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD)
+      return pVal[0] < RHS;
+    assert(0);
+    return false; // unreachable: n2 <= 64 always, so n1 > 64 is caught above
+  }
+
+  template <bool _AP_S1>
+  INLINE bool slt(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    ap_private lhs(*this);
+    ap_private<_AP_W, _AP_S1> rhs(RHS);
+    bool lhsNeg = isNegative();
+    bool rhsNeg = rhs.isNegative();
+    if (lhsNeg) {
+      // Sign bit is set so perform two's complement to make it positive
+      lhs.flip();
+      lhs++;
+    }
+    if (rhsNeg) {
+      // Sign bit is set so perform two's complement to make it positive
+      rhs.flip();
+      rhs++;
+    }
+
+    // Now we have unsigned values to compare so do the comparison if necessary
+    // based on the negativeness of the values.
+    if (lhsNeg)
+      if (rhsNeg)
+        return lhs.ugt(rhs);
+      else
+        return true;
+    else if (rhsNeg)
+      return false;
+    else
+      return lhs.ult(rhs);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// validity of the less-or-equal relationship.
+  /// @returns true if *this <= RHS when both are considered unsigned.
+  /// @brief Unsigned less or equal comparison
+  template <bool _AP_S1>
+  INLINE bool ule(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return ult(RHS) || eq(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the less-or-equal relationship.
+  /// @returns true if *this <= RHS when both are considered signed.
+  /// @brief Signed less or equal comparison
+  template <bool _AP_S1>
+  INLINE bool sle(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return slt(RHS) || eq(RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// the validity of the greater-than relationship.
+  /// @returns true if *this > RHS when both are considered unsigned.
+  /// @brief Unsigned greater than comparison
+  template <bool _AP_S1>
+  INLINE bool ugt(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return !ult(RHS) && !eq(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// the validity of the greater-than relationship.
+  /// @returns true if *this > RHS when both are considered signed.
+  /// @brief Signed greater than comparison
+  template <bool _AP_S1>
+  INLINE bool sgt(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return !slt(RHS) && !eq(RHS);
+  }
+
+  /// Regards both *this and RHS as unsigned quantities and compares them for
+  /// validity of the greater-or-equal relationship.
+  /// @returns true if *this >= RHS when both are considered unsigned.
+  /// @brief Unsigned greater or equal comparison
+  template <bool _AP_S1>
+  INLINE bool uge(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return !ult(RHS);
+  }
+
+  /// Regards both *this and RHS as signed quantities and compares them for
+  /// validity of the greater-or-equal relationship.
+  /// @returns true if *this >= RHS when both are considered signed.
+  /// @brief Signed greater or equal comparison
+  template <bool _AP_S1>
+  INLINE bool sge(const ap_private<_AP_W, _AP_S1>& RHS) const {
+    return !slt(RHS);
+  }
+
+  // Sign extend to a new width.
+  template <int _AP_W1, bool _AP_S1>
+  INLINE void cpSext(const ap_private<_AP_W1, _AP_S1>& that) {
+    assert(_AP_W1 < BitWidth && "Invalid ap_private SignExtend request");
+    assert(_AP_W1 <= MAX_INT_BITS && "Too many bits");
+    // If the sign bit isn't set, this is the same as zext.
+    if (!that.isNegative()) {
+      cpZext(that);
+      return;
+    }
+
+    // The sign bit is set. First, get some facts
+    enum { wordBits = _AP_W1 % APINT_BITS_PER_WORD };
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
+    // Mask the high order word appropriately
+    if (_AP_N1 == _AP_N) {
+      enum { newWordBits = _AP_W % APINT_BITS_PER_WORD };
+      // The extension is contained to the wordsBefore-1th word.
+      static const uint64_t mask = wordBits ? (~0ULL << (wordBits)) : 0ULL;
+      for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i);
+      pVal[_AP_N - 1] |= mask;
+      return;
+    }
+
+    enum { newWordBits = _AP_W % APINT_BITS_PER_WORD };
+    // The extension is contained to the wordsBefore-1th word.
+    static const uint64_t mask = wordBits ? (~0ULL << (wordBits)) : 0ULL;
+    int i;
+    for (i = 0; i < _AP_N1; ++i) pVal[i] = that.get_pVal(i);
+    pVal[i - 1] |= mask;
+    for (; i < _AP_N - 1; i++) pVal[i] = ~0ULL;
+    pVal[i] = ~0ULL;
+    clearUnusedBits();
+    return;
+  }
+
+  // Zero extend to a new width.
+  template <int _AP_W1, bool _AP_S1>
+  INLINE void cpZext(const ap_private<_AP_W1, _AP_S1>& that) {
+    assert(_AP_W1 < BitWidth && "Invalid ap_private ZeroExtend request");
+    assert(_AP_W1 <= MAX_INT_BITS && "Too many bits");
+    const int _AP_N1 = ap_private<_AP_W1, _AP_S1>::_AP_N;
+    int i = 0;
+    for (; i < _AP_N1; ++i) pVal[i] = that.get_pVal(i);
+    for (; i < _AP_N; ++i) pVal[i] = 0;
+    clearUnusedBits();
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE void cpZextOrTrunc(const ap_private<_AP_W1, _AP_S1>& that) {
+    if (BitWidth > _AP_W1)
+      cpZext(that);
+    else {
+      for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i);
+      clearUnusedBits();
+    }
+  }
+
+  template <int _AP_W1, bool _AP_S1>
+  INLINE void cpSextOrTrunc(const ap_private<_AP_W1, _AP_S1>& that) {
+    if (BitWidth > _AP_W1)
+      cpSext(that);
+    else {
+      for (int i = 0; i < _AP_N; ++i) pVal[i] = that.get_pVal(i);
+      clearUnusedBits();
+    }
+  }
+
+  /// @}
+  /// @name Value Characterization Functions
+  /// @{
+
+  /// @returns the total number of bits.
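+  /// e.g., an ap_private<72, true> reports 72 here regardless of its value.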
+ INLINE uint32_t getBitWidth() const { return BitWidth; } + + /// Here one word's bitwidth equals to that of uint64_t. + /// @returns the number of words to hold the integer value of this ap_private. + /// @brief Get the number of words. + INLINE uint32_t getNumWords() const { + return (BitWidth + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD; + } + + /// This function returns the number of active bits which is defined as the + /// bit width minus the number of leading zeros. This is used in several + /// computations to see how "wide" the value is. + /// @brief Compute the number of active bits in the value + INLINE uint32_t getActiveBits() const { + uint32_t bits = BitWidth - countLeadingZeros(); + return bits ? bits : 1; + } + + /// This method attempts to return the value of this ap_private as a zero + /// extended + /// uint64_t. The bitwidth must be <= 64 or the value must fit within a + /// uint64_t. Otherwise an assertion will result. + /// @brief Get zero extended value + INLINE uint64_t getZExtValue() const { + assert(getActiveBits() <= 64 && "Too many bits for uint64_t"); + return *pVal; + } + + /// This method attempts to return the value of this ap_private as a sign + /// extended + /// int64_t. The bit width must be <= 64 or the value must fit within an + /// int64_t. Otherwise an assertion will result. + /// @brief Get sign extended value + INLINE int64_t getSExtValue() const { + assert(getActiveBits() <= 64 && "Too many bits for int64_t"); + return int64_t(pVal[0]); + } + + /// This method determines how many bits are required to hold the ap_private + /// equivalent of the string given by \p str of length \p slen. + /// @brief Get bits required for string value. + INLINE static uint32_t getBitsNeeded(const char* str, uint32_t slen, + uint8_t radix) { + assert(str != 0 && "Invalid value string"); + assert(slen > 0 && "Invalid string length"); + + // Each computation below needs to know if its negative + uint32_t isNegative = str[0] == '-'; + if (isNegative) { + slen--; + str++; + } + // For radixes of power-of-two values, the bits required is accurately and + // easily computed + if (radix == 2) return slen + isNegative; + if (radix == 8) return slen * 3 + isNegative; + if (radix == 16) return slen * 4 + isNegative; + + // Otherwise it must be radix == 10, the hard case + assert(radix == 10 && "Invalid radix"); + + // Convert to the actual binary value. + // ap_private<_AP_W, _AP_S> tmp(sufficient, str, slen, radix); + + // Compute how many bits are required. + // return isNegative + tmp.logBase2() + 1; + return isNegative + slen * 4; + } + + /// countLeadingZeros - This function is an ap_private version of the + /// countLeadingZeros_{32,64} functions in MathExtras.h. It counts the number + /// of zeros from the most significant bit to the first one bit. + /// @returns BitWidth if the value is zero. + /// @returns the number of zeros from the most significant bit to the first + /// one bits. + INLINE uint32_t countLeadingZeros() const { + enum { + msw_bits = (BitWidth % APINT_BITS_PER_WORD) + ? 
(BitWidth % APINT_BITS_PER_WORD)
+                     : APINT_BITS_PER_WORD,
+      excessBits = APINT_BITS_PER_WORD - msw_bits
+    };
+    uint32_t Count = ap_private_ops::CountLeadingZeros_64(pVal[_AP_N - 1]);
+    if (Count >= excessBits) Count -= excessBits;
+    if (!pVal[_AP_N - 1]) {
+      for (int i = _AP_N - 1; i; --i) {
+        if (!pVal[i - 1])
+          Count += APINT_BITS_PER_WORD;
+        else {
+          Count += ap_private_ops::CountLeadingZeros_64(pVal[i - 1]);
+          break;
+        }
+      }
+    }
+    return Count;
+  }
+
+  /// countLeadingOnes - This function counts the number of contiguous 1 bits
+  /// in the high order bits. The count stops when the first 0 bit is reached.
+  /// @returns 0 if the high order bit is not set
+  /// @returns the number of 1 bits from the most significant to the least
+  /// @brief Count the number of leading one bits.
+  INLINE uint32_t countLeadingOnes() const {
+    if (isSingleWord())
+      return countLeadingOnes_64(get_VAL(), APINT_BITS_PER_WORD - BitWidth);
+
+    uint32_t highWordBits = BitWidth % APINT_BITS_PER_WORD;
+    uint32_t shift =
+        (highWordBits == 0 ? 0 : APINT_BITS_PER_WORD - highWordBits);
+    int i = _AP_N - 1;
+    uint32_t Count = countLeadingOnes_64(get_pVal(i), shift);
+    if (Count == highWordBits) {
+      for (i--; i >= 0; --i) {
+        if (get_pVal(i) == ~0ULL)
+          Count += APINT_BITS_PER_WORD;
+        else {
+          Count += countLeadingOnes_64(get_pVal(i), 0);
+          break;
+        }
+      }
+    }
+    return Count;
+  }
+
+  /// countTrailingZeros - This function is an ap_private version of the
+  /// countTrailingZeros_{32,64} functions in MathExtras.h. It counts
+  /// the number of zeros from the least significant bit to the first set bit.
+  /// @returns BitWidth if the value is zero.
+  /// @returns the number of zeros from the least significant bit to the first
+  /// one bit.
+  /// @brief Count the number of trailing zero bits.
+  INLINE uint32_t countTrailingZeros() const {
+    uint32_t Count = 0;
+    uint32_t i = 0;
+    for (; i < _AP_N && get_pVal(i) == 0; ++i) Count += APINT_BITS_PER_WORD;
+    if (i < _AP_N) Count += ap_private_ops::CountTrailingZeros_64(get_pVal(i));
+    return AESL_std::min(Count, BitWidth);
+  }
+  /// countPopulation - This function is an ap_private version of the
+  /// countPopulation_{32,64} functions in MathExtras.h. It counts the number
+  /// of 1 bits in the ap_private value.
+  /// @returns 0 if the value is zero.
+  /// @returns the number of set bits.
+  /// @brief Count the number of bits set.
+  INLINE uint32_t countPopulation() const {
+    uint32_t Count = 0;
+    for (int i = 0; i < _AP_N - 1; ++i)
+      Count += ap_private_ops::CountPopulation_64(pVal[i]);
+    Count += ap_private_ops::CountPopulation_64(pVal[_AP_N - 1] & mask);
+    return Count;
+  }
+
+  /// @}
+  /// @name Conversion Functions
+  /// @{
+
+  /// This is used internally to convert an ap_private to a string.
+  /// @brief Converts an ap_private to a std::string
+  INLINE std::string toString(uint8_t radix, bool wantSigned) const;
+
+  /// Considers the ap_private to be unsigned and converts it into a string in
+  /// the radix given. The radix can be 2, 8, 10 or 16.
+  /// @returns a character interpretation of the ap_private
+  /// @brief Convert unsigned ap_private to string representation.
+  INLINE std::string toStringUnsigned(uint8_t radix = 10) const {
+    return toString(radix, false);
+  }
+
+  /// Considers the ap_private to be signed and converts it into a string in
+  /// the radix given. The radix can be 2, 8, 10 or 16.
+  /// @returns a character interpretation of the ap_private
+  /// @brief Convert signed ap_private to string representation.
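+  /// e.g., in radix 10 a value whose sign bit is set is rendered with a
+  /// leading '-', while toStringUnsigned() prints the same bits as a
+  /// non-negative number.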
+  INLINE std::string toStringSigned(uint8_t radix = 10) const {
+    return toString(radix, true);
+  }
+
+  /// @brief Converts this ap_private to a double value.
+  INLINE double roundToDouble(bool isSigned) const {
+    // Handle the simple case where the value is contained in one uint64_t.
+    if (isSingleWord() || getActiveBits() <= APINT_BITS_PER_WORD) {
+      uint64_t val = pVal[0];
+      if (isSigned) {
+        int64_t sext = ((int64_t(val)) << (64 - BitWidth)) >> (64 - BitWidth);
+        return double(sext);
+      } else
+        return double(val);
+    }
+
+    // Determine if the value is negative.
+    bool isNeg = isSigned ? (*this)[BitWidth - 1] : false;
+
+    // Construct the absolute value if we're negative.
+    ap_private<_AP_W, _AP_S> Tmp(isNeg ? -(*this) : (*this));
+
+    // Figure out how many bits we're using.
+    uint32_t n = Tmp.getActiveBits();
+
+    // The exponent (without bias normalization) is just the number of bits
+    // we are using. Note that the sign bit is gone since we constructed the
+    // absolute value.
+    uint64_t exp = n;
+
+    // Return infinity for exponent overflow.
+    if (exp > 1023) {
+      if (!isSigned || !isNeg)
+        return std::numeric_limits<double>::infinity();
+      else
+        return -std::numeric_limits<double>::infinity();
+    }
+    exp += 1023; // Increment for 1023 bias
+
+    // Number of bits in mantissa is 52. To obtain the mantissa value, we must
+    // extract the high 52 bits from the correct words in pVal.
+    uint64_t mantissa;
+    unsigned hiWord = whichWord(n - 1);
+    if (hiWord == 0) {
+      mantissa = Tmp.get_pVal(0);
+      if (n > 52)
+        (mantissa) >>= (n - 52); // shift down, we want the top 52 bits.
+    } else {
+      assert(hiWord > 0 && "High word is negative?");
+      uint64_t hibits = (Tmp.get_pVal(hiWord))
+                        << (52 - n % APINT_BITS_PER_WORD);
+      uint64_t lobits =
+          (Tmp.get_pVal(hiWord - 1)) >> (11 + n % APINT_BITS_PER_WORD);
+      mantissa = hibits | lobits;
+    }
+
+    // The leading bit of mantissa is implicit, so get rid of it.
+    uint64_t sign = isNeg ? (1ULL << (APINT_BITS_PER_WORD - 1)) : 0;
+    union {
+      double __D;
+      uint64_t __I;
+    } __T;
+    __T.__I = sign | ((exp) << 52) | mantissa;
+    return __T.__D;
+  }
+
+  /// @brief Converts this unsigned ap_private to a double value.
+  INLINE double roundToDouble() const { return roundToDouble(false); }
+
+  /// @brief Converts this signed ap_private to a double value.
+  INLINE double signedRoundToDouble() const { return roundToDouble(true); }
+
+  /// The conversion does not do a translation from integer to double, it just
+  /// re-interprets the bits as a double. Note that it is valid to do this on
+  /// any bit width. Exactly 64 bits will be translated.
+  /// @brief Converts ap_private bits to a double
+  INLINE double bitsToDouble() const {
+    union {
+      uint64_t __I;
+      double __D;
+    } __T;
+    __T.__I = pVal[0];
+    return __T.__D;
+  }
+
+  /// The conversion does not do a translation from integer to float, it just
+  /// re-interprets the bits as a float. Note that it is valid to do this on
+  /// any bit width. Exactly 32 bits will be translated.
+  /// @brief Converts ap_private bits to a float
+  INLINE float bitsToFloat() const {
+    union {
+      uint32_t __I;
+      float __F;
+    } __T;
+    __T.__I = uint32_t(pVal[0]);
+    return __T.__F;
+  }
+
+  /// The conversion does not do a translation from double to integer, it just
+  /// re-interprets the bits of the double. Note that it is valid to do this on
+  /// any bit width but bits from V may get truncated.
+  /// @brief Converts a double to ap_private bits.
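+  /// For example, doubleToBits(1.0) stores the raw IEEE-754 pattern
+  /// 0x3FF0000000000000, and a following bitsToDouble() returns 1.0 again;
+  /// roundToDouble(), by contrast, converts the stored integer numerically.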
+  INLINE ap_private& doubleToBits(double __V) {
+    union {
+      uint64_t __I;
+      double __D;
+    } __T;
+    __T.__D = __V;
+    pVal[0] = __T.__I;
+    return *this;
+  }
+
+  /// The conversion does not do a translation from float to integer, it just
+  /// re-interprets the bits of the float. Note that it is valid to do this on
+  /// any bit width but bits from V may get truncated.
+  /// @brief Converts a float to ap_private bits.
+  INLINE ap_private& floatToBits(float __V) {
+    union {
+      uint32_t __I;
+      float __F;
+    } __T;
+    __T.__F = __V;
+    pVal[0] = __T.__I;
+    return *this;
+  }
+
+  // Reduce operation
+  //-----------------------------------------------------------
+  INLINE bool and_reduce() const { return isMaxValue(); }
+
+  INLINE bool nand_reduce() const { return isMinValue(); }
+
+  INLINE bool or_reduce() const { return (bool)countPopulation(); }
+
+  INLINE bool nor_reduce() const { return countPopulation() == 0; }
+
+  INLINE bool xor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? true : false;
+  }
+
+  INLINE bool xnor_reduce() const {
+    unsigned int i = countPopulation();
+    return (i % 2) ? false : true;
+  }
+
+  INLINE std::string to_string(uint8_t radix = 16, bool sign = false) const {
+    return toString(radix, radix == 10 ? _AP_S : sign);
+  }
+}; // End of class ap_private <_AP_W, _AP_S, false>
+
+namespace ap_private_ops {
+
+enum { APINT_BITS_PER_WORD = 64 };
+
+template <int _AP_W, bool _AP_S>
+INLINE bool operator==(uint64_t V1, const ap_private<_AP_W, _AP_S>& V2) {
+  return V2 == V1;
+}
+
+template <int _AP_W, bool _AP_S>
+INLINE bool operator!=(uint64_t V1, const ap_private<_AP_W, _AP_S>& V2) {
+  return V2 != V1;
+}
+
+template <int index, int _AP_W, bool _AP_S>
+INLINE bool get(const ap_private<_AP_W, _AP_S>& a) {
+  static const uint64_t mask = 1ULL << (index & 0x3f);
+  return ((mask & a.get_pVal((index) >> 6)) != 0);
+}
+
+template <int lsb_index, int msb_index, int _AP_W, bool _AP_S>
+INLINE void set(ap_private<_AP_W, _AP_S>& a,
+                const ap_private<_AP_W, _AP_S>& mark1 = 0,
+                const ap_private<_AP_W, _AP_S>& mark2 = 0) {
+  enum {
+    APINT_BITS_PER_WORD = 64,
+    lsb_word = lsb_index / APINT_BITS_PER_WORD,
+    msb_word = msb_index / APINT_BITS_PER_WORD,
+    msb = msb_index % APINT_BITS_PER_WORD,
+    lsb = lsb_index % APINT_BITS_PER_WORD
+  };
+  if (msb_word == lsb_word) {
+    const uint64_t mask = ~0ULL >>
+                          (lsb) << (APINT_BITS_PER_WORD - msb + lsb - 1) >>
+                          (APINT_BITS_PER_WORD - msb - 1);
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) | mask);
+    a.get_pVal(msb_word) |= mask;
+  } else {
+    const uint64_t lsb_mask = ~0ULL >> (lsb) << (lsb);
+    const uint64_t msb_mask = ~0ULL << (APINT_BITS_PER_WORD - msb - 1) >>
+                              (APINT_BITS_PER_WORD - msb - 1);
+    // a.set_pVal(lsb_word, a.get_pVal(lsb_word) | lsb_mask);
+    a.get_pVal(lsb_word) |= lsb_mask;
+    for (int i = lsb_word + 1; i < msb_word; i++) {
+      a.set_pVal(i, ~0ULL);
+      // a.get_pVal(i)=0;
+    }
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) | msb_mask);
+    a.get_pVal(msb_word) |= msb_mask;
+  }
+  a.clearUnusedBits();
+}
+
+template <int lsb_index, int msb_index, int _AP_W, bool _AP_S>
+INLINE void clear(ap_private<_AP_W, _AP_S>& a,
+                  const ap_private<_AP_W, _AP_S>& mark1 = 0,
+                  const ap_private<_AP_W, _AP_S>& mark2 = 0) {
+  enum {
+    APINT_BITS_PER_WORD = 64,
+    lsb_word = lsb_index / APINT_BITS_PER_WORD,
+    msb_word = msb_index / APINT_BITS_PER_WORD,
+    msb = msb_index % APINT_BITS_PER_WORD,
+    lsb = lsb_index % APINT_BITS_PER_WORD
+  };
+  if (msb_word == lsb_word) {
+    const uint64_t mask =
+        ~(~0ULL >> (lsb) << (APINT_BITS_PER_WORD - msb + lsb - 1) >>
+          (APINT_BITS_PER_WORD - msb - 1));
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) & mask);
+    a.get_pVal(msb_word) &= mask;
+  } else {
+    const uint64_t lsb_mask = ~(~0ULL >> (lsb) << (lsb));
+    const uint64_t msb_mask = ~(~0ULL << (APINT_BITS_PER_WORD - msb - 1) >>
+                                (APINT_BITS_PER_WORD - msb - 1));
+    // a.set_pVal(lsb_word, a.get_pVal(lsb_word) & lsb_mask);
+    a.get_pVal(lsb_word) &= lsb_mask;
+    for (int i = lsb_word + 1; i < msb_word; i++) {
+      // a.set_pVal(i, 0);
+      a.get_pVal(i) = 0;
+    }
+    // a.set_pVal(msb_word, a.get_pVal(msb_word) & msb_mask);
+    a.get_pVal(msb_word) &= msb_mask;
+  }
+  a.clearUnusedBits();
+}
+
+template <int index, int _AP_W, bool _AP_S>
+INLINE void set(ap_private<_AP_W, _AP_S>& a,
+                const ap_private<_AP_W, _AP_S>& mark = 0) {
+  enum { APINT_BITS_PER_WORD = 64, word = index / APINT_BITS_PER_WORD };
+  static const uint64_t mask = 1ULL << (index % APINT_BITS_PER_WORD);
+  // a.set_pVal(word, a.get_pVal(word) | mask);
+  a.get_pVal(word) |= mask;
+  a.clearUnusedBits();
+}
+
+template <int index, int _AP_W, bool _AP_S>
+INLINE void clear(ap_private<_AP_W, _AP_S>& a,
+                  const ap_private<_AP_W, _AP_S>& mark = 0) {
+  enum { APINT_BITS_PER_WORD = 64, word = index / APINT_BITS_PER_WORD };
+  static const uint64_t mask = ~(1ULL << (index % APINT_BITS_PER_WORD));
+  // a.set_pVal(word, a.get_pVal(word) & mask);
+  a.get_pVal(word) &= mask;
+  a.clearUnusedBits();
+}
+
+} // End of ap_private_ops namespace
+
+template <int _AP_W, bool _AP_S>
+INLINE std::string ap_private<_AP_W, _AP_S, false>::toString(
+    uint8_t radix, bool wantSigned) const {
+  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2) &&
+         "Radix should be 2, 8, 10, or 16!");
+  static const char* digits[] = {"0", "1", "2", "3", "4", "5", "6", "7",
+                                 "8", "9", "A", "B", "C", "D", "E", "F"};
+  std::string result;
+
+  if (radix != 10) {
+    // For the 2, 8 and 16 bit cases, we can just shift instead of divide
+    // because the number of bits per digit (1, 3 and 4 respectively) divides
+    // equally. We just shift until the value is zero.
+
+    // First, check for a zero value and just short circuit the logic below.
+    if (*this == (uint64_t)(0))
+      result = "0";
+    else {
+      ap_private<_AP_W, false> tmp(*this);
+      size_t insert_at = 0;
+      bool leading_zero = true;
+      if (wantSigned && isNegative()) {
+        // They want to print the signed version and it is a negative value.
+        // Flip the bits and add one to turn it into the equivalent positive
+        // value and put a '-' in the result.
+        tmp.flip();
+        tmp++;
+        tmp.clearUnusedBitsToZero();
+        result = "-";
+        insert_at = 1;
+        leading_zero = false;
+      }
+      switch (radix) {
+        case 2:
+          result += "0b";
+          break;
+        case 8:
+          result += "0o";
+          break;
+        case 16:
+          result += "0x";
+          break;
+        default:
+          assert("invalid radix" && 0);
+      }
+      insert_at += 2;
+      // Just shift tmp right for each digit width until it becomes zero
+      uint32_t shift = (radix == 16 ? 4 : (radix == 8 ? 3 : 1));
+      uint64_t mask = radix - 1;
+      ap_private<_AP_W, false> zero(0);
+      unsigned bits = 0;
+      while (tmp.ne(zero)) {
+        uint64_t digit = tmp.get_VAL() & mask;
+        result.insert(insert_at, digits[digit]);
+        tmp = tmp.lshr(shift);
+        ++bits;
+      }
+      bits *= shift;
+      if (bits < _AP_W && leading_zero) result.insert(insert_at, digits[0]);
+    }
+    return result;
+  }
+
+  ap_private<_AP_W, false> tmp(*this);
+  ap_private<_AP_W, false> divisor(radix);
+  ap_private<_AP_W, false> zero(0);
+  size_t insert_at = 0;
+  if (wantSigned && isNegative()) {
+    // They want to print the signed version and it is a negative value.
+    // Flip the bits and add one to turn it into the equivalent positive
+    // value and put a '-' in the result.
+ tmp.flip(); + tmp++; + tmp.clearUnusedBitsToZero(); + result = "-"; + insert_at = 1; + } + if (tmp == ap_private<_AP_W, false>(0)) + result = "0"; + else + while (tmp.ne(zero)) { + ap_private<_AP_W, false> APdigit(0); + ap_private<_AP_W, false> tmp2(0); + ap_private_ops::divide(tmp, tmp.getNumWords(), divisor, + divisor.getNumWords(), &tmp2, &APdigit); + uint64_t digit = APdigit.getZExtValue(); + assert(digit < radix && "divide failed"); + result.insert(insert_at, digits[digit]); + tmp = tmp2; + } + + return result; +} // End of ap_private<_AP_W, _AP_S, false>::toString() + +template +std::ostream &operator<<(std::ostream &os, const ap_private<_AP_W, _AP_S> &x) { + std::ios_base::fmtflags ff = std::cout.flags(); + if (ff & std::cout.hex) { + os << x.toString(16, false); // don't print sign + } else if (ff & std::cout.oct) { + os << x.toString(8, false); // don't print sign + } else { + os << x.toString(10, _AP_S); + } + return os; +} + +// ------------------------------------------------------------ // +// XXX moved here from ap_int_sim.h XXX // +// ------------------------------------------------------------ // + +/// Concatination reference. +/// Proxy class which allows concatination to be used as rvalue(for reading) and +/// lvalue(for writing) +// ---------------------------------------------------------------- +// template +// struct ap_concat_ref { +//#ifdef _MSC_VER +//#pragma warning(disable : 4521 4522) +//#endif +// enum { +// _AP_WR = _AP_W1 + _AP_W2, +// }; +// _AP_T1& mbv1; +// _AP_T2& mbv2; +// +// INLINE ap_concat_ref(const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& +// ref) +// : mbv1(ref.mbv1), mbv2(ref.mbv2) {} +// +// INLINE ap_concat_ref(_AP_T1& bv1, _AP_T2& bv2) : mbv1(bv1), mbv2(bv2) {} +// +// template +// INLINE ap_concat_ref& operator=(const ap_private<_AP_W3, _AP_S3>& val) { +// ap_private<_AP_W1 + _AP_W2, false> vval(val); +// int W_ref1 = mbv1.length(); +// int W_ref2 = mbv2.length(); +// ap_private<_AP_W1, false> mask1(-1); +// mask1 >>= _AP_W1 - W_ref1; +// ap_private<_AP_W2, false> mask2(-1); +// mask2 >>= _AP_W2 - W_ref2; +// mbv1.set(ap_private<_AP_W1, false>((vval >> W_ref2) & mask1)); +// mbv2.set(ap_private<_AP_W2, false>(vval & mask2)); +// return *this; +// } +// +// INLINE ap_concat_ref& operator=(unsigned long long val) { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); +// return operator=(tmpVal); +// } +// +// template +// INLINE ap_concat_ref& operator=( +// const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); +// return operator=(tmpVal); +// } +// +// INLINE ap_concat_ref& operator=( +// const ap_concat_ref<_AP_W1, _AP_T1, _AP_W2, _AP_T2>& val) { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); +// return operator=(tmpVal); +// } +// +// template +// INLINE ap_concat_ref& operator=(const _private_bit_ref<_AP_W3, _AP_S3>& +// val) { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); +// return operator=(tmpVal); +// } +// +// template +// INLINE ap_concat_ref& operator=(const _private_range_ref<_AP_W3, _AP_S3>& +// val) { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal(val); +// return operator=(tmpVal); +// } +// +// template +// INLINE ap_concat_ref& operator=( +// const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) +// { +// return operator=((const ap_private<_AP_W3, false>)(val)); +// } +// +// template +// INLINE ap_concat_ref& operator=( +// const ap_fixed_base<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& +// val) { +// return 
operator=(val.to_ap_private()); +// } +// +// template +// INLINE ap_concat_ref& operator=( +// const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3>& val) { +// return operator=((unsigned long long)(bool)(val)); +// } +// +// INLINE operator ap_private<_AP_WR, false>() const { return get(); } +// +// INLINE operator unsigned long long() const { return get().to_uint64(); } +// +// template +// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// _private_range_ref<_AP_W3, _AP_S3> > +// operator,(const _private_range_ref<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// _private_range_ref<_AP_W3, _AP_S3> >( +// *this, const_cast<_private_range_ref<_AP_W3, _AP_S3>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_private<_AP_W3, _AP_S3> +// > +// operator,(ap_private<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// ap_private<_AP_W3, _AP_S3> >(*this, a2); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, ap_private<_AP_W3, _AP_S3> +// > +// operator,(const ap_private<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3, +// ap_private<_AP_W3, _AP_S3> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, 1, _private_bit_ref<_AP_W3, +// _AP_S3> > +// operator,(const _private_bit_ref<_AP_W3, _AP_S3> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, 1, _private_bit_ref<_AP_W3, +// _AP_S3> >( +// *this, const_cast<_private_bit_ref<_AP_W3, _AP_S3>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, +// ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> > +// operator,(const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> &a2) { +// return ap_concat_ref<_AP_WR, ap_concat_ref, _AP_W3 + _AP_W4, +// ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_WR, ap_concat_ref, _AP_W3, +// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> > +// operator,( +// const af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> &a2) +// { +// return ap_concat_ref< +// _AP_WR, ap_concat_ref, _AP_W3, +// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( +// *this, +// const_cast< +// af_range_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, +// _AP_N3>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_WR, ap_concat_ref, 1, +// af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> +// > +// operator,(const af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, +// _AP_N3> +// &a2) { +// return ap_concat_ref< +// _AP_WR, ap_concat_ref, 1, +// af_bit_ref<_AP_W3, _AP_I3, _AP_S3, _AP_Q3, _AP_O3, _AP_N3> >( +// *this, +// const_cast&>( +// a2)); +// } +// +// template +// INLINE ap_private operator&( +// const ap_private<_AP_W3, _AP_S3>& a2) { +// return get() & a2; +// } +// +// template +// INLINE ap_private operator|( +// const ap_private<_AP_W3, _AP_S3>& a2) { +// return get() | a2; +// } +// +// template +// INLINE ap_private operator^( +// const ap_private<_AP_W3, _AP_S3>& a2) { +// return ap_private(get() ^ a2); +// } +// +// INLINE const ap_private<_AP_WR, false> get() const { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal = +// ap_private<_AP_W1 + _AP_W2, false>(mbv1.get()); +// ap_private<_AP_W1 + _AP_W2, false> tmpVal2 = +// ap_private<_AP_W1 + _AP_W2, false>(mbv2.get()); +// int W_ref2 = mbv2.length(); +// tmpVal <<= 
W_ref2; +// tmpVal |= tmpVal2; +// return tmpVal; +// } +// +// INLINE const ap_private<_AP_WR, false> get() { +// ap_private<_AP_W1 + _AP_W2, false> tmpVal = +// ap_private<_AP_W1 + _AP_W2, false>(mbv1.get()); +// ap_private<_AP_W1 + _AP_W2, false> tmpVal2 = +// ap_private<_AP_W1 + _AP_W2, false>(mbv2.get()); +// int W_ref2 = mbv2.length(); +// tmpVal <<= W_ref2; +// tmpVal |= tmpVal2; +// return tmpVal; +// } +// +// template +// INLINE void set(const ap_private<_AP_W3, false>& val) { +// ap_private<_AP_W1 + _AP_W2, false> vval(val); +// int W_ref1 = mbv1.length(); +// int W_ref2 = mbv2.length(); +// ap_private<_AP_W1, false> mask1(-1); +// mask1 >>= _AP_W1 - W_ref1; +// ap_private<_AP_W2, false> mask2(-1); +// mask2 >>= _AP_W2 - W_ref2; +// mbv1.set(ap_private<_AP_W1, false>((vval >> W_ref2) & mask1)); +// mbv2.set(ap_private<_AP_W2, false>(vval & mask2)); +// } +// +// INLINE int length() const { return mbv1.length() + mbv2.length(); } +// +// INLINE std::string to_string(uint8_t radix = 2) const { +// return get().to_string(radix); +// } +//}; // struct ap_concat_ref. + +/// Range(slice) reference +/// Proxy class, which allows part selection to be used as rvalue(for reading) +/// and lvalue(for writing) +//------------------------------------------------------------ +template +struct _private_range_ref { +#ifdef _MSC_VER +#pragma warning(disable : 4521 4522) +#endif + ap_private<_AP_W, _AP_S>& d_bv; + int l_index; + int h_index; + + public: + /// copy ctor. + INLINE _private_range_ref(const _private_range_ref<_AP_W, _AP_S>& ref) + : d_bv(ref.d_bv), l_index(ref.l_index), h_index(ref.h_index) {} + + /// direct ctor. + INLINE _private_range_ref(ap_private<_AP_W, _AP_S>* bv, int h, int l) + : d_bv(*bv), l_index(l), h_index(h) { + _AP_WARNING(h < 0 || l < 0, + "Higher bound (%d) and lower bound (%d) cannot be " + "negative.", + h, l); + _AP_WARNING(h >= _AP_W || l >= _AP_W, + "Higher bound (%d) or lower bound (%d) out of range (%d).", h, l, + _AP_W); + } + + /// compound or assignment. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator|=( + const _private_range_ref<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), + "Bitsize mismach for ap_private<>.range() &= " + "ap_private<>.range()."); + this->d_bv |= ref.d_bv; + return *this; + } + + /// compound or assignment with root type. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator|=( + const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index + 1) != _AP_W2, + "Bitsize mismach for ap_private<>.range() |= _AP_ROOT_TYPE<>."); + this->d_bv |= ref.V; + return *this; + } + + /// compound and assignment. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator&=( + const _private_range_ref<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), + "Bitsize mismach for ap_private<>.range() &= " + "ap_private<>.range()."); + this->d_bv &= ref.d_bv; + return *this; + }; + + /// compound and assignment with root type. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator&=( + const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index + 1) != _AP_W2, + "Bitsize mismach for ap_private<>.range() &= _AP_ROOT_TYPE<>."); + this->d_bv &= ref.V; + return *this; + } + + /// compound xor assignment. 
+ template + INLINE _private_range_ref<_AP_W, _AP_S>& operator^=( + const _private_range_ref<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index) != (ref.h_index - ref.l_index), + "Bitsize mismach for ap_private<>.range() ^= " + "ap_private<>.range()."); + this->d_bv ^= ref.d_bv; + return *this; + }; + + /// compound xor assignment with root type. + template + INLINE _private_range_ref<_AP_W, _AP_S>& operator^=( + const _AP_ROOT_TYPE<_AP_W2, _AP_S2>& ref) { + _AP_WARNING((h_index - l_index + 1) != _AP_W2, + "Bitsize mismach for ap_private<>.range() ^= _AP_ROOT_TYPE<>."); + this->d_bv ^= ref.V; + return *this; + } + + /// @name convertors. + // @{ + INLINE operator ap_private<_AP_W, false>() const { + ap_private<_AP_W, false> val(0); + if (h_index >= l_index) { + if (_AP_W > 64) { + val = d_bv; + ap_private<_AP_W, false> mask(-1); + mask >>= _AP_W - (h_index - l_index + 1); + val >>= l_index; + val &= mask; + } else { + const static uint64_t mask = (~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0)); + val = (d_bv >> l_index) & (mask >> (_AP_W - (h_index - l_index + 1))); + } + } else { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + if ((d_bv)[j]) val.set(i); + } + return val; + } + + INLINE operator unsigned long long() const { return to_uint64(); } + // @} + + template + INLINE _private_range_ref& operator=(const ap_private<_AP_W2, _AP_S2>& val) { + ap_private<_AP_W, false> vval = ap_private<_AP_W, false>(val); + if (l_index > h_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + (vval)[i] ? d_bv.set(j) : d_bv.clear(j); + } else { + if (_AP_W > 64) { + ap_private<_AP_W, false> mask(-1); + if (l_index > 0) { + mask <<= l_index; + vval <<= l_index; + } + if (h_index < _AP_W - 1) { + ap_private<_AP_W, false> mask2(-1); + mask2 >>= _AP_W - h_index - 1; + mask &= mask2; + vval &= mask2; + } + mask.flip(); + d_bv &= mask; + d_bv |= vval; + } else { + unsigned shift = 64 - _AP_W; + uint64_t mask = ~0ULL >> (shift); + if (l_index > 0) { + vval = mask & vval << l_index; + mask = mask & mask << l_index; + } + if (h_index < _AP_W - 1) { + uint64_t mask2 = mask; + mask2 >>= (_AP_W - h_index - 1); + mask &= mask2; + vval &= mask2; + } + mask = ~mask; + d_bv &= mask; + d_bv |= vval; + } + } + return *this; + } // operator=(const ap_private<>&) + + INLINE _private_range_ref& operator=(unsigned long long val) { + const ap_private<_AP_W, _AP_S> vval = val; + return operator=(vval); + } + + template + INLINE _private_range_ref& operator=( + const _private_bit_ref<_AP_W2, _AP_S2>& val) { + return operator=((unsigned long long)(bool)val); + } + + template + INLINE _private_range_ref& operator=( + const _private_range_ref<_AP_W2, _AP_S2>& val) { + const ap_private<_AP_W, false> tmpVal(val); + return operator=(tmpVal); + } + +// template +// INLINE _private_range_ref& operator=( +// const ap_concat_ref<_AP_W3, _AP_T3, _AP_W4, _AP_T4>& val) { +// const ap_private<_AP_W, false> tmpVal(val); +// return operator=(tmpVal); +// } + + // TODO from ap_int_base, ap_bit_ref and ap_range_ref. 
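+  // Example of the masked update performed by operator= above: assigning
+  // 0xF to bits 5..2 of an 8-bit value that is currently 0 shifts the new
+  // bits to the range's position and merges them under a mask, yielding
+  // 0b00111100 (0x3C); bits outside the selected range keep their previous
+  // contents.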
+ + template + INLINE _private_range_ref& operator=( + const ap_fixed_base<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(val.to_ap_int_base().V); + } + + template + INLINE _private_range_ref& operator=( + const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=(val.operator ap_int_base<_AP_W2, false>().V); + } + + template + INLINE _private_range_ref& operator=( + const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>& val) { + return operator=((unsigned long long)(bool)val); + } + +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// _private_range_ref<_AP_W2, _AP_S2> >( +// *this, const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// ap_private<_AP_W2, _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2, +// ap_private<_AP_W2, _AP_S2> >(*this, a2); +// } +// +// INLINE +// ap_concat_ref<_AP_W, _private_range_ref, _AP_W, ap_private<_AP_W, _AP_S> > +// operator,(ap_private<_AP_W, _AP_S>& a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W, +// ap_private<_AP_W, _AP_S> >(*this, a2); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, 1, +// _private_bit_ref<_AP_W2, _AP_S2> > +// operator,(const _private_bit_ref<_AP_W2, _AP_S2> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// *this, const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<_AP_W, _private_range_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) { +// return ap_concat_ref<_AP_W, _private_range_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// *this, const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// _AP_W, _private_range_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,( +// const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> &a2) { +// return ap_concat_ref< +// _AP_W, _private_range_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// *this, +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<_AP_W, _private_range_ref, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> +// &a2) { +// return ap_concat_ref< +// _AP_W, _private_range_ref, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// *this, +// const_cast&>( +// a2)); +// } + + template + INLINE bool operator==(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs == rhs; + } + + template + INLINE bool operator!=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs != rhs; + } + + template + INLINE bool operator>(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + 
ap_private<_AP_W2, false> rhs = op2.get(); + return lhs > rhs; + } + + template + INLINE bool operator>=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs >= rhs; + } + + template + INLINE bool operator<(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs < rhs; + } + + template + INLINE bool operator<=(const _private_range_ref<_AP_W2, _AP_S2>& op2) { + ap_private<_AP_W, false> lhs = get(); + ap_private<_AP_W2, false> rhs = op2.get(); + return lhs <= rhs; + } + + template + INLINE void set(const ap_private<_AP_W2, false>& val) { + ap_private<_AP_W, _AP_S> vval = val; + if (l_index > h_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + (vval)[i] ? d_bv.set(j) : d_bv.clear(j); + } else { + if (_AP_W > 64) { + ap_private<_AP_W, _AP_S> mask(-1); + if (l_index > 0) { + ap_private<_AP_W, false> mask1(-1); + mask1 >>= _AP_W - l_index; + mask1.flip(); + mask = mask1; + // vval&=mask1; + vval <<= l_index; + } + if (h_index < _AP_W - 1) { + ap_private<_AP_W, false> mask2(-1); + mask2 <<= h_index + 1; + mask2.flip(); + mask &= mask2; + vval &= mask2; + } + mask.flip(); + d_bv &= mask; + d_bv |= vval; + } else { + uint64_t mask = ~0ULL >> (64 - _AP_W); + if (l_index > 0) { + uint64_t mask1 = mask; + mask1 = mask & (mask1 >> (_AP_W - l_index)); + vval = mask & (vval << l_index); + mask = ~mask1 & mask; + // vval&=mask1; + } + if (h_index < _AP_W - 1) { + uint64_t mask2 = ~0ULL >> (64 - _AP_W); + mask2 = mask & (mask2 << (h_index + 1)); + mask &= ~mask2; + vval &= ~mask2; + } + d_bv &= (~mask & (~0ULL >> (64 - _AP_W))); + d_bv |= vval; + } + } + } + + INLINE ap_private<_AP_W, false> get() const { + ap_private<_AP_W, false> val(0); + if (h_index < l_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + if ((d_bv)[j]) val.set(i); + } else { + val = d_bv; + val >>= l_index; + if (h_index < _AP_W - 1) { + if (_AP_W <= 64) { + const static uint64_t mask = + (~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0)); + val &= (mask >> (_AP_W - (h_index - l_index + 1))); + } else { + ap_private<_AP_W, false> mask(-1); + mask >>= _AP_W - (h_index - l_index + 1); + val &= mask; + } + } + } + return val; + } + + INLINE ap_private<_AP_W, false> get() { + ap_private<_AP_W, false> val(0); + if (h_index < l_index) { + for (int i = 0, j = l_index; j >= 0 && j >= h_index; j--, i++) + if ((d_bv)[j]) val.set(i); + } else { + val = d_bv; + val >>= l_index; + if (h_index < _AP_W - 1) { + if (_AP_W <= 64) { + static const uint64_t mask = ~0ULL >> (64 > _AP_W ? (64 - _AP_W) : 0); + return val &= ((mask) >> (_AP_W - (h_index - l_index + 1))); + } else { + ap_private<_AP_W, false> mask(-1); + mask >>= _AP_W - (h_index - l_index + 1); + val &= mask; + } + } + } + return val; + } + + INLINE int length() const { + return h_index >= l_index ? 
h_index - l_index + 1 : l_index - h_index + 1;
+  }
+
+  INLINE int to_int() const {
+    ap_private<_AP_W, false> val = get();
+    return val.to_int();
+  }
+
+  INLINE unsigned int to_uint() const {
+    ap_private<_AP_W, false> val = get();
+    return val.to_uint();
+  }
+
+  INLINE long to_long() const {
+    ap_private<_AP_W, false> val = get();
+    return val.to_long();
+  }
+
+  INLINE unsigned long to_ulong() const {
+    ap_private<_AP_W, false> val = get();
+    return val.to_ulong();
+  }
+
+  INLINE ap_slong to_int64() const {
+    ap_private<_AP_W, false> val = get();
+    return val.to_int64();
+  }
+
+  INLINE ap_ulong to_uint64() const {
+    ap_private<_AP_W, false> val = get();
+    return val.to_uint64();
+  }
+
+  INLINE std::string to_string(uint8_t radix = 2) const {
+    return get().to_string(radix);
+  }
+
+  INLINE bool and_reduce() {
+    bool ret = true;
+    bool reverse = l_index > h_index;
+    unsigned low = reverse ? h_index : l_index;
+    unsigned high = reverse ? l_index : h_index;
+    for (unsigned i = low; i != high; ++i) ret &= d_bv[i];
+    return ret;
+  }
+
+  INLINE bool or_reduce() {
+    bool ret = false;
+    bool reverse = l_index > h_index;
+    unsigned low = reverse ? h_index : l_index;
+    unsigned high = reverse ? l_index : h_index;
+    for (unsigned i = low; i != high; ++i) ret |= d_bv[i];
+    return ret;
+  }
+
+  INLINE bool xor_reduce() {
+    bool ret = false;
+    bool reverse = l_index > h_index;
+    unsigned low = reverse ? h_index : l_index;
+    unsigned high = reverse ? l_index : h_index;
+    for (unsigned i = low; i != high; ++i) ret ^= d_bv[i];
+    return ret;
+  }
+}; // struct _private_range_ref.
+
+/// Bit reference.
+/// Proxy class, which allows bit selection to be used as rvalue (for reading)
+/// and lvalue (for writing).
+//--------------------------------------------------------------
+template <int _AP_W, bool _AP_S>
+struct _private_bit_ref {
+#ifdef _MSC_VER
+#pragma warning(disable : 4521 4522)
+#endif
+  ap_private<_AP_W, _AP_S>& d_bv;
+  int d_index;
+
+ public:
+  // copy ctor.
+  INLINE _private_bit_ref(const _private_bit_ref<_AP_W, _AP_S>& ref)
+      : d_bv(ref.d_bv), d_index(ref.d_index) {}
+
+  // direct ctor.
+ INLINE _private_bit_ref(ap_private<_AP_W, _AP_S>& bv, int index = 0) + : d_bv(bv), d_index(index) { + _AP_WARNING(d_index < 0, "Index of bit vector (%d) cannot be negative.\n", + d_index); + _AP_WARNING(d_index >= _AP_W, + "Index of bit vector (%d) out of range (%d).\n", d_index, _AP_W); + } + + INLINE operator bool() const { return d_bv.get_bit(d_index); } + + INLINE bool to_bool() const { return operator bool(); } + + template + INLINE _private_bit_ref& operator=(const T& val) { + if (!!val) + d_bv.set(d_index); + else + d_bv.clear(d_index); + return *this; + } + +// template +// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2, ap_private<_AP_W2, +// _AP_S2> > +// operator,(ap_private<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, _AP_W2, ap_private<_AP_W2, +// _AP_S2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), a2); +// } +// +// template +// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2, +// _private_range_ref<_AP_W2, +// _AP_S2> > +// operator,(const _private_range_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, _AP_W2, +// _private_range_ref<_AP_W2, +// _AP_S2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast<_private_range_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref<_AP_W2, +// _AP_S2> > operator,( +// const _private_bit_ref<_AP_W2, _AP_S2> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, 1, +// _private_bit_ref<_AP_W2, _AP_S2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast<_private_bit_ref<_AP_W2, _AP_S2>&>(a2)); +// } +// +// INLINE ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref> +// operator,( +// const _private_bit_ref &a2) const { +// return ap_concat_ref<1, _private_bit_ref, 1, _private_bit_ref>( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast<_private_bit_ref&>(a2)); +// } +// +// template +// INLINE ap_concat_ref<1, _private_bit_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> > +// operator,(const ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> &a2) const { +// return ap_concat_ref<1, _private_bit_ref, _AP_W2 + _AP_W3, +// ap_concat_ref<_AP_W2, _AP_T2, _AP_W3, _AP_T3> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast&>(a2)); +// } +// +// template +// INLINE ap_concat_ref< +// 1, _private_bit_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> > +// operator,(const af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2> +// &a2) const { +// return ap_concat_ref< +// 1, _private_bit_ref, _AP_W2, +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, _AP_N2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast< +// af_range_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2>&>(a2)); +// } +// +// template +// INLINE +// ap_concat_ref<1, _private_bit_ref, 1, +// af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2> > +// operator,(const af_bit_ref<_AP_W2, _AP_I2, _AP_S2, _AP_Q2, _AP_O2, +// _AP_N2> +// &a2) const { +// return ap_concat_ref<1, _private_bit_ref, 1, af_bit_ref<_AP_W2, +// _AP_I2, _AP_S2, +// _AP_Q2, _AP_O2, +// _AP_N2> >( +// const_cast<_private_bit_ref<_AP_W, _AP_S>&>(*this), +// const_cast&>( +// a2)); +// } + + template + INLINE bool operator==(const _private_bit_ref<_AP_W2, _AP_S2>& op) const { + return get() == op.get(); + } + + template + INLINE bool operator!=(const _private_bit_ref<_AP_W2, _AP_S2>& 
op) const { + return get() != op.get(); + } + + INLINE bool get() const { return operator bool(); } + + // template + // INLINE void set(const ap_private<_AP_W3, false>& val) { + // operator=(val); + // } + + // INLINE bool operator~() const { + // bool bit = (d_bv)[d_index]; + // return bit ? false : true; + // } + + INLINE int length() const { return 1; } + + // INLINE std::string to_string() const { + // bool val = get(); + // return val ? "1" : "0"; + // } + +}; // struct _private_bit_ref. + +// char a[100]; +// char* ptr = a; +// ap_int<2> n = 3; +// char* ptr2 = ptr + n*2; +// avoid ambiguous errors +#define OP_BIN_MIX_PTR(BIN_OP) \ + template \ + INLINE PTR_TYPE* operator BIN_OP(PTR_TYPE* i_op, \ + const ap_private<_AP_W, _AP_S>& op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE PTR_TYPE* operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, \ + PTR_TYPE* i_op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return op2 BIN_OP i_op; \ + } + +OP_BIN_MIX_PTR(+) +OP_BIN_MIX_PTR(-) +#undef OP_BIN_MIX_PTR + +// float OP ap_int +// when ap_int's width > 64, then trunc ap_int to ap_int<64> +#define OP_BIN_MIX_FLOAT(BIN_OP, C_TYPE) \ + template \ + INLINE C_TYPE operator BIN_OP(C_TYPE i_op, \ + const ap_private<_AP_W, _AP_S>& op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return i_op BIN_OP op2; \ + } \ + template \ + INLINE C_TYPE operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, \ + C_TYPE i_op) { \ + typename ap_private<_AP_W, _AP_S>::ValType op2 = op; \ + return op2 BIN_OP i_op; \ + } + +#define OPS_MIX_FLOAT(C_TYPE) \ + OP_BIN_MIX_FLOAT(*, C_TYPE) \ + OP_BIN_MIX_FLOAT(/, C_TYPE) \ + OP_BIN_MIX_FLOAT(+, C_TYPE) \ + OP_BIN_MIX_FLOAT(-, C_TYPE) + +OPS_MIX_FLOAT(float) +OPS_MIX_FLOAT(double) +#undef OP_BIN_MIX_FLOAT +#undef OPS_MIX_FLOAT + +/// Operators mixing Integers with AP_Int +// ---------------------------------------------------------------- + +// partially specialize template argument _AP_C in order that: +// for _AP_W > 64, we will explicitly convert operand with native data type +// into corresponding ap_private +// for _AP_W <= 64, we will implicitly convert operand with ap_private into +// (unsigned) long long +#define OP_BIN_MIX_INT(BIN_OP, C_TYPE, _AP_WI, _AP_SI, RTYPE) \ + template \ + INLINE \ + typename ap_private<_AP_WI, _AP_SI>::template RType<_AP_W, _AP_S>::RTYPE \ + operator BIN_OP(C_TYPE i_op, const ap_private<_AP_W, _AP_S>& op) { \ + return ap_private<_AP_WI, _AP_SI>(i_op).operator BIN_OP(op); \ + } \ + template \ + INLINE \ + typename ap_private<_AP_W, _AP_S>::template RType<_AP_WI, _AP_SI>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, C_TYPE i_op) { \ + return op.operator BIN_OP(ap_private<_AP_WI, _AP_SI>(i_op)); \ + } + +#define OP_REL_MIX_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(const ap_private<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return op.operator REL_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const ap_private<_AP_W, _AP_S, false>& op) { \ + return ap_private<_AP_W2, _AP_S2>(op2).operator REL_OP(op); \ + } + +#define OP_ASSIGN_MIX_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ + return op.operator ASSIGN_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } + +#define OP_BIN_SHIFT_INT(BIN_OP, C_TYPE, _AP_WI, _AP_SI, RTYPE) \ + template \ 
+ C_TYPE operator BIN_OP(C_TYPE i_op, \ + const ap_private<_AP_W, _AP_S, false>& op) { \ + return i_op BIN_OP(op.get_VAL()); \ + } \ + template \ + INLINE \ + typename ap_private<_AP_W, _AP_S>::template RType<_AP_WI, _AP_SI>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W, _AP_S>& op, C_TYPE i_op) { \ + return op.operator BIN_OP(i_op); \ + } + +#define OP_ASSIGN_RSHIFT_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ + op = op.operator>>(op2); \ + return op; \ + } + +#define OP_ASSIGN_LSHIFT_INT(ASSIGN_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE ap_private<_AP_W, _AP_S>& operator ASSIGN_OP( \ + ap_private<_AP_W, _AP_S>& op, C_TYPE op2) { \ + op = op.operator<<(op2); \ + return op; \ + } + +#define OPS_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ + OP_BIN_MIX_INT(*, C_TYPE, (_AP_W2), (_AP_S2), mult) \ + OP_BIN_MIX_INT(+, C_TYPE, (_AP_W2), (_AP_S2), plus) \ + OP_BIN_MIX_INT(-, C_TYPE, (_AP_W2), (_AP_S2), minus) \ + OP_BIN_MIX_INT(/, C_TYPE, (_AP_W2), (_AP_S2), div) \ + OP_BIN_MIX_INT(%, C_TYPE, (_AP_W2), (_AP_S2), mod) \ + OP_BIN_MIX_INT(&, C_TYPE, (_AP_W2), (_AP_S2), logic) \ + OP_BIN_MIX_INT(|, C_TYPE, (_AP_W2), (_AP_S2), logic) \ + OP_BIN_MIX_INT (^, C_TYPE, (_AP_W2), (_AP_S2), logic) \ + OP_BIN_SHIFT_INT(>>, C_TYPE, (_AP_W2), (_AP_S2), arg1) \ + OP_BIN_SHIFT_INT(<<, C_TYPE, (_AP_W2), (_AP_S2), arg1) \ + \ + OP_ASSIGN_MIX_INT(+=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(-=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(*=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(/=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(%=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(&=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(|=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_MIX_INT(^=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_RSHIFT_INT(>>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_ASSIGN_LSHIFT_INT(<<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + \ + OP_REL_MIX_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ + OP_REL_MIX_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) + +OPS_MIX_INT(bool, 1, false) +OPS_MIX_INT(char, 8, CHAR_IS_SIGNED) +OPS_MIX_INT(signed char, 8, true) +OPS_MIX_INT(unsigned char, 8, false) +OPS_MIX_INT(short, sizeof(short) * 8, true) +OPS_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) +OPS_MIX_INT(int, sizeof(int) * 8, true) +OPS_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) +OPS_MIX_INT(long, sizeof(long) * 8, true) +OPS_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) +OPS_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) +OPS_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) + +#undef OP_BIN_MIX_INT +#undef OP_BIN_SHIFT_INT +#undef OP_ASSIGN_MIX_INT +#undef OP_ASSIGN_RSHIFT_INT +#undef OP_ASSIGN_LSHIFT_INT +#undef OP_REL_MIX_INT +#undef OPS_MIX_INT + +#define OP_BIN_MIX_RANGE(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const _private_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<_AP_W1, false>(op1).operator BIN_OP(op2); \ + } \ + template \ + INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<_AP_W2, \ + _AP_S2>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const 
_private_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator BIN_OP(ap_private<_AP_W2, false>(op2)); \ + } + +#define OP_ASSIGN_MIX_RANGE(ASSIGN_OP) \ + template \ + INLINE ap_private<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator ASSIGN_OP(ap_private<_AP_W2, false>(op2)); \ + } \ + template \ + INLINE _private_range_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + _private_range_ref<_AP_W1, _AP_S1>& op1, \ + ap_private<_AP_W2, _AP_S2>& op2) { \ + ap_private<_AP_W1, false> tmp(op1); \ + tmp.operator ASSIGN_OP(op2); \ + op1 = tmp; \ + return op1; \ + } + +#define OP_REL_MIX_RANGE(REL_OP) \ + template \ + INLINE bool operator REL_OP(const _private_range_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<_AP_W1, false>(op1).operator REL_OP(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_range_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator REL_OP(op2.operator ap_private<_AP_W2, false>()); \ + } + +OP_BIN_MIX_RANGE(+, plus) +OP_BIN_MIX_RANGE(-, minus) +OP_BIN_MIX_RANGE(*, mult) +OP_BIN_MIX_RANGE(/, div) +OP_BIN_MIX_RANGE(%, mod) +OP_BIN_MIX_RANGE(&, logic) +OP_BIN_MIX_RANGE(|, logic) +OP_BIN_MIX_RANGE(^, logic) +OP_BIN_MIX_RANGE(>>, arg1) +OP_BIN_MIX_RANGE(<<, arg1) +#undef OP_BIN_MIX_RANGE + +OP_ASSIGN_MIX_RANGE(+=) +OP_ASSIGN_MIX_RANGE(-=) +OP_ASSIGN_MIX_RANGE(*=) +OP_ASSIGN_MIX_RANGE(/=) +OP_ASSIGN_MIX_RANGE(%=) +OP_ASSIGN_MIX_RANGE(&=) +OP_ASSIGN_MIX_RANGE(|=) +OP_ASSIGN_MIX_RANGE(^=) +OP_ASSIGN_MIX_RANGE(>>=) +OP_ASSIGN_MIX_RANGE(<<=) +#undef OP_ASSIGN_MIX_RANGE + +OP_REL_MIX_RANGE(>) +OP_REL_MIX_RANGE(<) +OP_REL_MIX_RANGE(>=) +OP_REL_MIX_RANGE(<=) +OP_REL_MIX_RANGE(==) +OP_REL_MIX_RANGE(!=) +#undef OP_REL_MIX_RANGE + +#define OP_BIN_MIX_BIT(BIN_OP, RTYPE) \ + template \ + INLINE typename ap_private<1, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP(const _private_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<1, false>(op1).operator BIN_OP(op2); \ + } \ + template \ + INLINE typename ap_private<_AP_W1, _AP_S1>::template RType<1, false>::RTYPE \ + operator BIN_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator BIN_OP(ap_private<1, false>(op2)); \ + } + +#define OP_ASSIGN_MIX_BIT(ASSIGN_OP) \ + template \ + INLINE ap_private<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + ap_private<_AP_W1, _AP_S1>& op1, \ + _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator ASSIGN_OP(ap_private<1, false>(op2)); \ + } \ + template \ + INLINE _private_bit_ref<_AP_W1, _AP_S1>& operator ASSIGN_OP( \ + _private_bit_ref<_AP_W1, _AP_S1>& op1, \ + ap_private<_AP_W2, _AP_S2>& op2) { \ + ap_private<1, false> tmp(op1); \ + tmp.operator ASSIGN_OP(op2); \ + op1 = tmp; \ + return op1; \ + } + +#define OP_REL_MIX_BIT(REL_OP) \ + template \ + INLINE bool operator REL_OP(const _private_bit_ref<_AP_W1, _AP_S1>& op1, \ + const ap_private<_AP_W2, _AP_S2>& op2) { \ + return ap_private<_AP_W1, false>(op1).operator REL_OP(op2); \ + } \ + template \ + INLINE bool operator REL_OP(const ap_private<_AP_W1, _AP_S1>& op1, \ + const _private_bit_ref<_AP_W2, _AP_S2>& op2) { \ + return op1.operator REL_OP(ap_private<1, false>(op2)); \ + } + +OP_ASSIGN_MIX_BIT(+=) +OP_ASSIGN_MIX_BIT(-=) +OP_ASSIGN_MIX_BIT(*=) +OP_ASSIGN_MIX_BIT(/=) +OP_ASSIGN_MIX_BIT(%=) +OP_ASSIGN_MIX_BIT(&=) 
+OP_ASSIGN_MIX_BIT(|=) +OP_ASSIGN_MIX_BIT(^=) +OP_ASSIGN_MIX_BIT(>>=) +OP_ASSIGN_MIX_BIT(<<=) +#undef OP_ASSIGN_MIX_BIT + +OP_BIN_MIX_BIT(+, plus) +OP_BIN_MIX_BIT(-, minus) +OP_BIN_MIX_BIT(*, mult) +OP_BIN_MIX_BIT(/, div) +OP_BIN_MIX_BIT(%, mod) +OP_BIN_MIX_BIT(&, logic) +OP_BIN_MIX_BIT(|, logic) +OP_BIN_MIX_BIT(^, logic) +OP_BIN_MIX_BIT(>>, arg1) +OP_BIN_MIX_BIT(<<, arg1) +#undef OP_BIN_MIX_BIT + +OP_REL_MIX_BIT(>) +OP_REL_MIX_BIT(<) +OP_REL_MIX_BIT(<=) +OP_REL_MIX_BIT(>=) +OP_REL_MIX_BIT(==) +OP_REL_MIX_BIT(!=) +#undef OP_REL_MIX_BIT + +#define REF_REL_OP_MIX_INT(REL_OP, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE bool operator REL_OP(const _private_range_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return (ap_private<_AP_W, false>(op)) \ + . \ + operator REL_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const _private_range_ref<_AP_W, _AP_S>& op) { \ + return ap_private<_AP_W2, _AP_S2>(op2).operator REL_OP( \ + ap_private<_AP_W, false>(op)); \ + } \ + template \ + INLINE bool operator REL_OP(const _private_bit_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return (bool(op))REL_OP op2; \ + } \ + template \ + INLINE bool operator REL_OP(C_TYPE op2, \ + const _private_bit_ref<_AP_W, _AP_S>& op) { \ + return op2 REL_OP(bool(op)); \ + } + +#define REF_REL_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ + REF_REL_OP_MIX_INT(>, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(<, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(>=, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(<=, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(==, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_REL_OP_MIX_INT(!=, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_REL_MIX_INT(bool, 1, false) +REF_REL_MIX_INT(char, 8, CHAR_IS_SIGNED) +REF_REL_MIX_INT(signed char, 8, true) +REF_REL_MIX_INT(unsigned char, 8, false) +REF_REL_MIX_INT(short, sizeof(short) * 8, true) +REF_REL_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) +REF_REL_MIX_INT(int, sizeof(int) * 8, true) +REF_REL_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) +REF_REL_MIX_INT(long, sizeof(long) * 8, true) +REF_REL_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) +REF_REL_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) +REF_REL_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) +#undef REF_REL_OP_MIX_INT +#undef REF_REL_MIX_INT + +#define REF_BIN_OP_MIX_INT(BIN_OP, RTYPE, C_TYPE, _AP_W2, _AP_S2) \ + template \ + INLINE \ + typename ap_private<_AP_W, false>::template RType<_AP_W2, _AP_S2>::RTYPE \ + operator BIN_OP(const _private_range_ref<_AP_W, _AP_S>& op, \ + C_TYPE op2) { \ + return (ap_private<_AP_W, false>(op)) \ + . 
\ + operator BIN_OP(ap_private<_AP_W2, _AP_S2>(op2)); \ + } \ + template \ + INLINE \ + typename ap_private<_AP_W2, _AP_S2>::template RType<_AP_W, false>::RTYPE \ + operator BIN_OP(C_TYPE op2, \ + const _private_range_ref<_AP_W, _AP_S>& op) { \ + return ap_private<_AP_W2, _AP_S2>(op2).operator BIN_OP( \ + ap_private<_AP_W, false>(op)); \ + } + +#define REF_BIN_MIX_INT(C_TYPE, _AP_W2, _AP_S2) \ + REF_BIN_OP_MIX_INT(+, plus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(-, minus, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(*, mult, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(/, div, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(%, mod, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(&, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(|, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(^, logic, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(>>, arg1, C_TYPE, (_AP_W2), (_AP_S2)) \ + REF_BIN_OP_MIX_INT(<<, arg1, C_TYPE, (_AP_W2), (_AP_S2)) + +REF_BIN_MIX_INT(bool, 1, false) +REF_BIN_MIX_INT(char, 8, CHAR_IS_SIGNED) +REF_BIN_MIX_INT(signed char, 8, true) +REF_BIN_MIX_INT(unsigned char, 8, false) +REF_BIN_MIX_INT(short, sizeof(short) * 8, true) +REF_BIN_MIX_INT(unsigned short, sizeof(unsigned short) * 8, false) +REF_BIN_MIX_INT(int, sizeof(int) * 8, true) +REF_BIN_MIX_INT(unsigned int, sizeof(unsigned int) * 8, false) +REF_BIN_MIX_INT(long, sizeof(long) * 8, true) +REF_BIN_MIX_INT(unsigned long, sizeof(unsigned long) * 8, false) +REF_BIN_MIX_INT(ap_slong, sizeof(ap_slong) * 8, true) +REF_BIN_MIX_INT(ap_ulong, sizeof(ap_ulong) * 8, false) +#undef REF_BIN_OP_MIX_INT +#undef REF_BIN_MIX_INT + +#define REF_BIN_OP(BIN_OP, RTYPE) \ + template \ + INLINE \ + typename ap_private<_AP_W, false>::template RType<_AP_W2, false>::RTYPE \ + operator BIN_OP(const _private_range_ref<_AP_W, _AP_S>& lhs, \ + const _private_range_ref<_AP_W2, _AP_S2>& rhs) { \ + return ap_private<_AP_W, false>(lhs).operator BIN_OP( \ + ap_private<_AP_W2, false>(rhs)); \ + } + +REF_BIN_OP(+, plus) +REF_BIN_OP(-, minus) +REF_BIN_OP(*, mult) +REF_BIN_OP(/, div) +REF_BIN_OP(%, mod) +REF_BIN_OP(&, logic) +REF_BIN_OP(|, logic) +REF_BIN_OP(^, logic) +REF_BIN_OP(>>, arg1) +REF_BIN_OP(<<, arg1) +#undef REF_BIN_OP + +//************************************************************************ +// Implement +// ap_private = ap_concat_ref OP ap_concat_ref +// for operators +, -, *, /, %, >>, <<, &, |, ^ +// Without these operators the operands are converted to int64 and +// larger results lose informations (higher order bits). +// +// operand OP +// / | +// left-concat right-concat +// / | / | +// +// +// _AP_LW1, _AP_LT1 (width and type of left-concat's left side) +// _AP_LW2, _AP_LT2 (width and type of left-concat's right side) +// Similarly for RHS of operand OP: _AP_RW1, AP_RW2, _AP_RT1, _AP_RT2 +// +// In Verilog 2001 result of concatenation is always unsigned even +// when both sides are signed. +//************************************************************************ + +#endif // ifndef __AP_PRIVATE_H__ + +// -*- cpp -*- diff --git a/hls4ml/templates/vivado/ap_types/hls_stream.h b/hls4ml/templates/vivado/ap_types/hls_stream.h index 317125d351..f516c39e08 100644 --- a/hls4ml/templates/vivado/ap_types/hls_stream.h +++ b/hls4ml/templates/vivado/ap_types/hls_stream.h @@ -1,263 +1,263 @@ -/* -#- (c) Copyright 2011-2018 Xilinx, Inc. All rights reserved. -#- -#- This file contains confidential and proprietary information -#- of Xilinx, Inc. and is protected under U.S. 
and -#- international copyright and other intellectual property -#- laws. -#- -#- DISCLAIMER -#- This disclaimer is not a license and does not grant any -#- rights to the materials distributed herewith. Except as -#- otherwise provided in a valid license issued to you by -#- Xilinx, and to the maximum extent permitted by applicable -#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND -#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES -#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING -#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- -#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and -#- (2) Xilinx shall not be liable (whether in contract or tort, -#- including negligence, or under any other theory of -#- liability) for any loss or damage of any kind or nature -#- related to, arising under or in connection with these -#- materials, including for any direct, or any indirect, -#- special, incidental, or consequential loss or damage -#- (including loss of data, profits, goodwill, or any type of -#- loss or damage suffered as a result of any action brought -#- by a third party) even if such damage or loss was -#- reasonably foreseeable or Xilinx had been advised of the -#- possibility of the same. -#- -#- CRITICAL APPLICATIONS -#- Xilinx products are not designed or intended to be fail- -#- safe, or for use in any application requiring fail-safe -#- performance, such as life-support or safety devices or -#- systems, Class III medical devices, nuclear facilities, -#- applications related to the deployment of airbags, or any -#- other applications that could lead to death, personal -#- injury, or severe property or environmental damage -#- (individually and collectively, "Critical -#- Applications"). Customer assumes the sole risk and -#- liability of any use of Xilinx products in Critical -#- Applications, subject only to applicable laws and -#- regulations governing limitations on product liability. -#- -#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS -#- PART OF THIS FILE AT ALL TIMES. -#- ************************************************************************ - - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef X_HLS_STREAM_SIM_H -#define X_HLS_STREAM_SIM_H - -/* - * This file contains a C++ model of hls::stream. - * It defines C simulation model. 
- */
-#ifndef __cplusplus
-
-#error C++ is required to include this header file
-
-#else
-
-//////////////////////////////////////////////
-// C level simulation models for hls::stream
-//////////////////////////////////////////////
-#include <iostream>
-#include <deque>
-#include <sstream>
-#include <string>
-#include <typeinfo>
-
-#ifdef HLS_STREAM_THREAD_SAFE
-#include <mutex>
-#include <condition_variable>
-#endif
-
-#ifndef _MSC_VER
-#include <cxxabi.h>
-#include <cstdlib>
-#endif
-
-namespace hls {
-
-template<typename __STREAM_T__>
-class stream
-{
-  protected:
-    std::string _name;
-    std::deque<__STREAM_T__> _data; // container for the elements
-#ifdef HLS_STREAM_THREAD_SAFE
-    std::mutex _mutex;
-    std::condition_variable _condition_var;
-#endif
-
-  public:
-    /// Constructors
-    // Keep consistent with the synthesis model's constructors
-    stream() {
-      static unsigned _counter = 1;
-      std::stringstream ss;
-#ifndef _MSC_VER
-      char* _demangle_name = abi::__cxa_demangle(typeid(*this).name(), 0, 0, 0);
-      if (_demangle_name) {
-        _name = _demangle_name;
-        free(_demangle_name);
-      }
-      else {
-        _name = "hls_stream";
-      }
-#else
-      _name = typeid(*this).name();
-#endif
-
-      ss << _counter++;
-      _name += "." + ss.str();
-    }
-
-    stream(const std::string name) {
-    // default constructor,
-    // capacity set to predefined maximum
-      _name = name;
-    }
-
-  /// Make copy constructor and assignment operator private
-  private:
-    stream(const stream< __STREAM_T__ >& chn):
-      _name(chn._name), _data(chn._data) {
-    }
-
-    stream& operator = (const stream< __STREAM_T__ >& chn) {
-      _name = chn._name;
-      _data = chn._data;
-      return *this;
-    }
-
-  public:
-    /// Overload >> and << operators to implement read() and write()
-    void operator >> (__STREAM_T__& rdata) {
-      read(rdata);
-    }
-
-    void operator << (const __STREAM_T__& wdata) {
-      write(wdata);
-    }
-
-
-  public:
-    /// Destructor
-    /// Check status of the queue
-    virtual ~stream() {
-      if (!_data.empty())
-      {
-        std::cout << "WARNING: Hls::stream '"
-                  << _name
-                  << "' contains leftover data,"
-                  << " which may result in RTL simulation hanging."
-                  << std::endl;
-      }
-    }
-
-    /// Status of the queue
-    bool empty() {
-#ifdef HLS_STREAM_THREAD_SAFE
-      std::lock_guard<std::mutex> lg(_mutex);
-#endif
-      return _data.empty();
-    }
-
-    bool full() const { return false; }
-
-    /// Blocking read
-    void read(__STREAM_T__& head) {
-      head = read();
-    }
-
-#ifdef HLS_STREAM_THREAD_SAFE
-    __STREAM_T__ read() {
-      std::unique_lock<std::mutex> ul(_mutex);
-      while (_data.empty()) {
-        _condition_var.wait(ul);
-      }
-
-      __STREAM_T__ elem;
-      elem = _data.front();
-      _data.pop_front();
-      return elem;
-    }
-#else
-    __STREAM_T__ read() {
-      __STREAM_T__ elem;
-      if (_data.empty()) {
-        std::cout << "WARNING: Hls::stream '"
-                  << _name
-                  << "' is read while empty,"
-                  << " which may result in RTL simulation hanging."
- << std::endl; - elem = __STREAM_T__(); - } else { - elem = _data.front(); - _data.pop_front(); - } - return elem; - } -#endif - - /// Blocking write - void write(const __STREAM_T__& tail) { -#ifdef HLS_STREAM_THREAD_SAFE - std::unique_lock ul(_mutex); -#endif - _data.push_back(tail); -#ifdef HLS_STREAM_THREAD_SAFE - _condition_var.notify_one(); -#endif - } - - /// Nonblocking read - bool read_nb(__STREAM_T__& head) { -#ifdef HLS_STREAM_THREAD_SAFE - std::lock_guard lg(_mutex); -#endif - bool is_empty = _data.empty(); - if (is_empty) { - head = __STREAM_T__(); - } else { - __STREAM_T__ elem(_data.front()); - _data.pop_front(); - head = elem; - } - return !is_empty; - } - - /// Nonblocking write - bool write_nb(const __STREAM_T__& tail) { - bool is_full = full(); - write(tail); - return !is_full; - } - - /// Fifo size - size_t size() { - return _data.size(); - } -}; - -} // namespace hls - -#endif // __cplusplus -#endif // X_HLS_STREAM_SIM_H - +/* +#- (c) Copyright 2011-2018 Xilinx, Inc. All rights reserved. +#- +#- This file contains confidential and proprietary information +#- of Xilinx, Inc. and is protected under U.S. and +#- international copyright and other intellectual property +#- laws. +#- +#- DISCLAIMER +#- This disclaimer is not a license and does not grant any +#- rights to the materials distributed herewith. Except as +#- otherwise provided in a valid license issued to you by +#- Xilinx, and to the maximum extent permitted by applicable +#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +#- (2) Xilinx shall not be liable (whether in contract or tort, +#- including negligence, or under any other theory of +#- liability) for any loss or damage of any kind or nature +#- related to, arising under or in connection with these +#- materials, including for any direct, or any indirect, +#- special, incidental, or consequential loss or damage +#- (including loss of data, profits, goodwill, or any type of +#- loss or damage suffered as a result of any action brought +#- by a third party) even if such damage or loss was +#- reasonably foreseeable or Xilinx had been advised of the +#- possibility of the same. +#- +#- CRITICAL APPLICATIONS +#- Xilinx products are not designed or intended to be fail- +#- safe, or for use in any application requiring fail-safe +#- performance, such as life-support or safety devices or +#- systems, Class III medical devices, nuclear facilities, +#- applications related to the deployment of airbags, or any +#- other applications that could lead to death, personal +#- injury, or severe property or environmental damage +#- (individually and collectively, "Critical +#- Applications"). Customer assumes the sole risk and +#- liability of any use of Xilinx products in Critical +#- Applications, subject only to applicable laws and +#- regulations governing limitations on product liability. +#- +#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +#- PART OF THIS FILE AT ALL TIMES. +#- ************************************************************************ + + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef X_HLS_STREAM_SIM_H +#define X_HLS_STREAM_SIM_H + +/* + * This file contains a C++ model of hls::stream. + * It defines C simulation model. + */ +#ifndef __cplusplus + +#error C++ is required to include this header file + +#else + +////////////////////////////////////////////// +// C level simulation models for hls::stream +////////////////////////////////////////////// +#include +#include +#include +#include +#include + +#ifdef HLS_STREAM_THREAD_SAFE +#include +#include +#endif + +#ifndef _MSC_VER +#include +#include +#endif + +namespace hls { + +template +class stream +{ + protected: + std::string _name; + std::deque<__STREAM_T__> _data; // container for the elements +#ifdef HLS_STREAM_THREAD_SAFE + std::mutex _mutex; + std::condition_variable _condition_var; +#endif + + public: + /// Constructors + // Keep consistent with the synthesis model's constructors + stream() { + static unsigned _counter = 1; + std::stringstream ss; +#ifndef _MSC_VER + char* _demangle_name = abi::__cxa_demangle(typeid(*this).name(), 0, 0, 0); + if (_demangle_name) { + _name = _demangle_name; + free(_demangle_name); + } + else { + _name = "hls_stream"; + } +#else + _name = typeid(*this).name(); +#endif + + ss << _counter++; + _name += "." + ss.str(); + } + + stream(const std::string name) { + // default constructor, + // capacity set to predefined maximum + _name = name; + } + + /// Make copy constructor and assignment operator private + private: + stream(const stream< __STREAM_T__ >& chn): + _name(chn._name), _data(chn._data) { + } + + stream& operator = (const stream< __STREAM_T__ >& chn) { + _name = chn._name; + _data = chn._data; + return *this; + } + + public: + /// Overload >> and << operators to implement read() and write() + void operator >> (__STREAM_T__& rdata) { + read(rdata); + } + + void operator << (const __STREAM_T__& wdata) { + write(wdata); + } + + + public: + /// Destructor + /// Check status of the queue + virtual ~stream() { + if (!_data.empty()) + { + std::cout << "WARNING: Hls::stream '" + << _name + << "' contains leftover data," + << " which may result in RTL simulation hanging." + << std::endl; + } + } + + /// Status of the queue + bool empty() { +#ifdef HLS_STREAM_THREAD_SAFE + std::lock_guard lg(_mutex); +#endif + return _data.empty(); + } + + bool full() const { return false; } + + /// Blocking read + void read(__STREAM_T__& head) { + head = read(); + } + +#ifdef HLS_STREAM_THREAD_SAFE + __STREAM_T__ read() { + std::unique_lock ul(_mutex); + while (_data.empty()) { + _condition_var.wait(ul); + } + + __STREAM_T__ elem; + elem = _data.front(); + _data.pop_front(); + return elem; + } +#else + __STREAM_T__ read() { + __STREAM_T__ elem; + if (_data.empty()) { + std::cout << "WARNING: Hls::stream '" + << _name + << "' is read while empty," + << " which may result in RTL simulation hanging." 
+ << std::endl; + elem = __STREAM_T__(); + } else { + elem = _data.front(); + _data.pop_front(); + } + return elem; + } +#endif + + /// Blocking write + void write(const __STREAM_T__& tail) { +#ifdef HLS_STREAM_THREAD_SAFE + std::unique_lock ul(_mutex); +#endif + _data.push_back(tail); +#ifdef HLS_STREAM_THREAD_SAFE + _condition_var.notify_one(); +#endif + } + + /// Nonblocking read + bool read_nb(__STREAM_T__& head) { +#ifdef HLS_STREAM_THREAD_SAFE + std::lock_guard lg(_mutex); +#endif + bool is_empty = _data.empty(); + if (is_empty) { + head = __STREAM_T__(); + } else { + __STREAM_T__ elem(_data.front()); + _data.pop_front(); + head = elem; + } + return !is_empty; + } + + /// Nonblocking write + bool write_nb(const __STREAM_T__& tail) { + bool is_full = full(); + write(tail); + return !is_full; + } + + /// Fifo size + size_t size() { + return _data.size(); + } +}; + +} // namespace hls + +#endif // __cplusplus +#endif // X_HLS_STREAM_SIM_H + diff --git a/hls4ml/templates/vivado/ap_types/utils/x_hls_utils.h b/hls4ml/templates/vivado/ap_types/utils/x_hls_utils.h index 759000ed78..3e751c36bf 100644 --- a/hls4ml/templates/vivado/ap_types/utils/x_hls_utils.h +++ b/hls4ml/templates/vivado/ap_types/utils/x_hls_utils.h @@ -1,80 +1,80 @@ -#ifndef X_HLS_UTILS_H -#define X_HLS_UTILS_H -#include "ap_fixed.h" -#include - -namespace hls { - - template - class numeric_limits { - public: - static T max() { return std::numeric_limits::max(); } - static T min() { return std::numeric_limits::min(); } - static T epsilon() { return std::numeric_limits::epsilon(); } - }; - - template - class numeric_limits > { - public: - static ap_fixed max() { - ap_int m = ::hls::numeric_limits >::max(); - ap_fixed x; - x(W-1,0) = m(W-1,0); - return x; - } - static ap_fixed min() { - ap_int m = ::hls::numeric_limits >::min(); - ap_fixed x; - x(W-1,0) = m(W-1,0); - return x; - } - static ap_fixed epsilon() { - ap_fixed x = 0; - x[0] = 1; - return x; - } - }; - - template - class numeric_limits > { - public: - static ap_ufixed max() { - ap_uint m = ::hls::numeric_limits >::max(); - ap_ufixed x; - x(W-1,0) = m(W-1,0); - return x; - } - static ap_ufixed min() { return 0; } - static ap_ufixed epsilon() { - ap_ufixed x = 0; - x[0] = 1; - return x; - } - }; - - template - class numeric_limits > { - public: - static ap_int max() { ap_int m = min(); return ~m; } - static ap_int min() { ap_int m = 0; m[W-1] = 1; return m; } - static ap_int epsilon() { - ap_int x = 0; - x[0] = 1; - return x; - } - }; - - template - class numeric_limits > { - public: - static ap_uint max() { ap_uint zero = 0; return ~zero; } - static ap_uint min() { return 0; } - static ap_uint epsilon() { - ap_uint x = 0; - x[0] = 1; - return x; - } - }; -} - -#endif +#ifndef X_HLS_UTILS_H +#define X_HLS_UTILS_H +#include "ap_fixed.h" +#include + +namespace hls { + + template + class numeric_limits { + public: + static T max() { return std::numeric_limits::max(); } + static T min() { return std::numeric_limits::min(); } + static T epsilon() { return std::numeric_limits::epsilon(); } + }; + + template + class numeric_limits > { + public: + static ap_fixed max() { + ap_int m = ::hls::numeric_limits >::max(); + ap_fixed x; + x(W-1,0) = m(W-1,0); + return x; + } + static ap_fixed min() { + ap_int m = ::hls::numeric_limits >::min(); + ap_fixed x; + x(W-1,0) = m(W-1,0); + return x; + } + static ap_fixed epsilon() { + ap_fixed x = 0; + x[0] = 1; + return x; + } + }; + + template + class numeric_limits > { + public: + static ap_ufixed max() { + ap_uint m = 
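// Editor's note on the pattern used throughout these numeric_limits
// specializations: the extreme values are built from the matching
// ap_int/ap_uint bit pattern and copied with the raw part-select
// x(W - 1, 0), so no fixed-point rounding or saturation is applied, and
// epsilon() sets only the LSB via x[0]. For example,
// hls::numeric_limits<ap_fixed<8, 4> >::max() is 0111.1111 in binary, i.e. 7.9375.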
::hls::numeric_limits >::max(); + ap_ufixed x; + x(W-1,0) = m(W-1,0); + return x; + } + static ap_ufixed min() { return 0; } + static ap_ufixed epsilon() { + ap_ufixed x = 0; + x[0] = 1; + return x; + } + }; + + template + class numeric_limits > { + public: + static ap_int max() { ap_int m = min(); return ~m; } + static ap_int min() { ap_int m = 0; m[W-1] = 1; return m; } + static ap_int epsilon() { + ap_int x = 0; + x[0] = 1; + return x; + } + }; + + template + class numeric_limits > { + public: + static ap_uint max() { ap_uint zero = 0; return ~zero; } + static ap_uint min() { return 0; } + static ap_uint epsilon() { + ap_uint x = 0; + x[0] = 1; + return x; + } + }; +} + +#endif diff --git a/hls4ml/templates/vivado/build_lib.sh b/hls4ml/templates/vivado/build_lib.sh index 68e3538bcb..19f2d0a1c8 100755 --- a/hls4ml/templates/vivado/build_lib.sh +++ b/hls4ml/templates/vivado/build_lib.sh @@ -1,17 +1,17 @@ -#!/bin/bash - -CC=g++ -if [[ "$OSTYPE" == "linux-gnu" ]]; then - CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique" -elif [[ "$OSTYPE" == "darwin"* ]]; then - CFLAGS="-O3 -fPIC -std=c++11" -fi -LDFLAGS= -INCFLAGS="-Ifirmware/ap_types/" -PROJECT=myproject -LIB_STAMP=mystamp - -${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o -${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o -${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so -rm -f *.o +#!/bin/bash + +CC=g++ +if [[ "$OSTYPE" == "linux-gnu" ]]; then + CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique" +elif [[ "$OSTYPE" == "darwin"* ]]; then + CFLAGS="-O3 -fPIC -std=c++11" +fi +LDFLAGS= +INCFLAGS="-Ifirmware/ap_types/" +PROJECT=myproject +LIB_STAMP=mystamp + +${CC} ${CFLAGS} ${INCFLAGS} -c firmware/${PROJECT}.cpp -o ${PROJECT}.o +${CC} ${CFLAGS} ${INCFLAGS} -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so +rm -f *.o diff --git a/hls4ml/templates/vivado/firmware/myproject.cpp b/hls4ml/templates/vivado/firmware/myproject.cpp index 74b58c5cb1..5ba7f118ba 100644 --- a/hls4ml/templates/vivado/firmware/myproject.cpp +++ b/hls4ml/templates/vivado/firmware/myproject.cpp @@ -1,23 +1,23 @@ -#include - -#include "myproject.h" -#include "parameters.h" - -// hls-fpga-machine-learning insert namespace-start - -void myproject( - // hls-fpga-machine-learning insert header -) { - - // hls-fpga-machine-learning insert IO - - // hls-fpga-machine-learning insert load weights - - // **************************************** - // NETWORK INSTANTIATION - // **************************************** - - // hls-fpga-machine-learning insert layers -} - -// hls-fpga-machine-learning insert namespace-end +#include + +#include "myproject.h" +#include "parameters.h" + +// hls-fpga-machine-learning insert namespace-start + +void myproject( + // hls-fpga-machine-learning insert header +) { + + // hls-fpga-machine-learning insert IO + + // hls-fpga-machine-learning insert load weights + + // **************************************** + // NETWORK INSTANTIATION + // **************************************** + + // hls-fpga-machine-learning insert layers +} + +// hls-fpga-machine-learning insert namespace-end diff --git a/hls4ml/templates/vivado/firmware/myproject.h b/hls4ml/templates/vivado/firmware/myproject.h index a56778976b..5b34ae4c02 100644 --- a/hls4ml/templates/vivado/firmware/myproject.h +++ b/hls4ml/templates/vivado/firmware/myproject.h @@ -1,19 +1,19 @@ -#ifndef 
MYPROJECT_H_ -#define MYPROJECT_H_ - -#include "ap_fixed.h" -#include "ap_int.h" -#include "hls_stream.h" - -#include "defines.h" - -// hls-fpga-machine-learning insert namespace-start - -// Prototype of top level function for C-synthesis -void myproject( - // hls-fpga-machine-learning insert header -); - -// hls-fpga-machine-learning insert namespace-end - -#endif +#ifndef MYPROJECT_H_ +#define MYPROJECT_H_ + +#include "ap_fixed.h" +#include "ap_int.h" +#include "hls_stream.h" + +#include "defines.h" + +// hls-fpga-machine-learning insert namespace-start + +// Prototype of top level function for C-synthesis +void myproject( + // hls-fpga-machine-learning insert header +); + +// hls-fpga-machine-learning insert namespace-end + +#endif diff --git a/hls4ml/templates/vivado/myproject_test.cpp b/hls4ml/templates/vivado/myproject_test.cpp index 29a4c816e5..814bb1f3e6 100644 --- a/hls4ml/templates/vivado/myproject_test.cpp +++ b/hls4ml/templates/vivado/myproject_test.cpp @@ -1,94 +1,94 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -#include "firmware/myproject.h" -#include "firmware/nnet_utils/nnet_helpers.h" - -// hls-fpga-machine-learning insert bram - -#define CHECKPOINT 5000 - -namespace nnet { -bool trace_enabled = true; -std::map *trace_outputs = NULL; -size_t trace_type_size = sizeof(double); -} // namespace nnet - -int main(int argc, char **argv) { - // hls-fpga-machine-learning insert namespace - - // load input data from text file - std::ifstream fin("tb_data/tb_input_features.dat"); - // load predictions from text file - std::ifstream fpr("tb_data/tb_output_predictions.dat"); - -#ifdef RTL_SIM - std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; -#else - std::string RESULTS_LOG = "tb_data/csim_results.log"; -#endif - std::ofstream fout(RESULTS_LOG); - - std::string iline; - std::string pline; - int e = 0; - - if (fin.is_open() && fpr.is_open()) { - while (std::getline(fin, iline) && std::getline(fpr, pline)) { - if (e % CHECKPOINT == 0) - std::cout << "Processing input " << e << std::endl; - char *cstr = const_cast(iline.c_str()); - char *current; - std::vector in; - current = strtok(cstr, " "); - while (current != NULL) { - in.push_back(atof(current)); - current = strtok(NULL, " "); - } - cstr = const_cast(pline.c_str()); - std::vector pr; - current = strtok(cstr, " "); - while (current != NULL) { - pr.push_back(atof(current)); - current = strtok(NULL, " "); - } - - // hls-fpga-machine-learning insert data - - // hls-fpga-machine-learning insert top-level-function - - if (e % CHECKPOINT == 0) { - std::cout << "Predictions" << std::endl; - // hls-fpga-machine-learning insert predictions - std::cout << "Quantized predictions" << std::endl; - // hls-fpga-machine-learning insert quantized - } - e++; - - // hls-fpga-machine-learning insert tb-output - } - fin.close(); - fpr.close(); - } else { - std::cout << "INFO: Unable to open input/predictions file, using default input." 
<< std::endl; - - // hls-fpga-machine-learning insert zero - - // hls-fpga-machine-learning insert top-level-function - - // hls-fpga-machine-learning insert output - - // hls-fpga-machine-learning insert tb-output - } - - fout.close(); - std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; - - return 0; -} +#include +#include +#include +#include +#include +#include +#include +#include + +#include "firmware/myproject.h" +#include "firmware/nnet_utils/nnet_helpers.h" + +// hls-fpga-machine-learning insert bram + +#define CHECKPOINT 5000 + +namespace nnet { +bool trace_enabled = true; +std::map *trace_outputs = NULL; +size_t trace_type_size = sizeof(double); +} // namespace nnet + +int main(int argc, char **argv) { + // hls-fpga-machine-learning insert namespace + + // load input data from text file + std::ifstream fin("tb_data/tb_input_features.dat"); + // load predictions from text file + std::ifstream fpr("tb_data/tb_output_predictions.dat"); + +#ifdef RTL_SIM + std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log"; +#else + std::string RESULTS_LOG = "tb_data/csim_results.log"; +#endif + std::ofstream fout(RESULTS_LOG); + + std::string iline; + std::string pline; + int e = 0; + + if (fin.is_open() && fpr.is_open()) { + while (std::getline(fin, iline) && std::getline(fpr, pline)) { + if (e % CHECKPOINT == 0) + std::cout << "Processing input " << e << std::endl; + char *cstr = const_cast(iline.c_str()); + char *current; + std::vector in; + current = strtok(cstr, " "); + while (current != NULL) { + in.push_back(atof(current)); + current = strtok(NULL, " "); + } + cstr = const_cast(pline.c_str()); + std::vector pr; + current = strtok(cstr, " "); + while (current != NULL) { + pr.push_back(atof(current)); + current = strtok(NULL, " "); + } + + // hls-fpga-machine-learning insert data + + // hls-fpga-machine-learning insert top-level-function + + if (e % CHECKPOINT == 0) { + std::cout << "Predictions" << std::endl; + // hls-fpga-machine-learning insert predictions + std::cout << "Quantized predictions" << std::endl; + // hls-fpga-machine-learning insert quantized + } + e++; + + // hls-fpga-machine-learning insert tb-output + } + fin.close(); + fpr.close(); + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input." 
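// Editor's note: this testbench reads one whitespace-separated event per
// line from tb_data/tb_input_features.dat, runs the generated top-level
// function, and logs results alongside the reference values from
// tb_data/tb_output_predictions.dat; if either file is missing it falls
// back to all-zero inputs via the "insert zero" marker below.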
<< std::endl;
+
+        // hls-fpga-machine-learning insert zero
+
+        // hls-fpga-machine-learning insert top-level-function
+
+        // hls-fpga-machine-learning insert output
+
+        // hls-fpga-machine-learning insert tb-output
+    }
+
+    fout.close();
+    std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl;
+
+    return 0;
+}
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_array.h b/hls4ml/templates/vivado/nnet_utils/nnet_array.h
index de1b46c858..d179102a99 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_array.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_array.h
@@ -1,52 +1,52 @@
-#ifndef NNET_ARRAY_H_
-#define NNET_ARRAY_H_
-
-#include <math.h>
-
-namespace nnet {
-
-struct transpose_config {
-    static const unsigned height = 10;
-    static const unsigned width = 10;
-    static const unsigned depth = 10;
-    static constexpr unsigned perm[3] = {2, 0, 1};
-};
-
-template <class data_T, class res_T, typename CONFIG_T>
-void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[CONFIG_T::height * CONFIG_T::width]) {
-    #pragma HLS PIPELINE
-
-    for (int i = 0; i < CONFIG_T::height; i++) {
-        for (int j = 0; j < CONFIG_T::width; j++) {
-            data_t[j * CONFIG_T::height + i] = data[i * CONFIG_T::width + j];
-        }
-    }
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width],
-                  res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) {
-    unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width};
-    unsigned dims_t[3];
-    dims_t[0] = dims[CONFIG_T::perm[0]];
-    dims_t[1] = dims[CONFIG_T::perm[1]];
-    dims_t[2] = dims[CONFIG_T::perm[2]];
-
-    int idx[3] = {0}, idx_t[3] = {0};
-    for (idx[0] = 0; idx[0] < dims[0]; idx[0]++) {
-        for (idx[1] = 0; idx[1] < dims[1]; idx[1]++) {
-            for (idx[2] = 0; idx[2] < dims[2]; idx[2]++) {
-                idx_t[0] = idx[CONFIG_T::perm[0]];
-                idx_t[1] = idx[CONFIG_T::perm[1]];
-                idx_t[2] = idx[CONFIG_T::perm[2]];
-
-                data_t[idx_t[0] * dims_t[1] * dims_t[2] + idx_t[1] * dims_t[2] + idx_t[2]] =
-                    data[idx[0] * dims[1] * dims[2] + idx[1] * dims[2] + idx[2]];
-            }
-        }
-    }
-}
-
-} // namespace nnet
-
-#endif
+#ifndef NNET_ARRAY_H_
+#define NNET_ARRAY_H_
+
+#include <math.h>
+
+namespace nnet {
+
+struct transpose_config {
+    static const unsigned height = 10;
+    static const unsigned width = 10;
+    static const unsigned depth = 10;
+    static constexpr unsigned perm[3] = {2, 0, 1};
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void transpose_2d(data_T data[CONFIG_T::height * CONFIG_T::width], res_T data_t[CONFIG_T::height * CONFIG_T::width]) {
+    #pragma HLS PIPELINE
+
+    for (int i = 0; i < CONFIG_T::height; i++) {
+        for (int j = 0; j < CONFIG_T::width; j++) {
+            data_t[j * CONFIG_T::height + i] = data[i * CONFIG_T::width + j];
+        }
+    }
+}
+
+template <class data_T, class res_T, typename CONFIG_T>
+void transpose_3d(data_T data[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width],
+                  res_T data_t[CONFIG_T::depth * CONFIG_T::height * CONFIG_T::width]) {
+    unsigned dims[3] = {CONFIG_T::depth, CONFIG_T::height, CONFIG_T::width};
+    unsigned dims_t[3];
+    dims_t[0] = dims[CONFIG_T::perm[0]];
+    dims_t[1] = dims[CONFIG_T::perm[1]];
+    dims_t[2] = dims[CONFIG_T::perm[2]];
+
+    int idx[3] = {0}, idx_t[3] = {0};
+    for (idx[0] = 0; idx[0] < dims[0]; idx[0]++) {
+        for (idx[1] = 0; idx[1] < dims[1]; idx[1]++) {
+            for (idx[2] = 0; idx[2] < dims[2]; idx[2]++) {
+                idx_t[0] = idx[CONFIG_T::perm[0]];
+                idx_t[1] = idx[CONFIG_T::perm[1]];
+                idx_t[2] = idx[CONFIG_T::perm[2]];
+
+                data_t[idx_t[0] * dims_t[1] * dims_t[2] + idx_t[1] * dims_t[2] + idx_t[2]] =
+                    data[idx[0] * dims[1] * dims[2] + idx[1] * dims[2] + idx[2]];
+            }
+        }
+    }
+}
+
+} // namespace nnet
+
+#endif
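The transpose templates above reorder a flattened, row-major array. A minimal
usage sketch for transpose_2d (editor's illustration; the config struct and
values are hypothetical, not part of the patch):

#include "nnet_array.h"

// Hypothetical 2x3 -> 3x2 transpose configuration.
struct my_transpose_config : nnet::transpose_config {
    static const unsigned height = 2;
    static const unsigned width = 3;
};

void transpose_example() {
    float in[6] = {1, 2, 3, 4, 5, 6}; // row-major 2x3 input
    float out[6];                     // row-major 3x2 result
    nnet::transpose_2d<float, float, my_transpose_config>(in, out);
    // out now holds {1, 4, 2, 5, 3, 6}
}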
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h index 3a029fe860..d8be45b73e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm.h @@ -1,124 +1,124 @@ -#ifndef NNET_BATCHNORM_H_ -#define NNET_BATCHNORM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_dense.h" -#include - -namespace nnet { - -struct batchnorm_config { - // Internal data type definitions - typedef float bias_t; - typedef float scale_t; - - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_filt = -1; - static const unsigned n_scale_bias = 10; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; - // partitioning arrays cyclically to go with roll factors? - template using product = nnet::product::mult; -}; - -template -void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], - typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], - typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { - data_T cache; - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=scale,bias - - // For parallel inputs: - // - completely partition arrays -- target fabric - // - if we have an unroll factor, limit number of multipliers - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes - #pragma HLS ARRAY_PARTITION variable=scale complete - #pragma HLS ARRAY_PARTITION variable=bias complete - - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - -// Calcuate result -Result: - for (int ires = 0; ires < CONFIG_T::n_in; ires++) { - if (CONFIG_T::n_filt == -1) { - res[ires] = CONFIG_T::template product::product(data[ires], scale[ires]) + - bias[ires]; - } else { - int norm_index = ires % CONFIG_T::n_filt; - res[ires] = - CONFIG_T::template product::product(data[ires], scale[norm_index]) + - bias[norm_index]; - } - } -} - -// **************************************************** -// Merged Batch Normalization and Quantized Tanh -// **************************************************** -struct batchnorm_quantized_tanh_config { - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_filt = -1; - static const unsigned n_scale_bias = 10; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const unsigned n_zeros = 0; -}; - -template -void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ap_uint<1> res[CONFIG_T::n_in], - data_T threshold[CONFIG_T::n_scale_bias]) { - #pragma HLS PIPELINE - #pragma HLS ARRAY_PARTITION variable=res complete - - data_T datareg; - ap_uint<1> cache; - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - datareg = data[ii]; - int norm_index = CONFIG_T::n_filt == -1 ? 
ii : ii % CONFIG_T::n_filt;
-        if (datareg >= threshold[norm_index])
-            cache = 1;
-        else
-            cache = 0;
-
-        res[ii] = cache;
-    }
-}
-
-template <class data_T, typename CONFIG_T>
-void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ap_int<2> res[CONFIG_T::n_in],
-                            data_T threshold_hi[CONFIG_T::n_scale_bias], data_T threshold_lo[CONFIG_T::n_scale_bias]) {
-    #pragma HLS PIPELINE
-    #pragma HLS ARRAY_PARTITION variable=res complete
-
-    data_T datareg;
-    ap_int<2> cache;
-    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
-        datareg = data[ii];
-        int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt;
-        if (datareg > threshold_hi[norm_index])
-            cache = 1;
-        else if (datareg <= threshold_lo[norm_index])
-            cache = -1;
-        else
-            cache = 0;
-
-        res[ii] = cache;
-    }
-}
-
-} // namespace nnet
-
-#endif
+#ifndef NNET_BATCHNORM_H_
+#define NNET_BATCHNORM_H_
+
+#include "hls_stream.h"
+#include "nnet_common.h"
+#include "nnet_dense.h"
+#include <math.h>
+
+namespace nnet {
+
+struct batchnorm_config {
+    // Internal data type definitions
+    typedef float bias_t;
+    typedef float scale_t;
+
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const bool store_weights_in_bram = false;
+    static const unsigned n_zeros = 0;
+    // partitioning arrays cyclically to go with roll factors?
+    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
+};
+
+template <class data_T, class res_T, typename CONFIG_T>
+void normalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in],
+               typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias],
+               typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) {
+    data_T cache;
+
+    // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases
+    #pragma HLS function_instantiate variable=scale,bias
+
+    // For parallel inputs:
+    //   - completely partition arrays -- target fabric
+    //   - if we have an unroll factor, limit number of multipliers
+    #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
+
+    // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes
+    #pragma HLS ARRAY_PARTITION variable=scale complete
+    #pragma HLS ARRAY_PARTITION variable=bias complete
+
+    #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit
+
+// Calculate result
+Result:
+    for (int ires = 0; ires < CONFIG_T::n_in; ires++) {
+        if (CONFIG_T::n_filt == -1) {
+            res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[ires]) +
+                        bias[ires];
+        } else {
+            int norm_index = ires % CONFIG_T::n_filt;
+            res[ires] =
+                CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[norm_index]) +
+                bias[norm_index];
+        }
+    }
+}
+
+// ****************************************************
+// Merged Batch Normalization and Quantized Tanh
+// ****************************************************
+struct batchnorm_quantized_tanh_config {
+    // Layer Sizes
+    static const unsigned n_in = 10;
+    static const unsigned n_filt = -1;
+    static const unsigned n_scale_bias = 10;
+
+    // Resource reuse info
+    static const unsigned io_type = io_parallel;
+    static const unsigned reuse_factor = 1;
+    static const unsigned n_zeros = 0;
+};
+
+template <class data_T, typename CONFIG_T>
+void normalize_binary_tanh(data_T data[CONFIG_T::n_in], ap_uint<1> res[CONFIG_T::n_in],
+                           data_T threshold[CONFIG_T::n_scale_bias]) {
+    #pragma HLS PIPELINE
+    #pragma HLS ARRAY_PARTITION variable=res complete
+
+    data_T datareg;
+    ap_uint<1> cache;
+    for (int ii = 0; ii < 
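// Editor's note (derivation, not in the original file): batch norm followed
// by a binary tanh computes sign(scale * x + bias), so for a positive scale
// the pair folds into the single comparison
//     x >= threshold,  with threshold = -bias / scale;
// e.g. scale = 2, bias = -1 gives threshold = 0.5. The threshold array is
// precomputed offline, which is why no multiply appears in this loop.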
CONFIG_T::n_in; ii++) { + datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg >= threshold[norm_index]) + cache = 1; + else + cache = 0; + + res[ii] = cache; + } +} + +template +void normalize_ternary_tanh(data_T data[CONFIG_T::n_in], ap_int<2> res[CONFIG_T::n_in], + data_T threshold_hi[CONFIG_T::n_scale_bias], data_T threshold_lo[CONFIG_T::n_scale_bias]) { + #pragma HLS PIPELINE + #pragma HLS ARRAY_PARTITION variable=res complete + + data_T datareg; + ap_int<2> cache; + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + datareg = data[ii]; + int norm_index = CONFIG_T::n_filt == -1 ? ii : ii % CONFIG_T::n_filt; + if (datareg > threshold_hi[norm_index]) + cache = 1; + else if (datareg <= threshold_lo[norm_index]) + cache = -1; + else + cache = 0; + + res[ii] = cache; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h index 0cd9565fb5..a064677d06 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_batchnorm_stream.h @@ -1,123 +1,123 @@ -#ifndef NNET_BATCHNORM_STREAM_H_ -#define NNET_BATCHNORM_STREAM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_mult.h" -#include "nnet_types.h" - -namespace nnet { - -// **************************************************** -// Streaming Batch Normalization -// **************************************************** - -template -void normalize(hls::stream &data, hls::stream &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], - typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { - #pragma HLS ARRAY_PARTITION variable=scale complete - #pragma HLS ARRAY_PARTITION variable=bias complete - - constexpr unsigned ii = CONFIG_T::n_in / CONFIG_T::multiplier_limit; - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - -BatchNormLoop: - for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { - #pragma HLS PIPELINE II=ii - - data_T in_data = data.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - BatchNormpack: - for (int j = 0; j < data_T::size; j++) { - #pragma HLS UNROLL - int norm_index; - if (CONFIG_T::n_filt == -1) { - norm_index = i * data_T::size + j; - } else { - norm_index = j % CONFIG_T::n_filt; - } - out_data[j] = CONFIG_T::template product::product( - in_data[j], scale[norm_index]) + - bias[norm_index]; - } - - res.write(out_data); - } -} - -// **************************************************** -// Merged Batch Normalization and Quantized Tanh -// **************************************************** -template -void normalize_binary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, - typename data_T::value_type threshold[CONFIG_T::n_scale_bias]) { - #pragma HLS ARRAY_PARTITION variable=threshold complete - -BinaryNormLoop: - for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { - #pragma HLS PIPELINE - - data_T in_data = data.read(); - nnet::array, CONFIG_T::n_scale_bias> out_data; - PRAGMA_DATA_PACK(out_data) - - BatchNormPack: - for (int j = 0; j < data_T::size; j++) { - #pragma HLS UNROLL - int norm_index; - if (CONFIG_T::n_filt == -1) { - norm_index = i * data_T::size + j; - } else { - norm_index = j % CONFIG_T::n_filt; - } - out_data[j] = (in_data[j] >= threshold[norm_index]) ? 
1 : 0; - } - - res.write(out_data); - } -} - -template -void normalize_ternary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, - typename data_T::value_type threshold_hi[CONFIG_T::n_scale_bias], - typename data_T::value_type threshold_lo[CONFIG_T::n_scale_bias]) { - #pragma HLS ARRAY_PARTITION variable=threshold_hi complete - #pragma HLS ARRAY_PARTITION variable=threshold_lo complete - -TernaryNormLoop: - for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { - #pragma HLS PIPELINE - - data_T in_data = data.read(); - nnet::array, CONFIG_T::n_scale_bias> out_data; - PRAGMA_DATA_PACK(out_data) - - BatchNormPack: - for (int j = 0; j < data_T::size; j++) { - #pragma HLS UNROLL - - int norm_index; - if (CONFIG_T::n_filt == -1) { - norm_index = i * data_T::size + j; - } else { - norm_index = j % CONFIG_T::n_filt; - } - - if (in_data[j] > threshold_hi[norm_index]) { - out_data[j] = 1; - } else if (in_data[j] <= threshold_lo[norm_index]) { - out_data[j] = -1; - } else { - out_data[j] = 0; - } - } - - res.write(out_data); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_BATCHNORM_STREAM_H_ +#define NNET_BATCHNORM_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include "nnet_types.h" + +namespace nnet { + +// **************************************************** +// Streaming Batch Normalization +// **************************************************** + +template +void normalize(hls::stream &data, hls::stream &res, typename CONFIG_T::scale_t scale[CONFIG_T::n_scale_bias], + typename CONFIG_T::bias_t bias[CONFIG_T::n_scale_bias]) { + #pragma HLS ARRAY_PARTITION variable=scale complete + #pragma HLS ARRAY_PARTITION variable=bias complete + + constexpr unsigned ii = CONFIG_T::n_in / CONFIG_T::multiplier_limit; + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +BatchNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE II=ii + + data_T in_data = data.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormpack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + out_data[j] = CONFIG_T::template product::product( + in_data[j], scale[norm_index]) + + bias[norm_index]; + } + + res.write(out_data); + } +} + +// **************************************************** +// Merged Batch Normalization and Quantized Tanh +// **************************************************** +template +void normalize_binary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, + typename data_T::value_type threshold[CONFIG_T::n_scale_bias]) { + #pragma HLS ARRAY_PARTITION variable=threshold complete + +BinaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + out_data[j] = (in_data[j] >= threshold[norm_index]) ? 
1 : 0; + } + + res.write(out_data); + } +} + +template +void normalize_ternary_tanh(hls::stream &data, hls::stream, CONFIG_T::n_scale_bias>> &res, + typename data_T::value_type threshold_hi[CONFIG_T::n_scale_bias], + typename data_T::value_type threshold_lo[CONFIG_T::n_scale_bias]) { + #pragma HLS ARRAY_PARTITION variable=threshold_hi complete + #pragma HLS ARRAY_PARTITION variable=threshold_lo complete + +TernaryNormLoop: + for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) { + #pragma HLS PIPELINE + + data_T in_data = data.read(); + nnet::array, CONFIG_T::n_scale_bias> out_data; + PRAGMA_DATA_PACK(out_data) + + BatchNormPack: + for (int j = 0; j < data_T::size; j++) { + #pragma HLS UNROLL + + int norm_index; + if (CONFIG_T::n_filt == -1) { + norm_index = i * data_T::size + j; + } else { + norm_index = j % CONFIG_T::n_filt; + } + + if (in_data[j] > threshold_hi[norm_index]) { + out_data[j] = 1; + } else if (in_data[j] <= threshold_lo[norm_index]) { + out_data[j] = -1; + } else { + out_data[j] = 0; + } + } + + res.write(out_data); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_common.h b/hls4ml/templates/vivado/nnet_utils/nnet_common.h index 7a65548bed..fed0395a1a 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_common.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_common.h @@ -1,75 +1,75 @@ -#ifndef NNET_COMMON_H_ -#define NNET_COMMON_H_ - -#include "ap_fixed.h" - -// This is a substitute for "ceil(n/(float)d)". -#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) -#define MIN(n, d) (n > d ? d : n) -#define MAX(n, d) (n > d ? n : d) - -#define STRINGIFY(x) #x -#define EXPAND_STRING(x) STRINGIFY(x) - -#ifndef __VITIS_HLS__ -#define DATA_PACK_TXT HLS DATA_PACK variable = -#define DATA_PACK_PRAGMA(variable) DATA_PACK_TXT variable -#define PRAGMA_DATA_PACK(variable) _Pragma(EXPAND_STRING(DATA_PACK_PRAGMA(variable))) -#else -#define PRAGMA_DATA_PACK(variable) -#endif - -namespace nnet { - -// Common type definitions -enum io_type { io_parallel = 0, io_stream }; -enum strategy { latency, resource }; - -/* --- - * Balanced tree reduce implementation. - * For use in scenarios where Vivado cannot expression balance - * Reduces an array of inputs to a single value using the template binary operator 'Op', - * for example summing all elements with Op_add, or finding the maximum with Op_max - * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section - * before applying and accumulate the result over the rolled dimension. - * --- */ -template T reduce(const T *x, Op op) { - static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0; - static constexpr int rightN = N - leftN > 0 ? N - leftN : 0; - if (N == 1) { - return x[0]; - } - if (N == 2) { - return op(x[0], x[1]); - } - return op(reduce(x, op), reduce(x + leftN, op)); -} - -template class Op_add { - public: - T operator()(T a, T b) { return a + b; } -}; - -template class Op_and { - public: - T operator()(T a, T b) { return a && b; } -}; - -template class Op_or { - public: - T operator()(T a, T b) { return a || b; } -}; - -template class Op_max { - public: - T operator()(T a, T b) { return a >= b ? a : b; } -}; - -template class Op_min { - public: - T operator()(T a, T b) { return a <= b ? a : b; } -}; - -} // namespace nnet - -#endif +#ifndef NNET_COMMON_H_ +#define NNET_COMMON_H_ + +#include "ap_fixed.h" + +// This is a substitute for "ceil(n/(float)d)". +#define DIV_ROUNDUP(n, d) ((n + d - 1) / d) +#define MIN(n, d) (n > d ? 
d : n)
+#define MAX(n, d) (n > d ? n : d)
+
+#define STRINGIFY(x) #x
+#define EXPAND_STRING(x) STRINGIFY(x)
+
+#ifndef __VITIS_HLS__
+#define DATA_PACK_TXT HLS DATA_PACK variable =
+#define DATA_PACK_PRAGMA(variable) DATA_PACK_TXT variable
+#define PRAGMA_DATA_PACK(variable) _Pragma(EXPAND_STRING(DATA_PACK_PRAGMA(variable)))
+#else
+#define PRAGMA_DATA_PACK(variable)
+#endif
+
+namespace nnet {
+
+// Common type definitions
+enum io_type { io_parallel = 0, io_stream };
+enum strategy { latency, resource };
+
+/* ---
+ * Balanced tree reduce implementation.
+ * For use in scenarios where Vivado cannot balance the expression tree itself.
+ * Reduces an array of inputs to a single value using the template binary operator 'Op',
+ * for example summing all elements with Op_add, or finding the maximum with Op_max.
+ * Use only when the input array is fully unrolled. Or, slice out a fully unrolled section
+ * before applying and accumulate the result over the rolled dimension.
+ * --- */
+template <class T, int N, class Op> T reduce(const T *x, Op op) {
+    static constexpr int leftN = pow2(floorlog2(N - 1)) > 0 ? pow2(floorlog2(N - 1)) : 0;
+    static constexpr int rightN = N - leftN > 0 ? N - leftN : 0;
+    if (N == 1) {
+        return x[0];
+    }
+    if (N == 2) {
+        return op(x[0], x[1]);
+    }
+    return op(reduce<T, leftN, Op>(x, op), reduce<T, rightN, Op>(x + leftN, op));
+}
+
+template <class T> class Op_add {
+  public:
+    T operator()(T a, T b) { return a + b; }
+};
+
+template <class T> class Op_and {
+  public:
+    T operator()(T a, T b) { return a && b; }
+};
+
+template <class T> class Op_or {
+  public:
+    T operator()(T a, T b) { return a || b; }
+};
+
+template <class T> class Op_max {
+  public:
+    T operator()(T a, T b) { return a >= b ? a : b; }
+};
+
+template <class T> class Op_min {
+  public:
+    T operator()(T a, T b) { return a <= b ? a : b; }
+};
+
+} // namespace nnet
+
+#endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
index 56617a4159..e2e0211b49 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
@@ -1,66 +1,66 @@
-#ifndef NNET_CONV1D_H_
-#define NNET_CONV1D_H_
-
-#include "nnet_common.h"
-#include "nnet_conv1d_latency.h"
-#include "nnet_conv1d_resource.h"
-#include <cstdlib>
-
-namespace nnet {
-
-struct conv1d_config {
-    // Internal data type definitions
-    typedef float bias_t;
-    typedef float weight_t;
-    typedef float accum_t;
-
-    // Convolutional parameters
-    static const unsigned pad_left = 0;
-    static const unsigned pad_right = 0;
-    static const unsigned in_width = 10;
-    static const unsigned n_chan = 0;
-    static const unsigned filt_width = 1;
-    static const unsigned kernel_size = filt_width;
-    static const unsigned n_filt = 1;
-    static const unsigned stride_width = 1;
-    static const unsigned dilation = 1;
-    static const unsigned out_width = 10; //(N_IN + PAD_LEFT * PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1
-
-    static const unsigned reuse_factor = 1;
-    static const bool store_weights_in_bram = false;
-    static const unsigned n_zeros = 0; // not used yet
-};
-
-template <class data_T, class res_T, typename CONFIG_T>
-void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
-                typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
-                typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-    #pragma HLS INLINE region
-
-    if (CONFIG_T::strategy == nnet::latency) {
-        conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    } else {
-        conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    }
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void pointwise_conv_1d_cl(data_T 
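// Editor's note: a usage sketch for the nnet::reduce template defined above
// (types and array contents are hypothetical):
//
//   ap_fixed<16, 6> terms[8];
//   // ... fill terms; the array must be fully partitioned/unrolled ...
//   ap_fixed<16, 6> total =
//       nnet::reduce<ap_fixed<16, 6>, 8, nnet::Op_add<ap_fixed<16, 6> > >(
//           terms, nnet::Op_add<ap_fixed<16, 6> >());
//
// This expands to a depth-3 balanced adder tree rather than a serial chain
// of 7 additions, shortening the critical path after unrolling.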
data[CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - - #pragma HLS INLINE region - - // Nothing special to be done for io_parallel implementation - if (CONFIG_T::strategy == nnet::latency) { - conv_1d_latency_cl(data, res, weights, biases); - } else { - conv_1d_resource_cl(data, res, weights, biases); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_CONV1D_H_ +#define NNET_CONV1D_H_ + +#include "nnet_common.h" +#include "nnet_conv1d_latency.h" +#include "nnet_conv1d_resource.h" +#include + +namespace nnet { + +struct conv1d_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Convolutional parameters + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_width = 10; + static const unsigned n_chan = 0; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_width = 1; + static const unsigned dilation = 1; + static const unsigned out_width = 10; //(N_IN + PAD_LEFT * PAD_RIGHT - (DILATION * (FILT_WIDTH - 1) + 1)) / STRIDE + 1 + + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; // not used yet +}; + +template +void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE region + + if (CONFIG_T::strategy == nnet::latency) { + conv_1d_latency_cl(data, res, weights, biases); + } else { + conv_1d_resource_cl(data, res, weights, biases); + } +} + +template +void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + #pragma HLS INLINE region + + // Nothing special to be done for io_parallel implementation + if (CONFIG_T::strategy == nnet::latency) { + conv_1d_latency_cl(data, res, weights, biases); + } else { + conv_1d_resource_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h index 431d3aa28d..b23c330c78 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_stream.h @@ -1,89 +1,89 @@ -#ifndef NNET_CONV1D_STREAM_H_ -#define NNET_CONV1D_STREAM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_conv_stream.h" - -namespace nnet { - -template -void compute_scaled_indices_1d(const unsigned w_idx, ap_uint *pixel_idx) { - unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); - -ComputeIndex: - for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { - #pragma HLS UNROLL - unsigned sw_idx = - CONFIG_T::template scale_index::scale_index( - wp_idx + p); - pixel_idx[p] = CONFIG_T::pixels[sw_idx]; - } -} - -template -void conv_1d_encoded_cl(hls::stream &data, hls::stream &res, - 
typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); - - hls::stream data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; - const int win_depth = CONFIG_T::out_width; - for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { - #pragma HLS STREAM variable=data_window[i_out] depth=win_depth - } - - #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete - - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - unsigned outputs_ready = 0; - - ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; - #pragma HLS ARRAY_PARTITION variable=pixel_idx complete - -ReadInputWidth: - for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { - #pragma HLS LOOP_FLATTEN - if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - } - compute_scaled_indices_1d(i_iw, pixel_idx); - compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, - biases, pixel_idx); - } -} - -template -void conv_1d_buffer_cl(hls::stream &data, hls::stream &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); - -ReadInputWidth: - for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { - #pragma HLS LOOP_FLATTEN - if (CONFIG_T::strategy == nnet::latency) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - } - compute_output_buffer_1d(data.read(), res, weights, biases); - } -} - -template -void conv_1d_cl(hls::stream &data, hls::stream &res, - typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS inline recursive - switch (CONFIG_T::implementation) { - case conv_implementation::linebuffer: - conv_1d_buffer_cl(data, res, weights, biases); - break; - case conv_implementation::encoded: - conv_1d_encoded_cl(data, res, weights, biases); - break; - } -} - -} // namespace nnet -#endif +#ifndef NNET_CONV1D_STREAM_H_ +#define NNET_CONV1D_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_conv_stream.h" + +namespace nnet { + +template +void compute_scaled_indices_1d(const unsigned w_idx, ap_uint *pixel_idx) { + unsigned wp_idx = w_idx * (data_T::size / CONFIG_T::n_chan); + +ComputeIndex: + for (unsigned p = 0; p < data_T::size / CONFIG_T::n_chan; p++) { + #pragma HLS UNROLL + unsigned sw_idx = + CONFIG_T::template scale_index::scale_index( + wp_idx + p); + pixel_idx[p] = CONFIG_T::pixels[sw_idx]; + } +} + +template +void conv_1d_encoded_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + + hls::stream data_window[CONFIG_T::filt_width * CONFIG_T::n_chan]; + const int win_depth = CONFIG_T::out_width; + for (unsigned i_out = 0; i_out < CONFIG_T::filt_width * CONFIG_T::n_chan; i_out++) { + #pragma HLS STREAM variable=data_window[i_out] depth=win_depth + } + + #pragma HLS ARRAY_PARTITION variable=CONFIG_T::pixels complete + + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + unsigned outputs_ready = 0; + + 
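// Editor's note: data_window above provides one FIFO per (filter tap,
// channel) pair, each sized win_depth = out_width; incoming pixels are
// broadcast into the taps selected by the precomputed per-pixel codes
// (pixel_idx below), so the kernel never re-reads the input stream.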
ap_uint pixel_idx[data_T::size / CONFIG_T::n_chan]; + #pragma HLS ARRAY_PARTITION variable=pixel_idx complete + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width / (data_T::size / CONFIG_T::n_chan); i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency && data_T::size / CONFIG_T::n_chan == 1) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_scaled_indices_1d(i_iw, pixel_idx); + compute_output_encoded(data.read(), data_window, res, res_pack, outputs_ready, weights, + biases, pixel_idx); + } +} + +template +void conv_1d_buffer_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + +ReadInputWidth: + for (unsigned i_iw = 0; i_iw < CONFIG_T::in_width; i_iw++) { + #pragma HLS LOOP_FLATTEN + if (CONFIG_T::strategy == nnet::latency) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + } + compute_output_buffer_1d(data.read(), res, weights, biases); + } +} + +template +void conv_1d_cl(hls::stream &data, hls::stream &res, + typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS inline recursive + switch (CONFIG_T::implementation) { + case conv_implementation::linebuffer: + conv_1d_buffer_cl(data, res, weights, biases); + break; + case conv_implementation::encoded: + conv_1d_encoded_cl(data, res, weights, biases); + break; + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h index c7c4158c6a..71a88f4483 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d.h @@ -1,75 +1,75 @@ -#ifndef NNET_CONV2D_H_ -#define NNET_CONV2D_H_ - -#include "nnet_common.h" -#include "nnet_conv2d_latency.h" -#include "nnet_conv2d_resource.h" -#include - -namespace nnet { - -struct conv2d_config { - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; - - // Convolutional parameters - static const unsigned pad_top = 0; - static const unsigned pad_bottom = 0; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const unsigned in_height = 10; - static const unsigned in_width = 10; - static const unsigned n_chan = 1; - static const unsigned filt_height = 1; - static const unsigned filt_width = 1; - static const unsigned kernel_size = filt_height * filt_width; - static const unsigned n_filt = 1; - static const unsigned stride_height = 1; - static const unsigned stride_width = 1; - static const unsigned out_height = 10; - static const unsigned out_width = 10; - static const unsigned dilation_height = 1; - static const unsigned dilation_width = 1; - - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const unsigned n_zeros = 0; // not used yet -}; - -template -void conv_2d_cl( - data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - #pragma HLS INLINE region - - if (CONFIG_T::strategy == 
nnet::latency) { - conv_2d_latency_cl(data, res, weights, biases); - } else { - conv_2d_resource_cl(data, res, weights, biases); - } -} - -template -void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - assert(CONFIG_T::filt_width == 1); - - #pragma HLS INLINE region - - // Nothing special to be done for io_parallel implementation - if (CONFIG_T::strategy == nnet::latency) { - conv_2d_latency_cl(data, res, weights, biases); - } else { - conv_2d_resource_cl(data, res, weights, biases); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_CONV2D_H_ +#define NNET_CONV2D_H_ + +#include "nnet_common.h" +#include "nnet_conv2d_latency.h" +#include "nnet_conv2d_resource.h" +#include + +namespace nnet { + +struct conv2d_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Convolutional parameters + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_chan = 1; + static const unsigned filt_height = 1; + static const unsigned filt_width = 1; + static const unsigned kernel_size = filt_height * filt_width; + static const unsigned n_filt = 1; + static const unsigned stride_height = 1; + static const unsigned stride_width = 1; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + static const unsigned dilation_height = 1; + static const unsigned dilation_width = 1; + + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; // not used yet +}; + +template +void conv_2d_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + #pragma HLS INLINE region + + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_latency_cl(data, res, weights, biases); + } else { + conv_2d_resource_cl(data, res, weights, biases); + } +} + +template +void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + assert(CONFIG_T::filt_width == 1); + + #pragma HLS INLINE region + + // Nothing special to be done for io_parallel implementation + if (CONFIG_T::strategy == nnet::latency) { + conv_2d_latency_cl(data, res, weights, biases); + } else { + conv_2d_resource_cl(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h index 3a481711db..5114af7825 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv2d_latency.h @@ -1,89 +1,89 @@ -#ifndef NNET_CONV2D_LATENCY_H_ -#define 
NNET_CONV2D_LATENCY_H_ - -#include "nnet_common.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -template -void conv_2d_latency_cl( - data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], - typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], - typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { - constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; - constexpr unsigned mult_n_out = CONFIG_T::n_filt; - - data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; - #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 - - typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out]; - #pragma HLS ARRAY_PARTITION variable=mult complete - - typename CONFIG_T::accum_t acc[mult_n_out]; - #pragma HLS ARRAY_PARTITION variable=acc complete - - #pragma HLS ARRAY_PARTITION variable=weights complete - #pragma HLS ARRAY_PARTITION variable=biases complete - - // Limit multipliers to control parallelization - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit - -PartitionLoop: - for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind - - CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); - - PixelLoop: - for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { - #pragma HLS UNROLL - - data_T cache; - - // Do the matrix-multiply - Product1: - for (int i_in = 0; i_in < mult_n_in; i_in++) { - #pragma HLS UNROLL - cache = data_buf[i_pxl][i_in]; - Product2: - for (int i_out = 0; i_out < mult_n_out; i_out++) { - #pragma HLS UNROLL - mult[i_in * mult_n_out + i_out] = - CONFIG_T::mult_config::template product::product( - cache, weights[i_in * mult_n_out + i_out]); - } - } - - // Initialize accumulator with input biases - ResetAccum: - for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { - #pragma HLS UNROLL - acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; - } - - // Accumulate multiplication result - Accum1: - for (int i_in = 0; i_in < mult_n_in; i_in++) { - #pragma HLS UNROLL - Accum2: - for (int i_out = 0; i_out < mult_n_out; i_out++) { - #pragma HLS UNROLL - acc[i_out] += mult[i_in * mult_n_out + i_out]; - } - } - - // Cast to "res_t" type - Result: - for (int i_res = 0; i_res < mult_n_out; i_res++) { - #pragma HLS UNROLL - *(res++) = cast(acc[i_res]); - } - } - } -} - -} // namespace nnet -#endif +#ifndef NNET_CONV2D_LATENCY_H_ +#define NNET_CONV2D_LATENCY_H_ + +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void conv_2d_latency_cl( + data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_chan], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt], + typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt], + typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) { + constexpr unsigned mult_n_in = CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan; + constexpr unsigned mult_n_out = CONFIG_T::n_filt; + + data_T data_buf[CONFIG_T::n_pixels][mult_n_in]; + #pragma HLS ARRAY_PARTITION variable=data_buf complete dim=0 + + typename CONFIG_T::accum_t mult[mult_n_in * mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + typename CONFIG_T::accum_t acc[mult_n_out]; + #pragma HLS ARRAY_PARTITION variable=acc 
complete + + #pragma HLS ARRAY_PARTITION variable=weights complete + #pragma HLS ARRAY_PARTITION variable=biases complete + + // Limit multipliers to control parallelization + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::mult_config::multiplier_limit + +PartitionLoop: + for (int i_part = 0; i_part < CONFIG_T::n_partitions; i_part++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor rewind + + CONFIG_T::template fill_buffer::fill_buffer(data, data_buf, i_part); + + PixelLoop: + for (unsigned i_pxl = 0; i_pxl < CONFIG_T::n_pixels; i_pxl++) { + #pragma HLS UNROLL + + data_T cache; + + // Do the matrix-multiply + Product1: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + #pragma HLS UNROLL + cache = data_buf[i_pxl][i_in]; + Product2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + #pragma HLS UNROLL + mult[i_in * mult_n_out + i_out] = + CONFIG_T::mult_config::template product::product( + cache, weights[i_in * mult_n_out + i_out]); + } + } + + // Initialize accumulator with input biases + ResetAccum: + for (int i_acc = 0; i_acc < mult_n_out; i_acc++) { + #pragma HLS UNROLL + acc[i_acc] = (typename CONFIG_T::accum_t)biases[i_acc]; + } + + // Accumulate multiplication result + Accum1: + for (int i_in = 0; i_in < mult_n_in; i_in++) { + #pragma HLS UNROLL + Accum2: + for (int i_out = 0; i_out < mult_n_out; i_out++) { + #pragma HLS UNROLL + acc[i_out] += mult[i_in * mult_n_out + i_out]; + } + } + + // Cast to "res_t" type + Result: + for (int i_res = 0; i_res < mult_n_out; i_res++) { + #pragma HLS UNROLL + *(res++) = cast(acc[i_res]); + } + } + } +} + +} // namespace nnet +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h index 5494ab4e36..029b74803b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_compressed.h @@ -1,90 +1,90 @@ -#ifndef NNET_COMPRESSED_LAYER_H_ -#define NNET_COMPRESSED_LAYER_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_dense.h" -#include - -namespace nnet { - -template -void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], - typename CONFIG_T::accum_t weight) { - for (unsigned k = 0; k < CONFIG_T::n_out; k++) { - #pragma HLS UNROLL - if (k == index) - mult[k] += weight; - } -} - -template -void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor); - - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - #pragma HLS ARRAY_PARTITION variable=acc complete - #pragma HLS ARRAY_PARTITION variable=biases complete - #pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit - -#ifdef __VITIS_HLS__ - #pragma HLS AGGREGATE variable=weights -#else - #pragma HLS data_pack variable=weights struct_level -#endif - -InitAccum: - for (unsigned i = 0; i < CONFIG_T::n_out; i++) { - #pragma HLS UNROLL - acc[i] = (typename CONFIG_T::accum_t)(biases[i]); - } - - // Do the compressed matrix-multiply - const int rufactor = CONFIG_T::reuse_factor; -ReuseLoop: - for (unsigned ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - - typename CONFIG_T::accum_t mult[CONFIG_T::n_out]; - #pragma HLS ARRAY_PARTITION variable=mult complete - - ResetMult: - for (int imult = 0; imult < CONFIG_T::n_out; 
imult++) { - #pragma HLS UNROLL - mult[imult] = 0; - } - - CompressedMultLoop: - for (unsigned im = 0; im < multiplier_limit; im++) { - #pragma HLS UNROLL - unsigned w = im * rufactor + ir; - auto row = weights[w].row_index; - auto col = weights[w].col_index; - auto weight_cache = weights[w].weight; - data_T data_cache = data[row]; - // mult[col] += weight_cache * data_cache; - typename CONFIG_T::accum_t prod = - CONFIG_T::template product::product(data_cache, weight_cache); - fill_mult(col, mult, prod); - } - - for (int im = 0; im < CONFIG_T::n_out; im++) { - acc[im] += mult[im]; - } - } - -// Cast to "res_t" type -ResultLoop: - for (unsigned i = 0; i < CONFIG_T::n_out; i++) { - #pragma HLS UNROLL - // res[i] = (res_T) (acc[i]); - res[i] = cast(acc[i]); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_COMPRESSED_LAYER_H_ +#define NNET_COMPRESSED_LAYER_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include + +namespace nnet { + +template +void fill_mult(typename CONFIG_T::index_t index, typename CONFIG_T::accum_t mult[CONFIG_T::n_out], + typename CONFIG_T::accum_t weight) { + for (unsigned k = 0; k < CONFIG_T::n_out; k++) { + #pragma HLS UNROLL + if (k == index) + mult[k] += weight; + } +} + +template +void dense_compressed(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_nonzeros], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_nonzeros, CONFIG_T::reuse_factor); + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + #pragma HLS ARRAY_PARTITION variable=biases complete + #pragma HLS ARRAY_RESHAPE variable=weights block factor=multiplier_limit + +#ifdef __VITIS_HLS__ + #pragma HLS AGGREGATE variable=weights +#else + #pragma HLS data_pack variable=weights struct_level +#endif + +InitAccum: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + #pragma HLS UNROLL + acc[i] = (typename CONFIG_T::accum_t)(biases[i]); + } + + // Do the compressed matrix-multiply + const int rufactor = CONFIG_T::reuse_factor; +ReuseLoop: + for (unsigned ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + typename CONFIG_T::accum_t mult[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < CONFIG_T::n_out; imult++) { + #pragma HLS UNROLL + mult[imult] = 0; + } + + CompressedMultLoop: + for (unsigned im = 0; im < multiplier_limit; im++) { + #pragma HLS UNROLL + unsigned w = im * rufactor + ir; + auto row = weights[w].row_index; + auto col = weights[w].col_index; + auto weight_cache = weights[w].weight; + data_T data_cache = data[row]; + // mult[col] += weight_cache * data_cache; + typename CONFIG_T::accum_t prod = + CONFIG_T::template product::product(data_cache, weight_cache); + fill_mult(col, mult, prod); + } + + for (int im = 0; im < CONFIG_T::n_out; im++) { + acc[im] += mult[im]; + } + } + +// Cast to "res_t" type +ResultLoop: + for (unsigned i = 0; i < CONFIG_T::n_out; i++) { + #pragma HLS UNROLL + // res[i] = (res_T) (acc[i]); + res[i] = cast(acc[i]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h index c31958d3e5..02802c45a9 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_latency.h @@ -1,72 +1,72 @@ -#ifndef NNET_DENSE_LATENCY_H_ 
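// [Editor's note — illustrative sketch, not part of the patch.] The hunk above
// (nnet_dense_compressed.h) multiplies only the stored nonzeros: each weight
// carries its (row_index, col_index) coordinates, and fill_mult() scatters the
// product into the matching output accumulator via a one-hot compare so the
// adder network stays static across reuse iterations. A minimal behavioural
// model, assuming plain float types and a hypothetical CompressedWeight struct
// standing in for CONFIG_T::weight_t, kept inside #if 0 so it cannot affect
// the surrounding header:
#if 0
#include <cstddef>

struct CompressedWeight {
    unsigned row_index; // input index of the nonzero
    unsigned col_index; // output index of the nonzero
    float weight;       // the nonzero value itself
};

// InitAccum + CompressedMultLoop + ResultLoop with the reuse loop flattened:
// every stored nonzero scatters exactly one product into its output slot.
inline void dense_compressed_model(const float *data, float *res, const CompressedWeight *weights,
                                   const float *biases, std::size_t n_out, std::size_t n_nonzeros) {
    for (std::size_t i = 0; i < n_out; ++i)
        res[i] = biases[i];
    for (std::size_t k = 0; k < n_nonzeros; ++k)
        res[weights[k].col_index] += data[weights[k].row_index] * weights[k].weight;
}
#endif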
-#define NNET_DENSE_LATENCY_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_helpers.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -template -void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - data_T cache; - typename CONFIG_T::accum_t mult[CONFIG_T::n_in * CONFIG_T::n_out]; - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - - // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases - #pragma HLS function_instantiate variable=weights,biases - - // For parallel inputs: - // - completely partition arrays -- target fabric - // - if we have an unroll factor, limit number of multipliers - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes - #pragma HLS ARRAY_PARTITION variable=biases complete - #pragma HLS ARRAY_PARTITION variable=mult complete - #pragma HLS ARRAY_PARTITION variable=acc complete - - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - -// Do the matrix-multiply -Product1: - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - cache = data[ii]; - Product2: - for (int jj = 0; jj < CONFIG_T::n_out; jj++) { - int index = ii * CONFIG_T::n_out + jj; - mult[index] = CONFIG_T::template product::product(cache, weights[index]); - } - } - -// Initialize accumulator with input biases -ResetAccum: - for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -// Accumulate multiplication result -Accum1: - for (int ii = 0; ii < CONFIG_T::n_in; ii++) { - Accum2: - for (int jj = 0; jj < CONFIG_T::n_out; jj++) { - int index = ii * CONFIG_T::n_out + jj; - acc[jj] += mult[index]; - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - // res[ires] = (res_T) (acc[ires]); - res[ires] = cast(acc[ires]); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_DENSE_LATENCY_H_ +#define NNET_DENSE_LATENCY_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +template +void dense_latency(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + data_T cache; + typename CONFIG_T::accum_t mult[CONFIG_T::n_in * CONFIG_T::n_out]; + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + + // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases + #pragma HLS function_instantiate variable=weights,biases + + // For parallel inputs: + // - completely partition arrays -- target fabric + // - if we have an unroll factor, limit number of multipliers + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes + #pragma HLS ARRAY_PARTITION variable=biases complete + #pragma HLS ARRAY_PARTITION variable=mult complete + #pragma HLS ARRAY_PARTITION variable=acc complete + + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + +// Do the matrix-multiply +Product1: + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + cache = data[ii]; + Product2: + for (int jj = 0; jj < 
CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + mult[index] = CONFIG_T::template product::product(cache, weights[index]); + } + } + +// Initialize accumulator with input biases +ResetAccum: + for (int iacc = 0; iacc < CONFIG_T::n_out; iacc++) { + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +// Accumulate multiplication result +Accum1: + for (int ii = 0; ii < CONFIG_T::n_in; ii++) { + Accum2: + for (int jj = 0; jj < CONFIG_T::n_out; jj++) { + int index = ii * CONFIG_T::n_out + jj; + acc[jj] += mult[index]; + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + // res[ires] = (res_T) (acc[ires]); + res[ires] = cast(acc[ires]); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h index 1ff33a34fb..88de94729b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h @@ -1,263 +1,263 @@ -#ifndef NNET_DENSE_RESOURCE_H_ -#define NNET_DENSE_RESOURCE_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_mult.h" -#include -#include - -namespace nnet { - -template -void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - const int rufactor = CONFIG_T::reuse_factor; - const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); - const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); - const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); - const int multscale = multiplier_limit / CONFIG_T::n_out; - const int nin = CONFIG_T::n_in; - const int nout = CONFIG_T::n_out; - - assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); - - #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - #pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - #pragma HLS ARRAY_PARTITION variable=acc complete - -InitAccum: - for (int iacc = 0; iacc < nout; iacc++) { - #pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - - int w_index = ir; - int in_index = ir; - int out_index = 0; - int acc_step = 0; - - MultLoop: - for (int im = 0; im < block_factor; im++) { - #pragma HLS UNROLL - - acc[out_index] += static_cast( - CONFIG_T::template product::product(data[in_index], weights[w_index])); - - // Increment w_index - w_index += rufactor; - // Increment in_index - in_index += rufactor; - if (in_index >= nin) { - in_index = ir; - } - // Increment out_index - if (acc_step + 1 >= multscale) { - acc_step = 0; - out_index++; - } else { - acc_step++; - } - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - #pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - -template -void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T 
res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); - const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); - const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); - const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); - const int multscale = multiplier_limit / CONFIG_T::n_out; - const int nin = CONFIG_T::n_in; - const int nout = CONFIG_T::n_out; - - assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); - - #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - #pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - #pragma HLS ARRAY_PARTITION variable=acc complete - -InitAccum: - for (int iacc = 0; iacc < nout; iacc++) { - #pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - - int w_index; - int in_index = 0; - int out_index; - int outstep = 0; - const int outscale = rufactor / nin; - - int outidx[rufactor]; -IndexLoop: - for (int ir = 0; ir < rufactor; ir++) { - outidx[ir] = outstep; - if ((ir + 1) % nin == 0) { - outstep++; - } - } - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - - w_index = ir; - out_index = outidx[ir] /*outstep*/; - - MultLoop: - for (int im = 0; im < block_factor; im++) { - #pragma HLS UNROLL - acc[out_index] += static_cast( - CONFIG_T::template product::product(data[in_index], weights[w_index])); - - w_index += rufactor; - if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) - break; // check out of bounds - out_index += outscale; - } - - in_index++; - if (in_index >= nin) { - in_index = 0; - // outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. 
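// [Editor's note — illustrative sketch, not part of the patch.] The workaround
// referenced above is IndexLoop: the per-iteration output index is precomputed
// into outidx[] (it equals ir / n_in) instead of being incremented inside the
// II=1 ReuseLoop. A small self-checking model of that index arithmetic, with
// hypothetical sizes chosen so RF > N_IN and RF % N_IN == 0, kept inside #if 0:
#if 0
#include <cassert>

int main() {
    const int nin = 4, nout = 8, rufactor = 8;                       // hypothetical sizes
    const int block_factor = (nin * nout + rufactor - 1) / rufactor; // DIV_ROUNDUP
    const int outscale = rufactor / nin;
    for (int ir = 0; ir < rufactor; ++ir) {
        int out_index = ir / nin; // what IndexLoop precomputes into outidx[ir]
        for (int im = 0; im < block_factor; ++im) {
            int w_index = ir + rufactor * im;
            if (w_index >= nin * nout)
                break;                          // same out-of-bounds guard as MultLoop
            assert(out_index == w_index / nin); // each weight lands on one output
            out_index += outscale;
        }
    }
    return 0;
}
#endif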
- } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - #pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - -template -void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - const int rufactor = CONFIG_T::reuse_factor; - const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); - const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); - const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); - const int multscale = multiplier_limit / CONFIG_T::n_out; - const int nin = CONFIG_T::n_in; - const int nout = CONFIG_T::n_out; - - assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); - assert((rufactor > nin) && "This function is correct only for RF > N_IN"); - - #pragma HLS function_instantiate variable=weights,biases - //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly - #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor - #pragma HLS ARRAY_PARTITION variable=biases complete - - typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; - #pragma HLS ARRAY_PARTITION variable=acc complete - -InitAccum: - for (int iacc = 0; iacc < nout; iacc++) { - #pragma HLS UNROLL - acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; - } - -ReuseLoop: - for (int ir = 0; ir < rufactor; ir++) { - #pragma HLS PIPELINE II=1 rewind - typename CONFIG_T::accum_t tmpmult[block_factor]; - #pragma HLS ARRAY_PARTITION variable=tmpmult complete - - MultLoop: - for (int im = 0; im < block_factor; im++) { - #pragma HLS UNROLL - int w_index = ir + rufactor * im; - int in_index = w_index % nin; - if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) - continue; // check out of bounds - tmpmult[im] = - CONFIG_T::template product::product(data[in_index], weights[w_index]); - } - - typename CONFIG_T::accum_t mult[multiplier_limit]; - #pragma HLS ARRAY_PARTITION variable=mult complete - - ResetMult: - for (int imult = 0; imult < multiplier_limit; imult++) { - #pragma HLS UNROLL - mult[imult] = 0; - } - - AccumLoop1: - for (int im = 0; im < block_factor; im++) { - #pragma HLS UNROLL - int w_index = ir + rufactor * im; - int out_index = w_index / multfactor; - if (out_index >= multiplier_limit) - continue; // check out of bounds - mult[out_index] += tmpmult[im]; - } - - AccumLoop2: - for (int im = 0; im < multiplier_limit; im++) { - #pragma HLS UNROLL - // int out_index = im/multscale; // This is the general case - // acc[out_index] += mult[im]; - acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out - } - } - -// Cast to "res_t" type -Result: - for (int ires = 0; ires < CONFIG_T::n_out; ires++) { - #pragma HLS UNROLL - res[ires] = cast(acc[ires]); - } -} - -template -void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - - #pragma HLS INLINE recursive - - if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { - dense_resource_rf_leq_nin(data, res, weights, biases); - } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { - dense_resource_rf_gt_nin_rem0(data, res, weights, biases); - } else { - dense_resource_rf_gt_nin(data, res, weights, biases); - } -} - -} 
// namespace nnet - -#endif +#ifndef NNET_DENSE_RESOURCE_H_ +#define NNET_DENSE_RESOURCE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include +#include + +namespace nnet { + +template +void dense_resource_rf_leq_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int w_index = ir; + int in_index = ir; + int out_index = 0; + int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + // Increment w_index + w_index += rufactor; + // Increment in_index + in_index += rufactor; + if (in_index >= nin) { + in_index = ir; + } + // Increment out_index + if (acc_step + 1 >= multscale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin_rem0(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS 
ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + + int w_index; + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = rufactor / nin; + + int outidx[rufactor]; +IndexLoop: + for (int ir = 0; ir < rufactor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % nin == 0) { + outstep++; + } + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + w_index = ir; + out_index = outidx[ir] /*outstep*/; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + acc[out_index] += static_cast( + CONFIG_T::template product::product(data[in_index], weights[w_index])); + + w_index += rufactor; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + break; // check out of bounds + out_index += outscale; + } + + in_index++; + if (in_index >= nin) { + in_index = 0; + // outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource_rf_gt_nin(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in, CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in * CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit / CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin) && "This function is correct only for RF > N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + +InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t)biases[iacc]; + } + +ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + typename CONFIG_T::accum_t tmpmult[block_factor]; + #pragma HLS ARRAY_PARTITION variable=tmpmult complete + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + rufactor * im; + int in_index = w_index % nin; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) + continue; // check out of bounds + tmpmult[im] = + CONFIG_T::template product::product(data[in_index], weights[w_index]); + } + + typename CONFIG_T::accum_t mult[multiplier_limit]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < multiplier_limit; imult++) { + #pragma HLS UNROLL 
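// [Editor's note] mult[] is re-zeroed on every ReuseLoop iteration: AccumLoop1
// below scatters up to block_factor partial products into multiplier_limit
// slots (out_index = w_index / multfactor), and AccumLoop2 folds those slots
// into acc[]. Resetting here means mult[] carries no state between pipelined
// iterations; only acc[] accumulates across the reuse loop.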
+ mult[imult] = 0; + } + + AccumLoop1: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + rufactor * im; + int out_index = w_index / multfactor; + if (out_index >= multiplier_limit) + continue; // check out of bounds + mult[out_index] += tmpmult[im]; + } + + AccumLoop2: + for (int im = 0; im < multiplier_limit; im++) { + #pragma HLS UNROLL + // int out_index = im/multscale; // This is the general case + // acc[out_index] += mult[im]; + acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out + } + } + +// Cast to "res_t" type +Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + res[ires] = cast(acc[ires]); + } +} + +template +void dense_resource(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + #pragma HLS INLINE recursive + + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_resource_rf_gt_nin(data, res, weights, biases); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h b/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h index fb5fe72fcf..1fcd554598 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_garnet.h @@ -1,816 +1,816 @@ -#ifndef NNET_GARNET_H_ -#define NNET_GARNET_H_ - -#include "hls_math.h" -#include "hls_stream.h" -#include "nnet_common.h" - -namespace nnet { -namespace garnet_utils { - -template -inline typename std::enable_if::value>::type -initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { - typedef ap_uint index_t; - - unsigned const table_size = (1 << CONFIG_T::distance_width); - - index_t index; - typename CONFIG_T::distance_t distance; - - // edge_weight_t is ap_ufixed with 0 iwidth -> let index 0 be a saturated version of 1 - edge_weights_table[0] = ap_ufixed(1.); - - for (unsigned iw = 1; iw < table_size; ++iw) { - index = iw; - distance.range(CONFIG_T::distance_width - 1, 0) = index.range(CONFIG_T::distance_width - 1, 0); - edge_weights_table[iw] = hls::exp(-distance * distance); - } -} - -template -inline typename std::enable_if::value>::type -initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { - unsigned const table_size = (1 << CONFIG_T::distance_width); - double const step = 64. / table_size; - - typename CONFIG_T::distance_t v = -32.; - for (unsigned iw = 0; iw < table_size; ++iw) { - edge_weights_table[iw] = std::exp(-v * v); - v += step; - } -} - -template -inline typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type -get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { - typedef ap_uint index_t; - - index_t index(distance.range(CONFIG_T::distance_width - 1, 0)); - - return edge_weights_table[index]; -} - -template -inline - typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type - get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { - unsigned const table_size = (1 << CONFIG_T::distance_width); - double const step = 64. / table_size; - - int index = (distance + 32.) 
/ step; - if (index < 0) - index = 0; - else if (index >= table_size) - index = table_size - 1; - - return edge_weights_table[index]; -} - -template typename CONFIG_T::edge_weight_t compute_edge_weight(typename CONFIG_T::distance_t distance) { - if (CONFIG_T::is_stack) { - #pragma HLS INLINE OFF - } -#ifdef __SYNTHESIS__ - typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; - // unsigned const reshape_factor = CONFIG_T::n_aggregators * CONFIG_T::n_in_features * (CONFIG_T::n_vertices / - // CONFIG_T::reuse_factor); - // #pragma HLS ARRAY_RESHAPE variable=edge_weights_table cyclic factor=reshape_factor dim=1 - bool initialized = false; -#else - static typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; - static bool initialized = false; -#endif - if (not initialized) { - initialize_edge_weights_table(edge_weights_table); - initialized = true; - } - - return get_edge_weight(distance, edge_weights_table); -} - -template -inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, - exponent_T exponent) { - #pragma HLS INLINE - return dividend >> exponent; -} - -template -inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, - exponent_T exponent) { - #pragma HLS INLINE - return dividend / std::pow(2., exponent); -} - -template struct Means { - typedef E edge_weight_t; - - edge_weight_t edge_weight_mean[CONFIG_T::n_aggregators]; - typename CONFIG_T::aggr_t weighted_feature_mean[CONFIG_T::n_aggregators * CONFIG_T::n_in_features]; - - Means() { - #pragma HLS INLINE - #pragma HLS ARRAY_PARTITION variable=edge_weight_mean complete - #pragma HLS ARRAY_PARTITION variable=weighted_feature_mean complete - #pragma HLS UNROLL region - - Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - edge_weight_mean[ia] = 0.; - - InFeatures: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - weighted_feature_mean[iax] = 0.; - } - } - } - - void set_weight(unsigned, edge_weight_t const &) { - #pragma HLS INLINE - } - - void add_means_normalized(Means const &local) { - #pragma HLS INLINE - // Always called within a pipelined region - no UNROLL needed - - unsigned const log2_unroll_factor = CONFIG_T::n_vertices_width - CONFIG_T::log2_reuse_factor; - - Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - edge_weight_mean[ia] += normalize_log2(local.edge_weight_mean[ia], log2_unroll_factor); - - InFeatures: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - weighted_feature_mean[iax] += normalize_log2(local.weighted_feature_mean[iax], log2_unroll_factor); - } - } - } - - template - typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { - #pragma HLS INLINE - #pragma HLS UNROLL region - - // accum comes divided by unroll factor - typename T::norm_t nvtx_norm = (T::n_vertices / T::reuse_factor) / nvtx; - - Aggregators: - for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { - edge_weight_mean[ia] = accum.edge_weight_mean[ia] * nvtx_norm; - - InFeatures: - for (unsigned ix = 0; ix < T::n_in_features; ++ix) { - unsigned const iax = ia * T::n_in_features + ix; - - weighted_feature_mean[iax] = accum.weighted_feature_mean[iax] * nvtx_norm; - } - } - } - - template - typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { - #pragma 
HLS INLINE - #pragma HLS UNROLL region - - Aggregators: - for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { - - edge_weight_mean[ia] = normalize_log2(accum.edge_weight_mean[ia], T::log2_reuse_factor); - - InFeatures: - for (unsigned ix = 0; ix < T::n_in_features; ++ix) { - unsigned const iax = ia * T::n_in_features + ix; - - weighted_feature_mean[iax] = normalize_log2(accum.weighted_feature_mean[iax], T::log2_reuse_factor); - } - } - } -}; - -template struct WeightsAndMeans : public Means { - typedef E edge_weight_t; - - edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; - - WeightsAndMeans() : Means() { - #pragma HLS INLINE - unsigned const reshape_factor = CONFIG_T::n_aggregators * (CONFIG_T::n_vertices / CONFIG_T::reuse_factor); - #pragma HLS ARRAY_PARTITION variable=edge_weights cyclic factor=reshape_factor - } - - void set_weight(unsigned iva, edge_weight_t const &weight) { - #pragma HLS INLINE - edge_weights[iva] = weight; - } -}; - -template struct OutputBiasNormalizer; - -template -struct OutputBiasNormalizer::type> { - typedef typename CONFIG_T::output_transform_biases_t biases_t; - - biases_t const (&output_biases)[CONFIG_T::n_out_features]; - - OutputBiasNormalizer(nvtx_T const) : output_biases{CONFIG_T::output_transform_biases} { - #pragma HLS INLINE - } -}; - -template -struct OutputBiasNormalizer::type> { - typedef typename CONFIG_T::output_transform_biases_t biases_t; - - biases_t output_biases[CONFIG_T::n_out_features]; - - OutputBiasNormalizer(nvtx_T const nvtx) { - #pragma HLS ARRAY_PARTITION variable=output_biases complete - #pragma HLS UNROLL region - - // Cannot add a loop label here due to a Vivado HLS bug, apparently - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - typename CONFIG_T::aggr_t bias = CONFIG_T::output_transform_biases[io]; - bias *= nvtx; - output_biases[io] = normalize_log2(bias, CONFIG_T::n_vertices_width); - } - } -}; - -template struct InputDataGetter { - typedef data_T data_t; - - data_T const *dataref; - - InputDataGetter(data_T const *d) : dataref{d} { - #pragma HLS INLINE - } - data_T const &get(unsigned iv, unsigned ix) const { - #pragma HLS INLINE - unsigned const ivx = iv * CONFIG_T::n_in_features + ix; - return dataref[ivx]; - } -}; - -template struct SingleVertexDataGetter { - typedef data_T data_t; - - data_T const (&dataref)[CONFIG_T::n_in_features]; - - SingleVertexDataGetter(data_T const (&d)[CONFIG_T::n_in_features]) : dataref{d} { - #pragma HLS INLINE - } - data_T const &get(unsigned, unsigned ix) const { - #pragma HLS INLINE - return dataref[ix]; - } -}; - -template struct OutputResSetter { - typedef res_T res_t; - - res_T *resref; - - OutputResSetter(res_T *r) : resref{r} { - #pragma HLS INLINE - } - void set(unsigned iv, unsigned io, res_T const &acc) { - #pragma HLS INLINE - unsigned const ivo = iv * CONFIG_T::n_out_features + io; - resref[ivo] = acc; - } -}; - -template struct SingleVertexResSetter { - typedef res_T res_t; - - res_T (&resref)[CONFIG_T::n_out_features]; - - SingleVertexResSetter(res_T (&r)[CONFIG_T::n_out_features]) : resref{r} { - #pragma HLS INLINE - } - void set(unsigned, unsigned io, res_T const &acc) { - #pragma HLS INLINE - resref[io] = acc; - } -}; - -template -inline void compute_weights_aggregates(data_getter_T const &data_getter, unsigned iv, arrays_local_T &arrays_local, - arrays_T &arrays) { - #pragma HLS INLINE - -Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - typename CONFIG_T::distance_t distance = 
CONFIG_T::aggregator_distance_biases[ia]; - - InFeatures1: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - - typename CONFIG_T::distance_t incr = data_getter.get(iv, ix) * CONFIG_T::aggregator_distance_weights[iax]; - - distance += incr; - } - - typename CONFIG_T::edge_weight_t edge_weight = - garnet_utils::compute_edge_weight(distance); - - arrays_local.edge_weight_mean[ia] += edge_weight; - - InFeatures2: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - - typename data_getter_T::data_t incr = data_getter.get(iv, ix) * edge_weight; - - arrays_local.weighted_feature_mean[iax] += incr; - } - - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - arrays.set_weight(iva, edge_weight); - } -} - -template -inline typename CONFIG_T::aggr_t compute_output_base_core(arrays_T const &arrays, unsigned io, unsigned ia) { - #pragma HLS INLINE - #pragma HLS UNROLL region - - unsigned const ioa = io * CONFIG_T::n_aggregators + ia; - typename CONFIG_T::aggr_t aggr = arrays.edge_weight_mean[ia] * CONFIG_T::input_transform_biases[ioa]; - -InFeatures: - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const ioax = ioa * CONFIG_T::n_in_features + ix; - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - - aggr += arrays.weighted_feature_mean[iax] * CONFIG_T::input_transform_weights[ioax]; - } - - return aggr; -} - -template -inline void compute_output_base(arrays_T const &arrays, - typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]) { - #pragma HLS INLINE - #pragma HLS UNROLL region - -OutFeatures: - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const ioa = io * CONFIG_T::n_aggregators + ia; - - output_base[ioa] = compute_output_base_core(arrays, io, ia); - } - } -} - -template -inline void -compute_vertex_output(arrays_T const &arrays, unsigned iv, - typename CONFIG_T::aggr_t const output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators], - res_setter_T &res_setter) { - #pragma HLS INLINE - - typename arrays_T::edge_weight_t edge_weights[CONFIG_T::n_aggregators]; - #pragma HLS ARRAY_PARTITION variable=edge_weights complete - -Aggregators1: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - - edge_weights[ia] = arrays.edge_weights[iva]; - } - -OutFeatures: - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - typename res_setter_T::res_t acc = CONFIG_T::output_transform_biases[io]; - - Aggregators2: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const ioa = io * CONFIG_T::n_aggregators + ia; - - typename res_setter_T::res_t incr = edge_weights[ia] * output_base[ioa]; - acc += incr; - } - - res_setter.set(iv, io, acc); - } -} - -template -void aggregate(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx, arrays_T &arrays) { - InputDataGetter data_getter(data); - - unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; - - Means means_accum; - -VerticesOuter: - for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { - #pragma HLS PIPELINE - - if (ivv * unroll_factor >= nvtx) - break; - - Means means_local; - - VerticesInner: - for (unsigned ir = 0; ir < unroll_factor; ++ir) { - unsigned iv = ivv * unroll_factor + 
ir; - - if (iv == nvtx) - break; - - compute_weights_aggregates(data_getter, iv, means_local, arrays); - } - - means_accum.add_means_normalized(means_local); - } - - arrays.set_means_normalized(nvtx, means_accum); -} - -template -void distribute(nvtx_T const nvtx, arrays_T const &arrays, res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { - OutputResSetter res_setter(res); - - typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]; - #pragma HLS ARRAY_PARTITION variable=output_base complete - - compute_output_base(arrays, output_base); - - unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; - -VerticesOuter: - for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { - #pragma HLS PIPELINE - - if (ivv * unroll_factor >= nvtx) - break; - - VerticesInner: - for (unsigned ir = 0; ir < unroll_factor; ++ir) { - unsigned iv = ivv * unroll_factor + ir; - - if (iv == nvtx) - break; - - compute_vertex_output(arrays, iv, output_base, res_setter); - } - } -} - -template -void set_output(output_biases_T const &output_transform_biases, arrays_T const &arrays, - res_T res[CONFIG_T::n_out_features]) { - #pragma HLS PIPELINE - -OutFeatures: - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - res_T acc = output_transform_biases.output_biases[io]; - - Aggregators: - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - typename CONFIG_T::aggr_t aggr = compute_output_base_core(arrays, io, ia); - - acc += arrays.edge_weight_mean[ia] * aggr; - } - - res[io] = acc; - } -} - -template -void distribute_aggregate(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, current_arrays_T ¤t_arrays) { - typedef typename prev_layer_t::output_t data_T; - - typename prev_layer_t::aggr_t prev_output_base[prev_layer_t::n_out_features * prev_layer_t::n_aggregators]; - #pragma HLS ARRAY_PARTITION variable=prev_output_base complete - - compute_output_base(prev_arrays, prev_output_base); - - unsigned const unroll_factor = current_layer_t::n_vertices >> current_layer_t::log2_reuse_factor; - - Means means_accum; - -VerticesOuter: - for (unsigned ivv = 0; ivv < current_layer_t::reuse_factor; ++ivv) { - #pragma HLS PIPELINE - - if (ivv * unroll_factor >= nvtx) - break; - - Means means_local; - - VerticesInner: - for (unsigned ir = 0; ir < unroll_factor; ++ir) { - unsigned iv = ivv * unroll_factor + ir; - - if (iv == nvtx) - break; - - data_T data[prev_layer_t::n_out_features]; - #pragma HLS ARRAY_PARTITION variable=data complete - - SingleVertexResSetter res_setter(data); - - compute_vertex_output(prev_arrays, iv, prev_output_base, res_setter); - - SingleVertexDataGetter data_getter(data); - - compute_weights_aggregates(data_getter, iv, means_local, current_arrays); - } - - means_accum.add_means_normalized(means_local); - } - - current_arrays.set_means_normalized(nvtx, means_accum); -} - -template -inline typename std::enable_if::value>::type -sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { - #pragma HLS INLINE - - distribute_aggregate(nvtx, prev_arrays, last_arrays); -} - -template -inline typename std::enable_if::value>::type -sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { - #pragma HLS INLINE - - WeightsAndMeans current_arrays; - - distribute_aggregate(nvtx, prev_arrays, current_arrays); - - sublayer(nvtx, current_arrays, last_arrays); -} -} // namespace garnet_utils - -struct garnet_config { - // Layer specs - static const unsigned 
n_vertices_width = 8; - static const unsigned n_vertices = (1 << n_vertices_width); - static const unsigned n_in_features = 4; - static const unsigned n_propagate = 4; - static const unsigned n_aggregators = 4; - static const unsigned n_out_features = 4; - static const unsigned distance_width = 12; - - // Internal data type definitions - typedef float input_transform_weights_t; - typedef float input_transform_biases_t; - typedef float output_transform_weights_t; - typedef float output_transform_biases_t; - typedef float aggregator_distance_weights_t; - typedef float aggregator_distance_biases_t; - - typedef float norm_t; - typedef float distance_t; - typedef float edge_weight_t; - typedef float edge_weight_aggr_t; - typedef float aggr_t; - typedef float output_t; - - /* static const input_transform_weights_t (&input_transform_weights)[n_out_features * n_aggregators * n_in_features]; */ - /* static const input_transform_biases_t (&input_transform_biases)[n_out_features * n_aggregators]; */ - /* static const aggregator_distance_weights_t (&aggregator_distance_weights)[n_aggregators * n_in_features]; */ - /* static const aggregator_distance_biases_t (&aggregator_distance_biases)[n_aggregators]; */ - /* static const output_transform_biases_t (&output_transform_biases)[n_out_features]; */ - - enum OutputCollapse { no_collapse, collapse_mean, collapse_max }; - - static const unsigned output_collapse = no_collapse; - - static const bool mean_by_nvert = false; - static const bool is_stack = false; - - // Optimization specs - static const unsigned reuse_factor = 64; - static const unsigned log2_reuse_factor = 6; -}; - -// vertices -> vertices -template -typename std::enable_if::type -garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { - #pragma HLS DATAFLOW - - garnet_utils::WeightsAndMeans arrays; - - garnet_utils::aggregate(data, nvtx[0], arrays); - - garnet_utils::distribute(nvtx[0], arrays, res); -} - -// vertices -> out features -template -typename std::enable_if::type -garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_out_features]) { - #pragma HLS DATAFLOW - - garnet_utils::Means arrays; - - garnet_utils::aggregate(data, nvtx[0], arrays); - - garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); - - garnet_utils::set_output(normalize_bias, arrays, res); -} - -// vertices -> vertices -template -typename std::enable_if::type -garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { - #pragma HLS DATAFLOW - - typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; - unsigned const ilast = CONFIG_T::n_sublayers - 1; - typedef typename CONFIG_T::template sublayer_t last_layer_t; - - garnet_utils::WeightsAndMeans arrays_first; - garnet_utils::Means arrays_last; - - garnet_utils::aggregate(data, nvtx[0], arrays_first); - - garnet_utils::sublayer(nvtx[0], arrays_first, - arrays_last); - - garnet_utils::distribute(nvtx[0], arrays_last, res); -} - -// vertices -> out features -template -typename std::enable_if::type -garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_out_features]) { - #pragma HLS DATAFLOW - - typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; - unsigned const ilast = CONFIG_T::n_sublayers - 1; - 
typedef typename CONFIG_T::template sublayer_t last_layer_t; - - garnet_utils::WeightsAndMeans arrays_first; - garnet_utils::Means arrays_last; - - garnet_utils::aggregate(data, nvtx[0], arrays_first); - - garnet_utils::sublayer(nvtx[0], arrays_first, - arrays_last); - - garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); - - garnet_utils::set_output(normalize_bias, arrays_last, res); -} - -/* Reference (dumb) implementation returning (Vertices, Features) */ -template -typename std::enable_if::type -garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { - typename CONFIG_T::edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; - typename CONFIG_T::aggr_t propagated_features[CONFIG_T::n_vertices * CONFIG_T::n_propagate]; - - for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { - if (iv == nvtx[0]) - break; - - for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { - unsigned const ivp = iv * CONFIG_T::n_propagate + ip; - - propagated_features[ivp] = CONFIG_T::input_transform_biases[ip]; - - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const ivx = iv * CONFIG_T::n_in_features + ix; - unsigned const ipx = ip * CONFIG_T::n_in_features + ix; - - propagated_features[ivp] += data[ivx] * CONFIG_T::input_transform_weights[ipx]; - } - } - - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - - typename CONFIG_T::aggr_t distance = CONFIG_T::aggregator_distance_biases[ia]; - - for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { - unsigned const ivx = iv * CONFIG_T::n_in_features + ix; - unsigned const iax = ia * CONFIG_T::n_in_features + ix; - - distance += data[ivx] * CONFIG_T::aggregator_distance_weights[iax]; - } - - edge_weights[iva] = garnet_utils::compute_edge_weight(distance); - } - } - - typename CONFIG_T::aggr_t aggregated_features[CONFIG_T::n_aggregators * CONFIG_T::n_propagate]; - - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { - unsigned const iap = ia * CONFIG_T::n_propagate + ip; - - aggregated_features[iap] = 0.; - - for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { - if (iv == nvtx[0]) - break; - - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - unsigned const ivp = iv * CONFIG_T::n_propagate + ip; - - aggregated_features[iap] += edge_weights[iva] * propagated_features[ivp]; - } - } - } - - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { - unsigned const iap = ia * CONFIG_T::n_propagate + ip; - - if (CONFIG_T::mean_by_nvert) - aggregated_features[iap] /= nvtx[0]; - else { - // Not using right shift in case aggr_t is float or double - aggregated_features[iap] /= CONFIG_T::n_vertices; - } - } - } - - for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { - if (iv == nvtx[0]) - break; - - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - unsigned const ivo = iv * CONFIG_T::n_out_features + io; - - typename CONFIG_T::aggr_t acc = CONFIG_T::output_transform_biases[io]; - - for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { - unsigned const iva = iv * CONFIG_T::n_aggregators + ia; - unsigned const ioa = io * CONFIG_T::n_aggregators + ia; - - typename CONFIG_T::aggr_t aggr = 0.; - - for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { - unsigned const iap = ia 
* CONFIG_T::n_propagate + ip; - unsigned const ioap = ioa * CONFIG_T::n_propagate + ip; - - aggr += CONFIG_T::output_transform_weights[ioap] * aggregated_features[iap]; - } - - acc += edge_weights[iva] * aggr; - } - - res[ivo] = acc; - } - } -} - -/* Reference (dumb) implementation returning (Features) - output averaged over vertices already */ -template -typename std::enable_if::type -garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], - res_T res[CONFIG_T::n_out_features]) { - typename CONFIG_T::aggr_t vertex_res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]; - - garnet_ref(data, nvtx, vertex_res); - - for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { - typename CONFIG_T::aggr_t acc = 0.; - - for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { - if (iv == nvtx[0]) - break; - - unsigned const ivo = iv * CONFIG_T::n_out_features + io; - - acc += vertex_res[ivo]; - } - - if (CONFIG_T::mean_by_nvert) - acc /= nvtx[0]; - else { - // Not using right shift in case aggr_t is float or double - acc /= CONFIG_T::n_vertices; - } - - res[io] = acc; - } -} - -} // namespace nnet - -#endif +#ifndef NNET_GARNET_H_ +#define NNET_GARNET_H_ + +#include "hls_math.h" +#include "hls_stream.h" +#include "nnet_common.h" + +namespace nnet { +namespace garnet_utils { + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ap_uint index_t; + + unsigned const table_size = (1 << CONFIG_T::distance_width); + + index_t index; + typename CONFIG_T::distance_t distance; + + // edge_weight_t is ap_ufixed with 0 iwidth -> let index 0 be a saturated version of 1 + edge_weights_table[0] = ap_ufixed(1.); + + for (unsigned iw = 1; iw < table_size; ++iw) { + index = iw; + distance.range(CONFIG_T::distance_width - 1, 0) = index.range(CONFIG_T::distance_width - 1, 0); + edge_weights_table[iw] = hls::exp(-distance * distance); + } +} + +template +inline typename std::enable_if::value>::type +initialize_edge_weights_table(typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. / table_size; + + typename CONFIG_T::distance_t v = -32.; + for (unsigned iw = 0; iw < table_size; ++iw) { + edge_weights_table[iw] = std::exp(-v * v); + v += step; + } +} + +template +inline typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type +get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + typedef ap_uint index_t; + + index_t index(distance.range(CONFIG_T::distance_width - 1, 0)); + + return edge_weights_table[index]; +} + +template +inline + typename std::enable_if::value, typename CONFIG_T::edge_weight_t>::type + get_edge_weight(typename CONFIG_T::distance_t distance, typename CONFIG_T::edge_weight_t edge_weights_table[]) { + unsigned const table_size = (1 << CONFIG_T::distance_width); + double const step = 64. / table_size; + + int index = (distance + 32.) 
/ step; + if (index < 0) + index = 0; + else if (index >= table_size) + index = table_size - 1; + + return edge_weights_table[index]; +} + +template typename CONFIG_T::edge_weight_t compute_edge_weight(typename CONFIG_T::distance_t distance) { + if (CONFIG_T::is_stack) { + #pragma HLS INLINE OFF + } +#ifdef __SYNTHESIS__ + typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + // unsigned const reshape_factor = CONFIG_T::n_aggregators * CONFIG_T::n_in_features * (CONFIG_T::n_vertices / + // CONFIG_T::reuse_factor); + // #pragma HLS ARRAY_RESHAPE variable=edge_weights_table cyclic factor=reshape_factor dim=1 + bool initialized = false; +#else + static typename CONFIG_T::edge_weight_t edge_weights_table[1 << CONFIG_T::distance_width]; + static bool initialized = false; +#endif + if (not initialized) { + initialize_edge_weights_table(edge_weights_table); + initialized = true; + } + + return get_edge_weight(distance, edge_weights_table); +} + +template +inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + #pragma HLS INLINE + return dividend >> exponent; +} + +template +inline typename std::enable_if::value, dividend_T>::type normalize_log2(dividend_T dividend, + exponent_T exponent) { + #pragma HLS INLINE + return dividend / std::pow(2., exponent); +} + +template struct Means { + typedef E edge_weight_t; + + edge_weight_t edge_weight_mean[CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t weighted_feature_mean[CONFIG_T::n_aggregators * CONFIG_T::n_in_features]; + + Means() { + #pragma HLS INLINE + #pragma HLS ARRAY_PARTITION variable=edge_weight_mean complete + #pragma HLS ARRAY_PARTITION variable=weighted_feature_mean complete + #pragma HLS UNROLL region + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] = 0.; + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + weighted_feature_mean[iax] = 0.; + } + } + } + + void set_weight(unsigned, edge_weight_t const &) { + #pragma HLS INLINE + } + + void add_means_normalized(Means const &local) { + #pragma HLS INLINE + // Always called within a pipelined region - no UNROLL needed + + unsigned const log2_unroll_factor = CONFIG_T::n_vertices_width - CONFIG_T::log2_reuse_factor; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + edge_weight_mean[ia] += normalize_log2(local.edge_weight_mean[ia], log2_unroll_factor); + + InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + weighted_feature_mean[iax] += normalize_log2(local.weighted_feature_mean[iax], log2_unroll_factor); + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + #pragma HLS INLINE + #pragma HLS UNROLL region + + // accum comes divided by unroll factor + typename T::norm_t nvtx_norm = (T::n_vertices / T::reuse_factor) / nvtx; + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + edge_weight_mean[ia] = accum.edge_weight_mean[ia] * nvtx_norm; + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = accum.weighted_feature_mean[iax] * nvtx_norm; + } + } + } + + template + typename std::enable_if::type set_means_normalized(nvtx_T const nvtx, arrays_T const &accum) { + #pragma 
HLS INLINE + #pragma HLS UNROLL region + + Aggregators: + for (unsigned ia = 0; ia < T::n_aggregators; ++ia) { + + edge_weight_mean[ia] = normalize_log2(accum.edge_weight_mean[ia], T::log2_reuse_factor); + + InFeatures: + for (unsigned ix = 0; ix < T::n_in_features; ++ix) { + unsigned const iax = ia * T::n_in_features + ix; + + weighted_feature_mean[iax] = normalize_log2(accum.weighted_feature_mean[iax], T::log2_reuse_factor); + } + } + } +}; + +template struct WeightsAndMeans : public Means { + typedef E edge_weight_t; + + edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + + WeightsAndMeans() : Means() { + #pragma HLS INLINE + unsigned const reshape_factor = CONFIG_T::n_aggregators * (CONFIG_T::n_vertices / CONFIG_T::reuse_factor); + #pragma HLS ARRAY_PARTITION variable=edge_weights cyclic factor=reshape_factor + } + + void set_weight(unsigned iva, edge_weight_t const &weight) { + #pragma HLS INLINE + edge_weights[iva] = weight; + } +}; + +template struct OutputBiasNormalizer; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t const (&output_biases)[CONFIG_T::n_out_features]; + + OutputBiasNormalizer(nvtx_T const) : output_biases{CONFIG_T::output_transform_biases} { + #pragma HLS INLINE + } +}; + +template +struct OutputBiasNormalizer::type> { + typedef typename CONFIG_T::output_transform_biases_t biases_t; + + biases_t output_biases[CONFIG_T::n_out_features]; + + OutputBiasNormalizer(nvtx_T const nvtx) { + #pragma HLS ARRAY_PARTITION variable=output_biases complete + #pragma HLS UNROLL region + + // Cannot add a loop label here due to a Vivado HLS bug, apparently + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t bias = CONFIG_T::output_transform_biases[io]; + bias *= nvtx; + output_biases[io] = normalize_log2(bias, CONFIG_T::n_vertices_width); + } + } +}; + +template struct InputDataGetter { + typedef data_T data_t; + + data_T const *dataref; + + InputDataGetter(data_T const *d) : dataref{d} { + #pragma HLS INLINE + } + data_T const &get(unsigned iv, unsigned ix) const { + #pragma HLS INLINE + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + return dataref[ivx]; + } +}; + +template struct SingleVertexDataGetter { + typedef data_T data_t; + + data_T const (&dataref)[CONFIG_T::n_in_features]; + + SingleVertexDataGetter(data_T const (&d)[CONFIG_T::n_in_features]) : dataref{d} { + #pragma HLS INLINE + } + data_T const &get(unsigned, unsigned ix) const { + #pragma HLS INLINE + return dataref[ix]; + } +}; + +template struct OutputResSetter { + typedef res_T res_t; + + res_T *resref; + + OutputResSetter(res_T *r) : resref{r} { + #pragma HLS INLINE + } + void set(unsigned iv, unsigned io, res_T const &acc) { + #pragma HLS INLINE + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + resref[ivo] = acc; + } +}; + +template struct SingleVertexResSetter { + typedef res_T res_t; + + res_T (&resref)[CONFIG_T::n_out_features]; + + SingleVertexResSetter(res_T (&r)[CONFIG_T::n_out_features]) : resref{r} { + #pragma HLS INLINE + } + void set(unsigned, unsigned io, res_T const &acc) { + #pragma HLS INLINE + resref[io] = acc; + } +}; + +template +inline void compute_weights_aggregates(data_getter_T const &data_getter, unsigned iv, arrays_local_T &arrays_local, + arrays_T &arrays) { + #pragma HLS INLINE + +Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename CONFIG_T::distance_t distance = 
CONFIG_T::aggregator_distance_biases[ia]; + + InFeatures1: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename CONFIG_T::distance_t incr = data_getter.get(iv, ix) * CONFIG_T::aggregator_distance_weights[iax]; + + distance += incr; + } + + typename CONFIG_T::edge_weight_t edge_weight = + garnet_utils::compute_edge_weight(distance); + + arrays_local.edge_weight_mean[ia] += edge_weight; + + InFeatures2: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + typename data_getter_T::data_t incr = data_getter.get(iv, ix) * edge_weight; + + arrays_local.weighted_feature_mean[iax] += incr; + } + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + arrays.set_weight(iva, edge_weight); + } +} + +template +inline typename CONFIG_T::aggr_t compute_output_base_core(arrays_T const &arrays, unsigned io, unsigned ia) { + #pragma HLS INLINE + #pragma HLS UNROLL region + + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + typename CONFIG_T::aggr_t aggr = arrays.edge_weight_mean[ia] * CONFIG_T::input_transform_biases[ioa]; + +InFeatures: + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ioax = ioa * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + aggr += arrays.weighted_feature_mean[iax] * CONFIG_T::input_transform_weights[ioax]; + } + + return aggr; +} + +template +inline void compute_output_base(arrays_T const &arrays, + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]) { + #pragma HLS INLINE + #pragma HLS UNROLL region + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + output_base[ioa] = compute_output_base_core(arrays, io, ia); + } + } +} + +template +inline void +compute_vertex_output(arrays_T const &arrays, unsigned iv, + typename CONFIG_T::aggr_t const output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators], + res_setter_T &res_setter) { + #pragma HLS INLINE + + typename arrays_T::edge_weight_t edge_weights[CONFIG_T::n_aggregators]; + #pragma HLS ARRAY_PARTITION variable=edge_weights complete + +Aggregators1: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + edge_weights[ia] = arrays.edge_weights[iva]; + } + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename res_setter_T::res_t acc = CONFIG_T::output_transform_biases[io]; + + Aggregators2: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename res_setter_T::res_t incr = edge_weights[ia] * output_base[ioa]; + acc += incr; + } + + res_setter.set(iv, io, acc); + } +} + +template +void aggregate(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx, arrays_T &arrays) { + InputDataGetter data_getter(data); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + #pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + 
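// Editorial note: with the defaults in garnet_config further down
// (n_vertices = 256, reuse_factor = 64, log2_reuse_factor = 6) this
// tiling works out as follows (illustrative figures):
//
//   unroll_factor = 256 >> 6 = 4
//   VerticesOuter runs 64 pipelined iterations; VerticesInner handles
//   4 vertices per iteration via compute_weights_aggregates.
//
// add_means_normalized then folds each tile's partial sums into the
// accumulator after a right-shift by
//   log2_unroll_factor = n_vertices_width - log2_reuse_factor = 8 - 6 = 2,
// i.e. a cheap divide by unroll_factor, keeping the running values at
// mean scale rather than full-sum scale.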
ir; + + if (iv == nvtx) + break; + + compute_weights_aggregates(data_getter, iv, means_local, arrays); + } + + means_accum.add_means_normalized(means_local); + } + + arrays.set_means_normalized(nvtx, means_accum); +} + +template +void distribute(nvtx_T const nvtx, arrays_T const &arrays, res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + OutputResSetter res_setter(res); + + typename CONFIG_T::aggr_t output_base[CONFIG_T::n_out_features * CONFIG_T::n_aggregators]; + #pragma HLS ARRAY_PARTITION variable=output_base complete + + compute_output_base(arrays, output_base); + + unsigned const unroll_factor = CONFIG_T::n_vertices >> CONFIG_T::log2_reuse_factor; + +VerticesOuter: + for (unsigned ivv = 0; ivv < CONFIG_T::reuse_factor; ++ivv) { + #pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + compute_vertex_output(arrays, iv, output_base, res_setter); + } + } +} + +template +void set_output(output_biases_T const &output_transform_biases, arrays_T const &arrays, + res_T res[CONFIG_T::n_out_features]) { + #pragma HLS PIPELINE + +OutFeatures: + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + res_T acc = output_transform_biases.output_biases[io]; + + Aggregators: + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + typename CONFIG_T::aggr_t aggr = compute_output_base_core(arrays, io, ia); + + acc += arrays.edge_weight_mean[ia] * aggr; + } + + res[io] = acc; + } +} + +template +void distribute_aggregate(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, current_arrays_T ¤t_arrays) { + typedef typename prev_layer_t::output_t data_T; + + typename prev_layer_t::aggr_t prev_output_base[prev_layer_t::n_out_features * prev_layer_t::n_aggregators]; + #pragma HLS ARRAY_PARTITION variable=prev_output_base complete + + compute_output_base(prev_arrays, prev_output_base); + + unsigned const unroll_factor = current_layer_t::n_vertices >> current_layer_t::log2_reuse_factor; + + Means means_accum; + +VerticesOuter: + for (unsigned ivv = 0; ivv < current_layer_t::reuse_factor; ++ivv) { + #pragma HLS PIPELINE + + if (ivv * unroll_factor >= nvtx) + break; + + Means means_local; + + VerticesInner: + for (unsigned ir = 0; ir < unroll_factor; ++ir) { + unsigned iv = ivv * unroll_factor + ir; + + if (iv == nvtx) + break; + + data_T data[prev_layer_t::n_out_features]; + #pragma HLS ARRAY_PARTITION variable=data complete + + SingleVertexResSetter res_setter(data); + + compute_vertex_output(prev_arrays, iv, prev_output_base, res_setter); + + SingleVertexDataGetter data_getter(data); + + compute_weights_aggregates(data_getter, iv, means_local, current_arrays); + } + + means_accum.add_means_normalized(means_local); + } + + current_arrays.set_means_normalized(nvtx, means_accum); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + #pragma HLS INLINE + + distribute_aggregate(nvtx, prev_arrays, last_arrays); +} + +template +inline typename std::enable_if::value>::type +sublayer(nvtx_T const nvtx, prev_arrays_T const &prev_arrays, last_arrays_T &last_arrays) { + #pragma HLS INLINE + + WeightsAndMeans current_arrays; + + distribute_aggregate(nvtx, prev_arrays, current_arrays); + + sublayer(nvtx, current_arrays, last_arrays); +} +} // namespace garnet_utils + +struct garnet_config { + // Layer specs + static const unsigned 
n_vertices_width = 8; + static const unsigned n_vertices = (1 << n_vertices_width); + static const unsigned n_in_features = 4; + static const unsigned n_propagate = 4; + static const unsigned n_aggregators = 4; + static const unsigned n_out_features = 4; + static const unsigned distance_width = 12; + + // Internal data type definitions + typedef float input_transform_weights_t; + typedef float input_transform_biases_t; + typedef float output_transform_weights_t; + typedef float output_transform_biases_t; + typedef float aggregator_distance_weights_t; + typedef float aggregator_distance_biases_t; + + typedef float norm_t; + typedef float distance_t; + typedef float edge_weight_t; + typedef float edge_weight_aggr_t; + typedef float aggr_t; + typedef float output_t; + + /* static const input_transform_weights_t (&input_transform_weights)[n_out_features * n_aggregators * n_in_features]; */ + /* static const input_transform_biases_t (&input_transform_biases)[n_out_features * n_aggregators]; */ + /* static const aggregator_distance_weights_t (&aggregator_distance_weights)[n_aggregators * n_in_features]; */ + /* static const aggregator_distance_biases_t (&aggregator_distance_biases)[n_aggregators]; */ + /* static const output_transform_biases_t (&output_transform_biases)[n_out_features]; */ + + enum OutputCollapse { no_collapse, collapse_mean, collapse_max }; + + static const unsigned output_collapse = no_collapse; + + static const bool mean_by_nvert = false; + static const bool is_stack = false; + + // Optimization specs + static const unsigned reuse_factor = 64; + static const unsigned log2_reuse_factor = 6; +}; + +// vertices -> vertices +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + garnet_utils::WeightsAndMeans arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::distribute(nvtx[0], arrays, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + garnet_utils::Means arrays; + + garnet_utils::aggregate(data, nvtx[0], arrays); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays, res); +} + +// vertices -> vertices +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + typedef typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::distribute(nvtx[0], arrays_last, res); +} + +// vertices -> out features +template +typename std::enable_if::type +garnet_stack(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + #pragma HLS DATAFLOW + + typedef typename CONFIG_T::template sublayer_t<0> first_layer_t; + unsigned const ilast = CONFIG_T::n_sublayers - 1; + 
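// Editorial note: a minimal sketch (all names hypothetical, nothing below
// is defined by this patch) of the config shape the garnet_stack overloads
// expect. Each sublayer spec sets is_stack, and the wrapper exposes the
// chain through the sublayer_t<il> alias template that the recursive
// garnet_utils::sublayer calls walk from index 0 to n_sublayers - 1:
//
//   struct sub0 : nnet::garnet_config {
//       static const bool is_stack = true;
//       // per-sublayer dimensions, types and weight arrays here
//   };
//   struct sub1 : sub0 { /* ... */ };
//   struct my_stack_config : nnet::garnet_config {
//       static const unsigned n_sublayers = 2;
//       template <int il>
//       using sublayer_t = typename std::conditional<il == 0, sub0, sub1>::type;
//   };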
typedef typename CONFIG_T::template sublayer_t last_layer_t; + + garnet_utils::WeightsAndMeans arrays_first; + garnet_utils::Means arrays_last; + + garnet_utils::aggregate(data, nvtx[0], arrays_first); + + garnet_utils::sublayer(nvtx[0], arrays_first, + arrays_last); + + garnet_utils::OutputBiasNormalizer normalize_bias(nvtx[0]); + + garnet_utils::set_output(normalize_bias, arrays_last, res); +} + +/* Reference (dumb) implementation returning (Vertices, Features) */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]) { + typename CONFIG_T::edge_weight_t edge_weights[CONFIG_T::n_vertices * CONFIG_T::n_aggregators]; + typename CONFIG_T::aggr_t propagated_features[CONFIG_T::n_vertices * CONFIG_T::n_propagate]; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + propagated_features[ivp] = CONFIG_T::input_transform_biases[ip]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const ipx = ip * CONFIG_T::n_in_features + ix; + + propagated_features[ivp] += data[ivx] * CONFIG_T::input_transform_weights[ipx]; + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t distance = CONFIG_T::aggregator_distance_biases[ia]; + + for (unsigned ix = 0; ix < CONFIG_T::n_in_features; ++ix) { + unsigned const ivx = iv * CONFIG_T::n_in_features + ix; + unsigned const iax = ia * CONFIG_T::n_in_features + ix; + + distance += data[ivx] * CONFIG_T::aggregator_distance_weights[iax]; + } + + edge_weights[iva] = garnet_utils::compute_edge_weight(distance); + } + } + + typename CONFIG_T::aggr_t aggregated_features[CONFIG_T::n_aggregators * CONFIG_T::n_propagate]; + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ivp = iv * CONFIG_T::n_propagate + ip; + + aggregated_features[iap] += edge_weights[iva] * propagated_features[ivp]; + } + } + } + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia * CONFIG_T::n_propagate + ip; + + if (CONFIG_T::mean_by_nvert) + aggregated_features[iap] /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + aggregated_features[iap] /= CONFIG_T::n_vertices; + } + } + } + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + typename CONFIG_T::aggr_t acc = CONFIG_T::output_transform_biases[io]; + + for (unsigned ia = 0; ia < CONFIG_T::n_aggregators; ++ia) { + unsigned const iva = iv * CONFIG_T::n_aggregators + ia; + unsigned const ioa = io * CONFIG_T::n_aggregators + ia; + + typename CONFIG_T::aggr_t aggr = 0.; + + for (unsigned ip = 0; ip < CONFIG_T::n_propagate; ++ip) { + unsigned const iap = ia 
* CONFIG_T::n_propagate + ip; + unsigned const ioap = ioa * CONFIG_T::n_propagate + ip; + + aggr += CONFIG_T::output_transform_weights[ioap] * aggregated_features[iap]; + } + + acc += edge_weights[iva] * aggr; + } + + res[ivo] = acc; + } + } +} + +/* Reference (dumb) implementation returning (Features) - output averaged over vertices already */ +template +typename std::enable_if::type +garnet_ref(data_T const data[CONFIG_T::n_vertices * CONFIG_T::n_in_features], nvtx_T const nvtx[1], + res_T res[CONFIG_T::n_out_features]) { + typename CONFIG_T::aggr_t vertex_res[CONFIG_T::n_vertices * CONFIG_T::n_out_features]; + + garnet_ref(data, nvtx, vertex_res); + + for (unsigned io = 0; io < CONFIG_T::n_out_features; ++io) { + typename CONFIG_T::aggr_t acc = 0.; + + for (unsigned iv = 0; iv < CONFIG_T::n_vertices; ++iv) { + if (iv == nvtx[0]) + break; + + unsigned const ivo = iv * CONFIG_T::n_out_features + io; + + acc += vertex_res[ivo]; + } + + if (CONFIG_T::mean_by_nvert) + acc /= nvtx[0]; + else { + // Not using right shift in case aggr_t is float or double + acc /= CONFIG_T::n_vertices; + } + + res[io] = acc; + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h index 1a3a3d28b5..b8c2a48d19 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_helpers.h @@ -1,382 +1,382 @@ -#ifndef NNET_HELPERS_H -#define NNET_HELPERS_H - -#include "hls_stream.h" -#include -#include -#include -#include -#include -#include -#include -#include - -namespace nnet { - -#ifndef __SYNTHESIS__ - -#ifndef WEIGHTS_DIR -#define WEIGHTS_DIR "weights" -#endif - -template void load_weights_from_txt(T *w, const char *fname) { - - std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); - std::ifstream infile(full_path.c_str(), std::ios::binary); - - if (infile.fail()) { - std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; - exit(1); - } - - std::string line; - if (std::getline(infile, line)) { - std::istringstream iss(line); - std::string token; - - size_t i = 0; - while (std::getline(iss, token, ',')) { - std::istringstream(token) >> w[i]; - i++; - } - - if (SIZE != i) { - std::cerr << "ERROR: Expected " << SIZE << " values"; - std::cerr << " but read only " << i << " values" << std::endl; - } - } -} - -template void load_compressed_weights_from_txt(T *w, const char *fname) { - - std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); - std::ifstream infile(full_path.c_str(), std::ios::binary); - - if (infile.fail()) { - std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; - exit(1); - } - - std::string line; - if (std::getline(infile, line)) { - std::istringstream iss(line); - std::string token; - std::string extra_chars = "} "; - - size_t i = 0; - while (std::getline(iss, token, '{')) { - if (token.length() == 0) { - continue; - } - for (char c : extra_chars) { - token.erase(std::remove(token.begin(), token.end(), c), token.end()); - } - if (token.back() == ',') { - token.erase(token.end() - 1); - } - - std::replace(token.begin(), token.end(), ',', ' '); - std::istringstream structss(token); - - if (!(structss >> w[i].row_index >> w[i].col_index >> w[i].weight)) { - std::cerr << "ERROR: Unable to parse file " << std::string(fname); - exit(1); - } - i++; - } - - if (SIZE != i) { - std::cerr << "ERROR: Expected " << SIZE << " values"; - std::cerr << " but read only " 
<< i << " values" << std::endl; - } - } -} - -template void load_exponent_weights_from_txt(T *w, const char *fname) { - - std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); - std::ifstream infile(full_path.c_str(), std::ios::binary); - - if (infile.fail()) { - std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; - exit(1); - } - - std::string line; - if (std::getline(infile, line)) { - std::istringstream iss(line); - std::string token; - std::string extra_chars = "} "; - - size_t i = 0; - while (std::getline(iss, token, '{')) { - if (token.length() == 0) { - continue; - } - for (char c : extra_chars) { - token.erase(std::remove(token.begin(), token.end(), c), token.end()); - } - if (token.back() == ',') { - token.erase(token.end() - 1); - } - - std::replace(token.begin(), token.end(), ',', ' '); - std::istringstream structss(token); - - if (!(structss >> w[i].sign >> w[i].weight)) { - std::cerr << "ERROR: Unable to parse file " << std::string(fname); - exit(1); - } - i++; - } - - if (SIZE != i) { - std::cerr << "ERROR: Expected " << SIZE << " values"; - std::cerr << " but read only " << i << " values" << std::endl; - } - } -} -template void convert_data(srcType *src, dstType *dst) { - for (size_t i = 0; i < SIZE; i++) { - dst[i] = dstType(src[i]); - } -} - -template void convert_data(srcType *src, hls::stream &dst) { - for (size_t i = 0; i < SIZE / dstType::size; i++) { - dstType ctype; - for (size_t j = 0; j < dstType::size; j++) { - ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); - } - dst.write(ctype); - } -} - -template void convert_data(hls::stream &src, dstType *dst) { - for (size_t i = 0; i < SIZE / srcType::size; i++) { - srcType ctype = src.read(); - for (size_t j = 0; j < srcType::size; j++) { - dst[i * srcType::size + j] = dstType(ctype[j]); - } - } -} - -extern bool trace_enabled; -extern std::map *trace_outputs; -extern size_t trace_type_size; - -template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { - for (int i = 0; i < layer_size; i++) { - ptr[i] = save_T(data[i]); - } -} - -template void save_output_array(hls::stream &data, save_T *ptr, size_t layer_size) { - for (size_t i = 0; i < layer_size / data_T::size; i++) { - data_T ctype = data.read(); - for (size_t j = 0; j < data_T::size; j++) { - ptr[i * data_T::size + j] = save_T(ctype[j]); - } - data.write(ctype); - } -} - -// We don't want to include save_T in this function because it will be inserted into myproject.cpp -// so a workaround with element size is used -template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { - if (!trace_enabled) - return; - - if (trace_outputs) { - if (trace_outputs->count(layer_name) > 0) { - if (trace_type_size == 4) { - save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); - } else if (trace_type_size == 8) { - save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); - } else { - std::cout << "Unknown trace type!" << std::endl; - } - } else { - std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
<< std::endl; - } - } else { - std::ostringstream filename; - filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data - std::fstream out; - out.open(filename.str(), std::ios::app); - assert(out.is_open()); - for (int i = 0; i < layer_size; i++) { - out << float(data[i]) << " "; // We don't care about precision in text files - } - out << std::endl; - out.close(); - } -} - -template void save_layer_output(hls::stream &data, const char *layer_name, size_t layer_size) { - if (!trace_enabled) - return; - - if (trace_outputs) { - if (trace_outputs->count(layer_name) > 0) { - if (trace_type_size == 4) { - save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); - } else if (trace_type_size == 8) { - save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); - } else { - std::cout << "Unknown trace type!" << std::endl; - } - } else { - std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; - } - } else { - std::ostringstream filename; - filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data - std::fstream out; - out.open(filename.str(), std::ios::app); - assert(out.is_open()); - for (size_t i = 0; i < layer_size / data_T::size; i++) { - data_T ctype = data.read(); - for (size_t j = 0; j < data_T::size; j++) { - out << float(ctype[j]) << " "; // We don't care about precision in text files - } - data.write(ctype); - } - out << std::endl; - out.close(); - } -} - -#endif - -template void copy_data(std::vector src, dst_T dst[SIZE]) { - typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; - typename std::vector::const_iterator in_end = in_begin + SIZE; - std::copy(in_begin, in_end, dst); -} - -template -void copy_data(std::vector src, hls::stream &dst) { - typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; - typename std::vector::const_iterator in_end = in_begin + SIZE; - - size_t i_pack = 0; - dst_T dst_pack; - for (typename std::vector::const_iterator i = in_begin; i != in_end; ++i) { - dst_pack[i_pack++] = typename dst_T::value_type(*i); - if (i_pack == dst_T::size) { - i_pack = 0; - dst.write(dst_pack); - } - } -} - -template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { - for (auto i = 0; i < SIZE; i++) - if (i == SIZE - 1) { - dst[i].data = src[i]; - dst[i].last = 1; - } else { - dst[i].data = src[i]; - dst[i].last = 0; - } -} - -template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { - for (int i = 0; i < SIZE; i++) { - out << result[i] << " "; - } - out << std::endl; -} - -template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { - for (int i = 0; i < SIZE / res_T::size; i++) { - res_T res_pack = result.read(); - for (int j = 0; j < res_T::size; j++) { - out << res_pack[j] << " "; - } - if (keep) - result.write(res_pack); - } - out << std::endl; -} - -template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } - -template void fill_zero(hls::stream &data) { - for (int i = 0; i < SIZE / data_T::size; i++) { - data_T data_pack; - for (int j = 0; j < data_T::size; j++) { - data_pack[j] = 0.; - } - data.write(data_pack); - } -} - -template int read_file_1D(const char *filename, dataType data[nrows]) { - FILE *fp; - fp = fopen(filename, "r"); - if (fp == 0) { - return -1; - } - // Read data from file - float newval; - for (int ii = 0; ii < nrows; ii++) { - if (fscanf(fp, "%f\n", &newval) != 
0) { - data[ii] = newval; - } else { - return -2; - } - } - fclose(fp); - return 0; -} - -template -int read_file_2D(const char *filename, dataType data[nrows][ncols]) { - FILE *fp; - fp = fopen(filename, "r"); - if (fp == 0) { - return -1; - } - // Read data from file - float newval; - for (int ii = 0; ii < nrows; ii++) { - for (int jj = 0; jj < ncols; jj++) { - if (fscanf(fp, "%f\n", &newval) != 0) { - data[ii][jj] = newval; - } else { - return -2; - } - } - } - fclose(fp); - return 0; -} - -template void change_type(hls::stream &in, hls::stream &out) { - in_T datareg; - hls::stream input_trunc; - for (int ii = 0; ii < N_IN; ii++) { - out << (out_T)in.read(); - } -} - -template void hls_stream_debug(hls::stream &data, hls::stream &res) { - data_T datareg; - for (int ii = 0; ii < N_IN; ii++) { - datareg = data.read(); - std::cout << "[" << ii << "]: " << datareg << std::endl; - res << datareg; - } -} - -constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } - -constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } - -constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } - -} // namespace nnet - -#endif +#ifndef NNET_HELPERS_H +#define NNET_HELPERS_H + +#include "hls_stream.h" +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nnet { + +#ifndef __SYNTHESIS__ + +#ifndef WEIGHTS_DIR +#define WEIGHTS_DIR "weights" +#endif + +template void load_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + + size_t i = 0; + while (std::getline(iss, token, ',')) { + std::istringstream(token) >> w[i]; + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_compressed_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); + + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].row_index >> w[i].col_index >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} + +template void load_exponent_weights_from_txt(T *w, const char *fname) { + + std::string full_path = std::string(WEIGHTS_DIR) + "/" + std::string(fname); + std::ifstream infile(full_path.c_str(), std::ios::binary); 
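// Editorial note: the on-disk formats expected by the three loaders in
// this file, inferred from their parsing logic (sample values are made
// up). Each file holds a single line and must yield exactly SIZE values:
//
//   load_weights_from_txt:            comma-separated scalars
//       0.125, -0.5, 0.75
//   load_compressed_weights_from_txt: {row_index, col_index, weight}
//       {0, 1, 0.125}, {2, 0, -0.5}
//   load_exponent_weights_from_txt:   {sign, weight}
//       {1, 3}, {-1, 2}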
+ + if (infile.fail()) { + std::cerr << "ERROR: file " << std::string(fname) << " does not exist" << std::endl; + exit(1); + } + + std::string line; + if (std::getline(infile, line)) { + std::istringstream iss(line); + std::string token; + std::string extra_chars = "} "; + + size_t i = 0; + while (std::getline(iss, token, '{')) { + if (token.length() == 0) { + continue; + } + for (char c : extra_chars) { + token.erase(std::remove(token.begin(), token.end(), c), token.end()); + } + if (token.back() == ',') { + token.erase(token.end() - 1); + } + + std::replace(token.begin(), token.end(), ',', ' '); + std::istringstream structss(token); + + if (!(structss >> w[i].sign >> w[i].weight)) { + std::cerr << "ERROR: Unable to parse file " << std::string(fname); + exit(1); + } + i++; + } + + if (SIZE != i) { + std::cerr << "ERROR: Expected " << SIZE << " values"; + std::cerr << " but read only " << i << " values" << std::endl; + } + } +} +template void convert_data(srcType *src, dstType *dst) { + for (size_t i = 0; i < SIZE; i++) { + dst[i] = dstType(src[i]); + } +} + +template void convert_data(srcType *src, hls::stream &dst) { + for (size_t i = 0; i < SIZE / dstType::size; i++) { + dstType ctype; + for (size_t j = 0; j < dstType::size; j++) { + ctype[j] = typename dstType::value_type(src[i * dstType::size + j]); + } + dst.write(ctype); + } +} + +template void convert_data(hls::stream &src, dstType *dst) { + for (size_t i = 0; i < SIZE / srcType::size; i++) { + srcType ctype = src.read(); + for (size_t j = 0; j < srcType::size; j++) { + dst[i * srcType::size + j] = dstType(ctype[j]); + } + } +} + +extern bool trace_enabled; +extern std::map *trace_outputs; +extern size_t trace_type_size; + +template void save_output_array(data_T *data, save_T *ptr, size_t layer_size) { + for (int i = 0; i < layer_size; i++) { + ptr[i] = save_T(data[i]); + } +} + +template void save_output_array(hls::stream &data, save_T *ptr, size_t layer_size) { + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + ptr[i * data_T::size + j] = save_T(ctype[j]); + } + data.write(ctype); + } +} + +// We don't want to include save_T in this function because it will be inserted into myproject.cpp +// so a workaround with element size is used +template void save_layer_output(data_T *data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" 
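// Editorial note: trace_enabled, trace_outputs and trace_type_size are
// only declared extern in this header; they are defined and driven by
// generated wrapper code elsewhere. A rough sketch of the handshake
// (buffer name and layer key are assumptions for illustration):
//
//   static float buf[N_LAYER_OUT];
//   nnet::trace_enabled = true;
//   nnet::trace_type_size = sizeof(float);           // 4 -> float, 8 -> double
//   (*nnet::trace_outputs)["dense_1"] = (void *)buf;
//   // run inference; each traced layer calls save_layer_output(...)
//
// With no registered storage, the else branch below instead appends the
// output to ./tb_data/<layer_name>_output.log.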
<< std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (int i = 0; i < layer_size; i++) { + out << float(data[i]) << " "; // We don't care about precision in text files + } + out << std::endl; + out.close(); + } +} + +template void save_layer_output(hls::stream &data, const char *layer_name, size_t layer_size) { + if (!trace_enabled) + return; + + if (trace_outputs) { + if (trace_outputs->count(layer_name) > 0) { + if (trace_type_size == 4) { + save_output_array(data, (float *)(*trace_outputs)[layer_name], layer_size); + } else if (trace_type_size == 8) { + save_output_array(data, (double *)(*trace_outputs)[layer_name], layer_size); + } else { + std::cout << "Unknown trace type!" << std::endl; + } + } else { + std::cout << "Layer name: " << layer_name << " not found in debug storage!" << std::endl; + } + } else { + std::ostringstream filename; + filename << "./tb_data/" << layer_name << "_output.log"; // TODO if run as a shared lib, path should be ../tb_data + std::fstream out; + out.open(filename.str(), std::ios::app); + assert(out.is_open()); + for (size_t i = 0; i < layer_size / data_T::size; i++) { + data_T ctype = data.read(); + for (size_t j = 0; j < data_T::size; j++) { + out << float(ctype[j]) << " "; // We don't care about precision in text files + } + data.write(ctype); + } + out << std::endl; + out.close(); + } +} + +#endif + +template void copy_data(std::vector src, dst_T dst[SIZE]) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + std::copy(in_begin, in_end, dst); +} + +template +void copy_data(std::vector src, hls::stream &dst) { + typename std::vector::const_iterator in_begin = src.cbegin() + OFFSET; + typename std::vector::const_iterator in_end = in_begin + SIZE; + + size_t i_pack = 0; + dst_T dst_pack; + for (typename std::vector::const_iterator i = in_begin; i != in_end; ++i) { + dst_pack[i_pack++] = typename dst_T::value_type(*i); + if (i_pack == dst_T::size) { + i_pack = 0; + dst.write(dst_pack); + } + } +} + +template void copy_data_axi(std::vector src, dst_T dst[SIZE]) { + for (auto i = 0; i < SIZE; i++) + if (i == SIZE - 1) { + dst[i].data = src[i]; + dst[i].last = 1; + } else { + dst[i].data = src[i]; + dst[i].last = 0; + } +} + +template void print_result(res_T result[SIZE], std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE; i++) { + out << result[i] << " "; + } + out << std::endl; +} + +template void print_result(hls::stream &result, std::ostream &out, bool keep = false) { + for (int i = 0; i < SIZE / res_T::size; i++) { + res_T res_pack = result.read(); + for (int j = 0; j < res_T::size; j++) { + out << res_pack[j] << " "; + } + if (keep) + result.write(res_pack); + } + out << std::endl; +} + +template void fill_zero(data_T data[SIZE]) { std::fill_n(data, SIZE, 0.); } + +template void fill_zero(hls::stream &data) { + for (int i = 0; i < SIZE / data_T::size; i++) { + data_T data_pack; + for (int j = 0; j < data_T::size; j++) { + data_pack[j] = 0.; + } + data.write(data_pack); + } +} + +template int read_file_1D(const char *filename, dataType data[nrows]) { + FILE *fp; + fp = fopen(filename, "r"); + if (fp == 0) { + return -1; + } + // Read data from file + float newval; + for (int ii = 0; ii < nrows; ii++) { + if (fscanf(fp, "%f\n", &newval) != 
0) { + data[ii] = newval; + } else { + return -2; + } + } + fclose(fp); + return 0; +} + +template +int read_file_2D(const char *filename, dataType data[nrows][ncols]) { + FILE *fp; + fp = fopen(filename, "r"); + if (fp == 0) { + return -1; + } + // Read data from file + float newval; + for (int ii = 0; ii < nrows; ii++) { + for (int jj = 0; jj < ncols; jj++) { + if (fscanf(fp, "%f\n", &newval) != 0) { + data[ii][jj] = newval; + } else { + return -2; + } + } + } + fclose(fp); + return 0; +} + +template void change_type(hls::stream &in, hls::stream &out) { + in_T datareg; + hls::stream input_trunc; + for (int ii = 0; ii < N_IN; ii++) { + out << (out_T)in.read(); + } +} + +template void hls_stream_debug(hls::stream &data, hls::stream &res) { + data_T datareg; + for (int ii = 0; ii < N_IN; ii++) { + datareg = data.read(); + std::cout << "[" << ii << "]: " << datareg << std::endl; + res << datareg; + } +} + +constexpr int ceillog2(int x) { return (x <= 2) ? 1 : 1 + ceillog2((x + 1) / 2); } + +constexpr int floorlog2(int x) { return (x < 2) ? 0 : 1 + floorlog2(x / 2); } + +constexpr int pow2(int x) { return x == 0 ? 1 : 2 * pow2(x - 1); } + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h index e0c5cb4e27..8005682978 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge.h @@ -1,256 +1,256 @@ -#ifndef NNET_MERGE_H_ -#define NNET_MERGE_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -struct merge_config { - static const unsigned n_elem = 10; -}; - -struct dot_config { - static const unsigned n_in = 10; - static const unsigned n_out = 1; - static const unsigned reuse_factor = 1; - typedef float accum_t; - // Product function to use - template using product = nnet::product::mult; -}; - -struct concat_config { - static const unsigned n_elem1_0 = 10; - static const unsigned n_elem1_1 = 10; - static const unsigned n_elem1_2 = 10; - static const unsigned n_elem2_0 = 10; - static const unsigned n_elem2_1 = 10; - static const unsigned n_elem2_2 = 10; - - static const unsigned axis = -1; -}; - -template -void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = data1[ii] + data2[ii]; - } -} - -template -void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = data1[ii] - data2[ii]; - } -} - -template -void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = data1[ii] * data2[ii]; - } -} - -template -void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = (data1[ii] + data2[ii]) / (res_T)2; - } -} - -template -void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = (data1[ii] > data2[ii]) ? 
data1[ii] : data2[ii]; - } -} - -template -void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { - res[ii] = (data1[ii] < data2[ii]) ? data1[ii] : data2[ii]; - } -} - -template -void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit - - typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; - #pragma HLS ARRAY_PARTITION variable=mult complete - typename CONFIG_T::accum_t acc = 0; - -Product: - for (int i_mult = 0; i_mult < CONFIG_T::n_in; i_mult++) { - #pragma HLS UNROLL - mult[i_mult] = CONFIG_T::template product::product(data1[i_mult], data2[i_mult]); - } - -Accum: - for (int i_acc = 0; i_acc < CONFIG_T::n_in; i_acc++) { - #pragma HLS UNROLL - acc += mult[i_acc]; - } - - res[0] = cast(acc); -} - -template -void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], - res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { - res[ii] = data1[ii]; - } - for (int ii = 0; ii < CONFIG_T::n_elem2_0; ii++) { - res[CONFIG_T::n_elem1_0 + ii] = data2[ii]; - } -} - -template -void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; ii++) { - res[ii] = data1[ii]; - } - for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; ii++) { - res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + ii] = data2[ii]; - } -} - -template -void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { - for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { - res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + jj] = data1[ii * CONFIG_T::n_elem1_1 + jj]; - } - for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { - res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + jj] = - data2[ii * CONFIG_T::n_elem2_1 + jj]; - } - } -} - -template -void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { - #pragma HLS INLINE - - if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { - concatenate2d_1(data1, data2, res); - } else { - concatenate2d_0(data1, data2, res); - } -} - -template -void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; ii++) { - res[ii] = data1[ii]; - } - for (int ii = 0; ii < 
CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; ii++) { - res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + ii] = data2[ii]; - } -} - -template -void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { - for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { - for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { - int res_idx = - ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; - int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; - res[res_idx] = data1[data_idx]; - } - } - for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { - for (int kk = 0; kk < CONFIG_T::n_elem2_2; kk++) { - int res_idx = ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + - (jj + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + kk; - int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; - res[res_idx] = data2[data_idx]; - } - } - } -} - -template -void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - #pragma HLS PIPELINE - - for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { - for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { - for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { - int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + - jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk; - int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; - res[res_idx] = data1[data_idx]; - } - for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { - int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + - jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk + CONFIG_T::n_elem1_2; - int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; - res[res_idx] = data2[data_idx]; - } - } - } -} - -template -void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], - input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], - res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + - CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { - #pragma HLS INLINE - - if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { - concatenate3d_2(data1, data2, res); - } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { - concatenate3d_1(data1, data2, res); - } else { - concatenate3d_0(data1, data2, res); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_MERGE_H_ +#define NNET_MERGE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct merge_config { + static const unsigned n_elem = 10; +}; + +struct dot_config { + static const unsigned n_in = 10; + static const unsigned n_out = 1; + static const unsigned reuse_factor = 1; + typedef float 
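// Editorial note: a hedged usage sketch for dot1d below; the struct and
// type names are illustrative, not from this patch. hls4ml generates one
// such config per Dot layer, and multiplier_limit (used by the ALLOCATION
// pragma in dot1d) must be supplied by the concrete config:
//
//   struct dot_cfg : nnet::dot_config {
//       static const unsigned n_in = 16;
//       static const unsigned multiplier_limit = 16;
//       typedef ap_fixed<24, 8> accum_t;
//   };
//   input_t a[16], b[16];
//   result_t out[1];
//   nnet::dot1d<input_t, input_t, result_t, dot_cfg>(a, b, out);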
accum_t; + // Product function to use + template using product = nnet::product::mult; +}; + +struct concat_config { + static const unsigned n_elem1_0 = 10; + static const unsigned n_elem1_1 = 10; + static const unsigned n_elem1_2 = 10; + static const unsigned n_elem2_0 = 10; + static const unsigned n_elem2_1 = 10; + static const unsigned n_elem2_2 = 10; + + static const unsigned axis = -1; +}; + +template +void add(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] + data2[ii]; + } +} + +template +void subtract(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] - data2[ii]; + } +} + +template +void multiply(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = data1[ii] * data2[ii]; + } +} + +template +void average(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] + data2[ii]) / (res_T)2; + } +} + +template +void maximum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] > data2[ii]) ? data1[ii] : data2[ii]; + } +} + +template +void minimum(input1_T data1[CONFIG_T::n_elem], input2_T data2[CONFIG_T::n_elem], res_T res[CONFIG_T::n_elem]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem; ii++) { + res[ii] = (data1[ii] < data2[ii]) ? 
data1[ii] : data2[ii]; + } +} + +template +void dot1d(input1_T data1[CONFIG_T::n_in], input2_T data2[CONFIG_T::n_in], res_T res[CONFIG_T::n_out]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + #pragma HLS ALLOCATION operation instances=mul limit=CONFIG_T::multiplier_limit + + typename CONFIG_T::accum_t mult[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=mult complete + typename CONFIG_T::accum_t acc = 0; + +Product: + for (int i_mult = 0; i_mult < CONFIG_T::n_in; i_mult++) { + #pragma HLS UNROLL + mult[i_mult] = CONFIG_T::template product::product(data1[i_mult], data2[i_mult]); + } + +Accum: + for (int i_acc = 0; i_acc < CONFIG_T::n_in; i_acc++) { + #pragma HLS UNROLL + acc += mult[i_acc]; + } + + res[0] = cast(acc); +} + +template +void concatenate1d(input1_T data1[CONFIG_T::n_elem1_0], input2_T data2[CONFIG_T::n_elem2_0], + res_T res[CONFIG_T::n_elem1_0 + CONFIG_T::n_elem2_0]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0; ii++) { + res[CONFIG_T::n_elem1_0 + ii] = data2[ii]; + } +} + +template +void concatenate2d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + ii] = data2[ii]; + } +} + +template +void concatenate2d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + jj] = data1[ii * CONFIG_T::n_elem1_1 + jj]; + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + res[ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) + CONFIG_T::n_elem1_1 + jj] = + data2[ii * CONFIG_T::n_elem2_1 + jj]; + } + } +} + +template +void concatenate2d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1]) { + #pragma HLS INLINE + + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate3d_0(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2; ii++) { + res[ii] = data1[ii]; + } + for (int ii = 0; ii < CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2; ii++) { + res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + ii] = data2[ii]; + } +} + +template +void concatenate3d_1(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * 
CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = + ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + } + for (int jj = 0; jj < CONFIG_T::n_elem2_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem2_2; kk++) { + int res_idx = ii * (CONFIG_T::n_elem1_1 + CONFIG_T::n_elem2_1) * CONFIG_T::n_elem1_2 + + (jj + CONFIG_T::n_elem1_1) * CONFIG_T::n_elem1_2 + kk; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d_2(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS PIPELINE + + for (int ii = 0; ii < CONFIG_T::n_elem1_0; ii++) { + for (int jj = 0; jj < CONFIG_T::n_elem1_1; jj++) { + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk; + int data_idx = ii * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + jj * CONFIG_T::n_elem1_2 + kk; + res[res_idx] = data1[data_idx]; + } + for (int kk = 0; kk < CONFIG_T::n_elem1_2; kk++) { + int res_idx = ii * CONFIG_T::n_elem1_1 * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + + jj * (CONFIG_T::n_elem1_2 + CONFIG_T::n_elem2_2) + kk + CONFIG_T::n_elem1_2; + int data_idx = ii * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2 + jj * CONFIG_T::n_elem2_2 + kk; + res[res_idx] = data2[data_idx]; + } + } + } +} + +template +void concatenate3d(input1_T data1[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2], + input2_T data2[CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2], + res_T res[CONFIG_T::n_elem1_0 * CONFIG_T::n_elem1_1 * CONFIG_T::n_elem1_2 + + CONFIG_T::n_elem2_0 * CONFIG_T::n_elem2_1 * CONFIG_T::n_elem2_2]) { + #pragma HLS INLINE + + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h index 17cf4fe99c..a57ec78e35 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_merge_stream.h @@ -1,370 +1,370 @@ -#ifndef NNET_MERGE_STREAM_H_ -#define NNET_MERGE_STREAM_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include - -namespace nnet { - -template -void add(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -AddLoop: - for (int i = 0; 
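
[Review note] The concatenate3d dispatch maps Keras axis values onto the three variants: axis 3 or -1 packs along the feature dimension, axis 2 or -2 along the middle dimension, and anything else appends whole tensors. The last-axis index arithmetic, spelled out with illustrative shapes:

    // Hypothetical shapes: data1 is (2, 3, 4), data2 is (2, 3, 2), result is (2, 3, 6).
    struct concat3_config1 {
        static const unsigned n_elem1_0 = 2, n_elem1_1 = 3, n_elem1_2 = 4;
        static const unsigned n_elem2_0 = 2, n_elem2_1 = 3, n_elem2_2 = 2;
        static const int axis = -1;                   // selects concatenate3d_2
    };
    // data1(i, j, k) lands at res[i*3*6 + j*6 + k]
    // data2(i, j, k) lands at res[i*3*6 + j*6 + 4 + k]
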
i < CONFIG_T::n_elem / input1_T::size; i++) { - #pragma HLS PIPELINE - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - AddPack: - for (int j = 0; j < res_T::size; j++) { - #pragma HLS UNROLL - out_data[j] = in_data1[j] + in_data2[j]; - } - - res.write(out_data); - } -} - -template -void subtract(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -SubtractLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - #pragma HLS PIPELINE - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - SubtractPack: - for (int j = 0; j < res_T::size; j++) { - #pragma HLS UNROLL - out_data[j] = in_data1[j] - in_data2[j]; - } - - res.write(out_data); - } -} - -template -void multiply(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -MultiplyLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - MultiplyPack: - for (int j = 0; j < res_T::size; j++) { - #pragma HLS UNROLL - out_data[j] = in_data1[j] * in_data2[j]; - } - - res.write(out_data); - } -} - -template -void average(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -AverageLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - AveragePack: - for (int j = 0; j < res_T::size; j++) { - #pragma HLS UNROLL - out_data[j] = (in_data1[j] + in_data2[j]) / (typename res_T::value_type)2; - } - - res.write(out_data); - } -} - -template -void maximum(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -MaximumLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - MaximumPack: - for (int j = 0; j < res_T::size; j++) { - #pragma HLS UNROLL - out_data[j] = (in_data1[j] > in_data2[j]) ? in_data1[j] : in_data2[j]; - } - - res.write(out_data); - } -} - -template -void minimum(hls::stream &data1, hls::stream &data2, hls::stream &res) { - assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); - -MinimumLoop: - for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - MinimumPack: - for (int j = 0; j < res_T::size; j++) { - #pragma HLS UNROLL - out_data[j] = (in_data1[j] < in_data2[j]) ? 
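
[Review note] In this streaming flavour each stream word is a packed array whose ::size divides n_elem, so every loop runs n_elem / size trips. A sketch assuming hls4ml's nnet::array pack type from nnet_types.h; the typedefs and sizes are illustrative:

    typedef nnet::array<ap_fixed<16, 6>, 4> pack_t;   // 4 values per stream word

    struct stream_add_config1 {
        static const unsigned n_elem = 16;            // 16 values -> 4 words of 4
    };

    void stream_add_example(hls::stream<pack_t> &a, hls::stream<pack_t> &b,
                            hls::stream<pack_t> &r) {
        nnet::add<pack_t, pack_t, pack_t, stream_add_config1>(a, b, r);
    }
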
in_data1[j] : in_data2[j]; - } - - res.write(out_data); - } -} - -template -void concatenate3d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight1: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - ConcatLoopWidth1: - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - #pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - res.write(out_data); - } - } -ConcatLoopHeight2: - for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { - ConcatLoopWidth2: - for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { - #pragma HLS PIPELINE II=1 - - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data2[k]; - } - - res.write(out_data); - } - } -} - -template -void concatenate3d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - ConcatLoopWidth1: - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - #pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - res.write(out_data); - } - ConcatLoopWidth2: - for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { - #pragma HLS PIPELINE II=1 - - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data2[k]; - } - - res.write(out_data); - } - } -} - -template -void concatenate3d_2(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - ConcatLoopWidth: - for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { - #pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - #pragma HLS UNROLL - out_data[input1_T::size + k] = in_data2[k]; - } - - res.write(out_data); - } - } -} - -template -void concatenate3d(hls::stream &data1, hls::stream &data2, hls::stream &res) { - if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { - concatenate3d_2(data1, data2, res); - } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { - concatenate3d_1(data1, data2, res); - } else { - concatenate3d_0(data1, data2, res); - } -} - -template -void concatenate2d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight1: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - #pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - res.write(out_data); - } -ConcatLoopHeight2: - for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { - #pragma HLS PIPELINE II=1 - - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data2[k]; - } 
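
// [Review note] Sizing rule for the last-axis stream concatenates (concatenate3d_2
// above, concatenate2d_1 below): both input words are packed into one output word,
// so the output pack must be exactly as wide as the two inputs combined.
// Illustrative typedefs, not from this patch:
//     typedef nnet::array<ap_fixed<16, 6>, 4> cat_in1_t;
//     typedef nnet::array<ap_fixed<16, 6>, 2> cat_in2_t;
//     typedef nnet::array<ap_fixed<16, 6>, 6> cat_out_t;  // size 4 + 2, one word per (i, j)
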
- - res.write(out_data); - } -} - -template -void concatenate2d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { -ConcatLoopHeight: - for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { - #pragma HLS PIPELINE II=1 - - input1_T in_data1 = data1.read(); - input2_T in_data2 = data2.read(); - res_T out_data; - PRAGMA_DATA_PACK(out_data) - - ConcatPackInput1: - for (int k = 0; k < input1_T::size; k++) { - #pragma HLS UNROLL - out_data[k] = in_data1[k]; - } - - ConcatPackInput2: - for (int k = 0; k < input2_T::size; k++) { - #pragma HLS UNROLL - out_data[input1_T::size + k] = in_data2[k]; - } - - res.write(out_data); - } -} - -template -void concatenate2d(hls::stream &data1, hls::stream &data2, hls::stream &res) { - if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { - concatenate2d_1(data1, data2, res); - } else { - concatenate2d_0(data1, data2, res); - } -} - -template -void concatenate1d(hls::stream &data1, hls::stream &data2, hls::stream &res) { - res_T out_data; - PRAGMA_DATA_PACK(out_data) -ConcatLoop1: - for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { - #pragma HLS PIPELINE - input1_T in_data1 = data1.read(); - ConcatPack1: - for (int j = 0; j < input1_T::size; j++) { - #pragma HLS UNROLL - out_data[j + (i * input1_T::size)] = in_data1[j]; - } - } -ConcatLoop2: - for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) { - #pragma HLS PIPELINE - input2_T in_data2 = data2.read(); - ConcatPack2: - for (int j = 0; j < input2_T::size; j++) { - #pragma HLS UNROLL - out_data[j + (i * input2_T::size) + (CONFIG_T::n_elem1_0)] = in_data2[j]; - } - } - res.write(out_data); -} -} // namespace nnet - -#endif +#ifndef NNET_MERGE_STREAM_H_ +#define NNET_MERGE_STREAM_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include + +namespace nnet { + +template +void add(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AddLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + AddPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data1[j] + in_data2[j]; + } + + res.write(out_data); + } +} + +template +void subtract(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +SubtractLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + SubtractPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data1[j] - in_data2[j]; + } + + res.write(out_data); + } +} + +template +void multiply(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MultiplyLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MultiplyPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = in_data1[j] * in_data2[j]; + } + + res.write(out_data); + } +} + +template +void average(hls::stream &data1, hls::stream &data2, hls::stream &res) { + 
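
// [Review note] concatenate1d is the odd one out: it buffers the entire result and
// performs a single stream write, so the output word must span n_elem1_0 + n_elem2_0
// values regardless of the input pack sizes. Illustrative sizing:
//     typedef nnet::array<ap_fixed<16, 6>, 10> in_a_t;
//     typedef nnet::array<ap_fixed<16, 6>, 6>  in_b_t;
//     typedef nnet::array<ap_fixed<16, 6>, 16> cat_t;  // must hold 10 + 6 values
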
assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +AverageLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + AveragePack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = (in_data1[j] + in_data2[j]) / (typename res_T::value_type)2; + } + + res.write(out_data); + } +} + +template +void maximum(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MaximumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MaximumPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = (in_data1[j] > in_data2[j]) ? in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void minimum(hls::stream &data1, hls::stream &data2, hls::stream &res) { + assert(input1_T::size == input2_T::size && input1_T::size == res_T::size); + +MinimumLoop: + for (int i = 0; i < CONFIG_T::n_elem / input1_T::size; i++) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + MinimumPack: + for (int j = 0; j < res_T::size; j++) { + #pragma HLS UNROLL + out_data[j] = (in_data1[j] < in_data2[j]) ? in_data1[j] : in_data2[j]; + } + + res.write(out_data); + } +} + +template +void concatenate3d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth1: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } + ConcatLoopWidth2: + for (int j = 0; j < CONFIG_T::n_elem2_1; j++) { + #pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d_2(hls::stream &data1, hls::stream &data2, hls::stream 
&res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + ConcatLoopWidth: + for (int j = 0; j < CONFIG_T::n_elem1_1; j++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } + } +} + +template +void concatenate3d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + if (CONFIG_T::axis == 3 || CONFIG_T::axis == -1) { + concatenate3d_2(data1, data2, res); + } else if (CONFIG_T::axis == 2 || CONFIG_T::axis == -2) { + concatenate3d_1(data1, data2, res); + } else { + concatenate3d_0(data1, data2, res); + } +} + +template +void concatenate2d_0(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight1: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + res.write(out_data); + } +ConcatLoopHeight2: + for (int i = 0; i < CONFIG_T::n_elem2_0; i++) { + #pragma HLS PIPELINE II=1 + + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d_1(hls::stream &data1, hls::stream &data2, hls::stream &res) { +ConcatLoopHeight: + for (int i = 0; i < CONFIG_T::n_elem1_0; i++) { + #pragma HLS PIPELINE II=1 + + input1_T in_data1 = data1.read(); + input2_T in_data2 = data2.read(); + res_T out_data; + PRAGMA_DATA_PACK(out_data) + + ConcatPackInput1: + for (int k = 0; k < input1_T::size; k++) { + #pragma HLS UNROLL + out_data[k] = in_data1[k]; + } + + ConcatPackInput2: + for (int k = 0; k < input2_T::size; k++) { + #pragma HLS UNROLL + out_data[input1_T::size + k] = in_data2[k]; + } + + res.write(out_data); + } +} + +template +void concatenate2d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + if (CONFIG_T::axis == 2 || CONFIG_T::axis == -1) { + concatenate2d_1(data1, data2, res); + } else { + concatenate2d_0(data1, data2, res); + } +} + +template +void concatenate1d(hls::stream &data1, hls::stream &data2, hls::stream &res) { + res_T out_data; + PRAGMA_DATA_PACK(out_data) +ConcatLoop1: + for (int i = 0; i < CONFIG_T::n_elem1_0 / input1_T::size; i++) { + #pragma HLS PIPELINE + input1_T in_data1 = data1.read(); + ConcatPack1: + for (int j = 0; j < input1_T::size; j++) { + #pragma HLS UNROLL + out_data[j + (i * input1_T::size)] = in_data1[j]; + } + } +ConcatLoop2: + for (int i = 0; i < CONFIG_T::n_elem2_0 / input2_T::size; i++) { + #pragma HLS PIPELINE + input2_T in_data2 = data2.read(); + ConcatPack2: + for (int j = 0; j < input2_T::size; j++) { + #pragma HLS UNROLL + out_data[j + (i * input2_T::size) + (CONFIG_T::n_elem1_0)] = in_data2[j]; + } + } + res.write(out_data); +} +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h index edf8f739b9..00d1c6d12b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_mult.h +++ 
b/hls4ml/templates/vivado/nnet_utils/nnet_mult.h @@ -1,116 +1,116 @@ -#ifndef NNET_MULT_H_ -#define NNET_MULT_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_helpers.h" -#include -#include - -namespace nnet { - -namespace product { - -/* --- - * different methods to perform the product of input and weight, depending on the - * types of each. - * --- */ - -class Product {}; - -template class both_binary : public Product { - public: - static x_T product(x_T a, w_T w) { - // specialisation for 1-bit weights and incoming data - #pragma HLS INLINE - return a == w; - } -}; - -template class weight_binary : public Product { - public: - static auto product(x_T a, w_T w) -> decltype(-a) { - // Specialisation for 1-bit weights, arbitrary data - #pragma HLS INLINE - if (w == 0) - return -a; - else - return a; - } -}; - -template class data_binary : public Product { - public: - static auto product(x_T a, w_T w) -> decltype(-w) { - // Specialisation for 1-bit data, arbitrary weight - #pragma HLS INLINE - if (a == 0) - return -w; - else - return w; - } -}; - -template class weight_ternary : public Product { - public: - static auto product(x_T a, w_T w) -> decltype(-a) { - // Specialisation for 2-bit weights, arbitrary data - #pragma HLS INLINE - if (w == 0) - return 0; - else if (w == -1) - return -a; - else - return a; // if(w == 1) - } -}; - -template class mult : public Product { - public: - static auto product(x_T a, w_T w) -> decltype(a * w) { - // 'Normal' product - #pragma HLS INLINE - return a * w; - } -}; - -template class weight_exponential : public Product { - public: - using r_T = ap_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width)>; - static r_T product(x_T a, w_T w) { - // Shift product for exponential weights - #pragma HLS INLINE - - // Shift by the exponent. Negative weights shift right - r_T y = static_cast(a) << w.weight; - - // Negate or not depending on weight sign - return w.sign == 1 ? y : static_cast(-y); - } -}; - -} // namespace product - -template -inline typename std::enable_if>::value && - std::is_same>::value, - ap_int>::type -cast(typename CONFIG_T::accum_t x) { - return (ap_int)(x - CONFIG_T::n_in / 2) * 2; -} - -template -inline typename std::enable_if< - std::is_same>::value && !std::is_same>::value, res_T>::type -cast(typename CONFIG_T::accum_t x) { - return (res_T)x; -} - -template -inline typename std::enable_if<(!std::is_same>::value), res_T>::type cast(typename CONFIG_T::accum_t x) { - return (res_T)x; -} - -} // namespace nnet - -#endif +#ifndef NNET_MULT_H_ +#define NNET_MULT_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include +#include + +namespace nnet { + +namespace product { + +/* --- + * different methods to perform the product of input and weight, depending on the + * types of each. 
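
[Review note] These Product specialisations are how a layer config chooses its multiply implementation at compile time; kernels invoke it as CONFIG_T::template product<x_T, w_T>::product(a, w). A dense-style config names one through the product alias; all fields below are illustrative:

    struct dense_config1 {
        static const unsigned n_in = 16;
        static const unsigned n_out = 8;
        static const unsigned reuse_factor = 1;
        typedef ap_fixed<24, 10> accum_t;
        typedef ap_fixed<16, 6> weight_t;
        // Full-precision multiply; a binarised layer would swap in
        // nnet::product::weight_binary, a ternarised one weight_ternary.
        template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
    };
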
+ * --- */ + +class Product {}; + +template class both_binary : public Product { + public: + static x_T product(x_T a, w_T w) { + // specialisation for 1-bit weights and incoming data + #pragma HLS INLINE + return a == w; + } +}; + +template class weight_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 1-bit weights, arbitrary data + #pragma HLS INLINE + if (w == 0) + return -a; + else + return a; + } +}; + +template class data_binary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-w) { + // Specialisation for 1-bit data, arbitrary weight + #pragma HLS INLINE + if (a == 0) + return -w; + else + return w; + } +}; + +template class weight_ternary : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(-a) { + // Specialisation for 2-bit weights, arbitrary data + #pragma HLS INLINE + if (w == 0) + return 0; + else if (w == -1) + return -a; + else + return a; // if(w == 1) + } +}; + +template class mult : public Product { + public: + static auto product(x_T a, w_T w) -> decltype(a * w) { + // 'Normal' product + #pragma HLS INLINE + return a * w; + } +}; + +template class weight_exponential : public Product { + public: + using r_T = ap_fixed<2 * (decltype(w_T::weight)::width + x_T::width), (decltype(w_T::weight)::width + x_T::width)>; + static r_T product(x_T a, w_T w) { + // Shift product for exponential weights + #pragma HLS INLINE + + // Shift by the exponent. Negative weights shift right + r_T y = static_cast(a) << w.weight; + + // Negate or not depending on weight sign + return w.sign == 1 ? y : static_cast(-y); + } +}; + +} // namespace product + +template +inline typename std::enable_if>::value && + std::is_same>::value, + ap_int>::type +cast(typename CONFIG_T::accum_t x) { + return (ap_int)(x - CONFIG_T::n_in / 2) * 2; +} + +template +inline typename std::enable_if< + std::is_same>::value && !std::is_same>::value, res_T>::type +cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +template +inline typename std::enable_if<(!std::is_same>::value), res_T>::type cast(typename CONFIG_T::accum_t x) { + return (res_T)x; +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_padding.h b/hls4ml/templates/vivado/nnet_utils/nnet_padding.h index 2df5a00705..e48a2fb47e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_padding.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_padding.h @@ -1,145 +1,145 @@ -#ifndef NNET_PADDING_H_ -#define NNET_PADDING_H_ - -#include - -namespace nnet { - -struct padding1d_config { - static const unsigned n_chan = 10; - static const unsigned in_width = 10; - static const unsigned out_width = 10; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; -}; - -template -void zeropad1d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], data_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { - #pragma HLS PIPELINE - - for (int j = 0; j < CONFIG_T::n_chan; j++) { - for (int i = 0; i < CONFIG_T::pad_left; i++) { - *(res++) = 0; - } - - for (int i = 0; i < CONFIG_T::in_width; i++) { - *(res++) = (res_T) * (data++); - } - - for (int i = 0; i < CONFIG_T::pad_right; i++) { - *(res++) = 0; - } - } -} - -template -void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { - #pragma HLS PIPELINE - - for (int i = 0; i < CONFIG_T::pad_left; i++) { - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = 0; - } - } - - for (int i = 0; i < 
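
// [Review note] The first cast() overload below is the XNOR popcount correction for
// fully binarised layers: with 1-bit data and weights, product() returns a == w, so
// the accumulator counts agreements in [0, n_in], and (x - n_in/2) * 2 recentres
// that count onto the signed dot product. Worked arithmetic:
//     n_in = 16, 13 agreements: (13 - 8) * 2 = +10 = 13*(+1) + 3*(-1)
//     n_in = 16,  3 agreements: ( 3 - 8) * 2 = -10 =  3*(+1) + 13*(-1)
// i.e. cast() recovers (#agreements - #disagreements), the true +/-1 dot product.
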
CONFIG_T::in_width; i++) { - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = (res_T) * (data++); - } - } - - for (int i = 0; i < CONFIG_T::pad_right; i++) { - for (int j = 0; j < CONFIG_T::n_chan; j++) { - *(res++) = 0; - } - } -} - -struct padding2d_config { - static const unsigned n_chan = 10; - static const unsigned in_height = 10; - static const unsigned in_width = 10; - static const unsigned out_height = 10; - static const unsigned out_width = 10; - static const unsigned pad_top = 0; - static const unsigned pad_bottom = 0; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; -}; - -template -void zeropad2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], - data_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { - #pragma HLS PIPELINE - - for (int k = 0; k < CONFIG_T::n_chan; k++) { - - for (int i = 0; i < CONFIG_T::pad_top; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - *(res++) = 0; - } - } - - for (int i = 0; i < CONFIG_T::in_height; i++) { - for (int j = 0; j < CONFIG_T::pad_left; j++) { - *(res++) = 0; - } - for (int j = 0; j < CONFIG_T::in_width; j++) { - *(res++) = (res_T) * (data++); - } - for (int j = 0; j < CONFIG_T::pad_right; j++) { - *(res++) = 0; - } - } - - for (int i = 0; i < CONFIG_T::pad_bottom; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - *(res++) = 0; - } - } - } -} - -template -void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], - res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { - #pragma HLS PIPELINE - - for (int i = 0; i < CONFIG_T::pad_top; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } - - for (int i = 0; i < CONFIG_T::in_height; i++) { - for (int j = 0; j < CONFIG_T::pad_left; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - for (int j = 0; j < CONFIG_T::in_width; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = (res_T) * (data++); - } - } - for (int j = 0; j < CONFIG_T::pad_right; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } - - for (int i = 0; i < CONFIG_T::pad_bottom; i++) { - for (int j = 0; j < CONFIG_T::out_width; j++) { - for (int k = 0; k < CONFIG_T::n_chan; k++) { - *(res++) = 0; - } - } - } -} - -} // namespace nnet - -#endif +#ifndef NNET_PADDING_H_ +#define NNET_PADDING_H_ + +#include + +namespace nnet { + +struct padding1d_config { + static const unsigned n_chan = 10; + static const unsigned in_width = 10; + static const unsigned out_width = 10; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad1d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], data_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int j = 0; j < CONFIG_T::n_chan; j++) { + for (int i = 0; i < CONFIG_T::pad_left; i++) { + *(res++) = 0; + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + *(res++) = (res_T) * (data++); + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + *(res++) = 0; + } + } +} + +template +void zeropad1d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_width], res_T res[CONFIG_T::n_chan * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::pad_left; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_width; i++) { + for (int j = 0; j 
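
[Review note] The _cl variants walk channels-last memory, so padding is emitted as whole channel groups. A config sketch showing how the widths relate; the values are illustrative:

    struct pad1d_config1 {
        static const unsigned n_chan = 3;
        static const unsigned in_width = 8;
        static const unsigned pad_left = 2;
        static const unsigned pad_right = 1;
        static const unsigned out_width = pad_left + in_width + pad_right;   // 11
    };
    // zeropad1d_cl: 8*3 = 24 input values -> 11*3 = 33 outputs;
    // res[0..5] and res[30..32] are the zeroed pad groups.
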
< CONFIG_T::n_chan; j++) { + *(res++) = (res_T) * (data++); + } + } + + for (int i = 0; i < CONFIG_T::pad_right; i++) { + for (int j = 0; j < CONFIG_T::n_chan; j++) { + *(res++) = 0; + } + } +} + +struct padding2d_config { + static const unsigned n_chan = 10; + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned out_height = 10; + static const unsigned out_width = 10; + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; +}; + +template +void zeropad2d_cf(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + data_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int k = 0; k < CONFIG_T::n_chan; k++) { + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + *(res++) = 0; + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + *(res++) = (res_T) * (data++); + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + *(res++) = 0; + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + *(res++) = 0; + } + } + } +} + +template +void zeropad2d_cl(data_T data[CONFIG_T::n_chan * CONFIG_T::in_height * CONFIG_T::in_width], + res_T res[CONFIG_T::n_chan * CONFIG_T::out_height * CONFIG_T::out_width]) { + #pragma HLS PIPELINE + + for (int i = 0; i < CONFIG_T::pad_top; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::in_height; i++) { + for (int j = 0; j < CONFIG_T::pad_left; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + for (int j = 0; j < CONFIG_T::in_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = (res_T) * (data++); + } + } + for (int j = 0; j < CONFIG_T::pad_right; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } + + for (int i = 0; i < CONFIG_T::pad_bottom; i++) { + for (int j = 0; j < CONFIG_T::out_width; j++) { + for (int k = 0; k < CONFIG_T::n_chan; k++) { + *(res++) = 0; + } + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h index c6bec85d40..bb9f0b3f05 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_pooling.h @@ -1,313 +1,313 @@ -#ifndef NNET_POOLING_H_ -#define NNET_POOLING_H_ - -#include "nnet_helpers.h" -#include - -namespace nnet { - -// Return the maximum value from an array -template accum_t max(T x[N]) { - T y = x[0]; - for (int i = 1; i < N; i++) { - y = x[i] > y ? 
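
[Review note] Same pattern in 2-D: the output dimensions are just the input dimensions plus padding on each side, and zeropad2d_cl expects channels-last data. An illustrative config:

    struct pad2d_config1 {
        static const unsigned n_chan = 4;
        static const unsigned in_height = 6, in_width = 6;
        static const unsigned pad_top = 1, pad_bottom = 1, pad_left = 1, pad_right = 1;
        static const unsigned out_height = pad_top + in_height + pad_bottom;   // 8
        static const unsigned out_width = pad_left + in_width + pad_right;     // 8
    };
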
x[i] : y; - } - return y; -} - -// Return the mean value of an array -template accum_t avg(T (&x)[N], unsigned length) { - accum_t y = 0; - for (int i = 0; i < N; i++) { - y += x[i]; - } - y /= length; - return y; -} - -// Enumeration for pooling operation (max, avg, l2norm pooling) -enum Pool_Op { Max, Average }; // L2Norm }; -template accum_t pool_op(T (&x)[N], unsigned length) { - switch (op) { - case Max: - return max(x); - case Average: - return avg(x, length); - // case L2Norm: return l2norm(x); - } -} - -template accum_t pool_op(T (&x)[N]) { - return pool_op(x, N); -} - -template T pad_val() { - /*--- - *- In Tensorflow, pooling ignores the value in the padded cells - *- For Avg pooling, return 0 (the divisior is modified to the - *- area overlapping the unpadded image. - *- For max pooling, return the most negative value for the type. - *- TODO this is not really generic, it assumes fixed point or integer T - ---*/ - switch (op) { - case Max: { - T x = 0; - x[x.width - 1] = 1; - return x; - break; - } - case Average: - return 0; - } -} - -struct pooling1d_config { - // IO size - static const unsigned n_in = 10; - static const unsigned pool_width = 2; - static const unsigned stride_width = 2; - static const unsigned n_out = (n_in - pool_width) / stride_width + 1; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const bool count_pad = false; - // Pooling function - static const Pool_Op pool_op = Max; -}; - -template constexpr int pool_op_limit_1d() { - return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor; -} - -template -void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - // TODO partition the arrays according to the reuse factor - const int limit = pool_op_limit_1d(); - #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - // Add any necessary padding - - // Add padding and reduce input width to area covered by pooling function - static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; - static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; - - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - // Loop over input image x in steps of stride - for (int ii = 0; ii < restricted_padded_width; ii += CONFIG_T::stride_width) { - unsigned overlap_pixel = 0; - data_T pool[CONFIG_T::pool_width]; - #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 - - for (int jj = 0; jj < CONFIG_T::pool_width; jj++) { - if (ii + jj >= CONFIG_T::pad_left && ii + jj < CONFIG_T::n_in + CONFIG_T::pad_left) { - pool[jj] = data[(ii + jj - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; - overlap_pixel++; - } else - pool[jj] = pad_val(); - } - - int patch_size = CONFIG_T::count_pad ? 
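
// [Review note] pad_val's Max branch is a bit trick: starting from zero and setting
// only the top (sign) bit yields the most negative representable value, so padded
// cells can never win a max comparison. Worked example for T = ap_fixed<8, 4>
// (range -8.0 .. +7.9375):
//     T x = 0;             // bits 00000000
//     x[x.width - 1] = 1;  // bits 10000000  ->  -8.0, the type's minimum
// Every real pixel therefore beats padding inside max().
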
CONFIG_T::stride_width : overlap_pixel; - - res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = - pool_op(pool, patch_size); - } - } -} - -template -void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); - assert(CONFIG_T::pool_width == CONFIG_T::stride_width); - - // TODO partition the arrays according to the reuse factor - const int limit = pool_op_limit_1d(); - #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - data_T pool[CONFIG_T::n_in]; - #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 - for (int jj = 0; jj < CONFIG_T::n_in; jj++) { - pool[jj] = data[jj * CONFIG_T::n_filt + ff]; - } - // do the pooling - res[ff] = pool_op(pool); - } -} - -struct pooling2d_config { - // IO size - static const unsigned in_height = 10; - static const unsigned in_width = 10; - static const unsigned n_filt = 4; - static const unsigned stride_height = 2; - static const unsigned stride_width = 2; - static const unsigned pool_height = 2; - static const unsigned pool_width = 2; - static const unsigned out_height = (in_height - pool_height) / stride_height + 1; - static const unsigned out_width = (in_width - pool_width) / stride_width + 1; - // Padding - static const unsigned pad_top = 0; - static const unsigned pad_bottom = 0; - static const unsigned pad_left = 0; - static const unsigned pad_right = 0; - static const bool count_pad = false; - // Pooling function - static const Pool_Op pool_op = Max; - // Reuse factor - static const unsigned reuse_factor = 1; - - // Internal data type definitions - typedef float accum_t; -}; - -template constexpr int pool_op_limit() { - return (CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt / CONFIG_T::reuse_factor; -} - -template -void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - // TODO partition the arrays according to the reuse factor - const int limit = pool_op_limit(); - #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - - // Add padding and reduce input width to area covered by pooling function - static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; - static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; - static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; - static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; - - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - - // Loop over input image y in steps of stride - for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { - // Loop over input image x in steps of stride - for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { - data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; - #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 - - unsigned overlap_pixel = 0; - - // Loop over pool window y - for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { - // Loop over pool window x - for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { - bool cond1 = ii + kk >= CONFIG_T::pad_top 
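
// [Review note] count_pad selects between the two common average-pool conventions at
// the borders. With pool_width = 3 and pad_left = 1, the first window covers
// [pad, x0, x1]; note the code sizes windows by stride_width, i.e. it assumes the
// stride equals the pool width:
//     count_pad = false: patch_size = overlap_pixel = 2 -> avg = (x0 + x1) / 2   (TF-style)
//     count_pad = true : patch_size = stride_width  = 3 -> avg = (x0 + x1) / 3
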
&& ii + kk < CONFIG_T::in_height + CONFIG_T::pad_top; - bool cond2 = jj + ll >= CONFIG_T::pad_left && jj + ll < CONFIG_T::in_width + CONFIG_T::pad_left; - if (cond1 && cond2) { - unsigned data_idx = - ((ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + (jj + ll - CONFIG_T::pad_left)) * - CONFIG_T::n_filt + - ff; - pool[kk * CONFIG_T::stride_width + ll] = data[data_idx]; - overlap_pixel++; - } else - pool[kk * CONFIG_T::stride_width + ll] = pad_val(); - } - } - - int patch_size = CONFIG_T::count_pad ? CONFIG_T::stride_width * CONFIG_T::stride_height : overlap_pixel; - - res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + - (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = - pool_op(pool, patch_size); - } - } - } -} - -template -void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], - res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - // TODO partition the arrays according to the reuse factor - const int limit = pool_op_limit(); - #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit - // Add padding and reduce input width to area covered by pooling function - static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; - static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; - static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; - static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; - - for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { - // Loop over input image y in steps of stride - for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { - // Loop over input image x in steps of stride - for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { - data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; - #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 - // Keep track of number of pixels in image vs padding region - unsigned img_overlap = 0; - // Loop over pool window y - for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { - // Loop over pool window x - for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { - if (ii + kk < CONFIG_T::pad_top || ii + kk >= (full_padded_height - CONFIG_T::pad_bottom) || - jj + ll < CONFIG_T::pad_left || jj + ll >= (full_padded_width - CONFIG_T::pad_right)) { - // Add padding - pool[kk * CONFIG_T::stride_width + ll] = pad_val(); - if (CONFIG_T::count_pad) - img_overlap++; - } else { - pool[kk * CONFIG_T::stride_width + ll] = - data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + - ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; - img_overlap++; - } - } - } - // do the pooling - // TODO in the case of average pooling, need to reduce height * width to area of pool window - // not overlapping padding region - res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + - ff * CONFIG_T::out_height * CONFIG_T::out_width] = - pool_op(pool); - // If the pool op is Average, the zero-padding needs to be removed from the results - if (CONFIG_T::pool_op == Average) { - data_T rescale = - static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; - res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + - 
ff * CONFIG_T::out_height * CONFIG_T::out_width] *= rescale; - } - } - } - } -} - -template -void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], - res_T res[CONFIG_T::n_filt]) { - assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); - assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); - assert(CONFIG_T::pool_width == CONFIG_T::stride_width); - assert(CONFIG_T::pool_height == CONFIG_T::stride_height); - - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - - const int limit = pool_op_limit(); - #pragma HLS ALLOCATION instances=pool_op limit=limit function - -FiltLoop: - for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { - data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; - - InputLoop: - for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) { - pool[i] = data[i * CONFIG_T::n_filt + filt]; - } - - res[filt] = static_cast( - pool_op(pool)); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_POOLING_H_ +#define NNET_POOLING_H_ + +#include "nnet_helpers.h" +#include + +namespace nnet { + +// Return the maximum value from an array +template accum_t max(T x[N]) { + T y = x[0]; + for (int i = 1; i < N; i++) { + y = x[i] > y ? x[i] : y; + } + return y; +} + +// Return the mean value of an array +template accum_t avg(T (&x)[N], unsigned length) { + accum_t y = 0; + for (int i = 0; i < N; i++) { + y += x[i]; + } + y /= length; + return y; +} + +// Enumeration for pooling operation (max, avg, l2norm pooling) +enum Pool_Op { Max, Average }; // L2Norm }; +template accum_t pool_op(T (&x)[N], unsigned length) { + switch (op) { + case Max: + return max(x); + case Average: + return avg(x, length); + // case L2Norm: return l2norm(x); + } +} + +template accum_t pool_op(T (&x)[N]) { + return pool_op(x, N); +} + +template T pad_val() { + /*--- + *- In Tensorflow, pooling ignores the value in the padded cells + *- For Avg pooling, return 0 (the divisior is modified to the + *- area overlapping the unpadded image. + *- For max pooling, return the most negative value for the type. 
+ *- TODO this is not really generic, it assumes fixed point or integer T + ---*/ + switch (op) { + case Max: { + T x = 0; + x[x.width - 1] = 1; + return x; + break; + } + case Average: + return 0; + } +} + +struct pooling1d_config { + // IO size + static const unsigned n_in = 10; + static const unsigned pool_width = 2; + static const unsigned stride_width = 2; + static const unsigned n_out = (n_in - pool_width) / stride_width + 1; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + // Pooling function + static const Pool_Op pool_op = Max; +}; + +template constexpr int pool_op_limit_1d() { + return CONFIG_T::n_in * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_out * CONFIG_T::n_filt]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add any necessary padding + + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image x in steps of stride + for (int ii = 0; ii < restricted_padded_width; ii += CONFIG_T::stride_width) { + unsigned overlap_pixel = 0; + data_T pool[CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + + for (int jj = 0; jj < CONFIG_T::pool_width; jj++) { + if (ii + jj >= CONFIG_T::pad_left && ii + jj < CONFIG_T::n_in + CONFIG_T::pad_left) { + pool[jj] = data[(ii + jj - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff]; + overlap_pixel++; + } else + pool[jj] = pad_val(); + } + + int patch_size = CONFIG_T::count_pad ? 
CONFIG_T::stride_width : overlap_pixel; + + res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = + pool_op(pool, patch_size); + } + } +} + +template +void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONFIG_T::n_filt]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit_1d(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + data_T pool[CONFIG_T::n_in]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + for (int jj = 0; jj < CONFIG_T::n_in; jj++) { + pool[jj] = data[jj * CONFIG_T::n_filt + ff]; + } + // do the pooling + res[ff] = pool_op(pool); + } +} + +struct pooling2d_config { + // IO size + static const unsigned in_height = 10; + static const unsigned in_width = 10; + static const unsigned n_filt = 4; + static const unsigned stride_height = 2; + static const unsigned stride_width = 2; + static const unsigned pool_height = 2; + static const unsigned pool_width = 2; + static const unsigned out_height = (in_height - pool_height) / stride_height + 1; + static const unsigned out_width = (in_width - pool_width) / stride_width + 1; + // Padding + static const unsigned pad_top = 0; + static const unsigned pad_bottom = 0; + static const unsigned pad_left = 0; + static const unsigned pad_right = 0; + static const bool count_pad = false; + // Pooling function + static const Pool_Op pool_op = Max; + // Reuse factor + static const unsigned reuse_factor = 1; + + // Internal data type definitions + typedef float accum_t; +}; + +template constexpr int pool_op_limit() { + return (CONFIG_T::out_height * CONFIG_T::out_width) * CONFIG_T::n_filt / CONFIG_T::reuse_factor; +} + +template +void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + + // Loop over input image y in steps of stride + for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + + unsigned overlap_pixel = 0; + + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + bool cond1 = ii + kk >= CONFIG_T::pad_top 
&& ii + kk < CONFIG_T::in_height + CONFIG_T::pad_top; + bool cond2 = jj + ll >= CONFIG_T::pad_left && jj + ll < CONFIG_T::in_width + CONFIG_T::pad_left; + if (cond1 && cond2) { + unsigned data_idx = + ((ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + (jj + ll - CONFIG_T::pad_left)) * + CONFIG_T::n_filt + + ff; + pool[kk * CONFIG_T::stride_width + ll] = data[data_idx]; + overlap_pixel++; + } else + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + } + } + + int patch_size = CONFIG_T::count_pad ? CONFIG_T::stride_width * CONFIG_T::stride_height : overlap_pixel; + + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt + + (jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] = + pool_op(pool, patch_size); + } + } + } +} + +template +void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::out_height * CONFIG_T::out_width * CONFIG_T::n_filt]) { + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + // TODO partition the arrays according to the reuse factor + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION function instances=CONFIG_T::pool_op limit=limit + // Add padding and reduce input width to area covered by pooling function + static constexpr int full_padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right; + static constexpr int full_padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom; + static constexpr int restricted_padded_width = full_padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width; + static constexpr int restricted_padded_height = full_padded_height / CONFIG_T::stride_height * CONFIG_T::stride_height; + + for (int ff = 0; ff < CONFIG_T::n_filt; ff++) { + // Loop over input image y in steps of stride + for (int ii = 0; ii < restricted_padded_height; ii += CONFIG_T::stride_height) { + // Loop over input image x in steps of stride + for (int jj = 0; jj < restricted_padded_width; jj += CONFIG_T::stride_width) { + data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width]; + #pragma HLS ARRAY_PARTITION variable=pool complete dim=0 + // Keep track of number of pixels in image vs padding region + unsigned img_overlap = 0; + // Loop over pool window y + for (int kk = 0; kk < CONFIG_T::stride_height; kk++) { + // Loop over pool window x + for (int ll = 0; ll < CONFIG_T::stride_width; ll++) { + if (ii + kk < CONFIG_T::pad_top || ii + kk >= (full_padded_height - CONFIG_T::pad_bottom) || + jj + ll < CONFIG_T::pad_left || jj + ll >= (full_padded_width - CONFIG_T::pad_right)) { + // Add padding + pool[kk * CONFIG_T::stride_width + ll] = pad_val(); + if (CONFIG_T::count_pad) + img_overlap++; + } else { + pool[kk * CONFIG_T::stride_width + ll] = + data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + + ff * CONFIG_T::in_width * CONFIG_T::in_height + ll + jj - CONFIG_T::pad_left]; + img_overlap++; + } + } + } + // do the pooling + // TODO in the case of average pooling, need to reduce height * width to area of pool window + // not overlapping padding region + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + ff * CONFIG_T::out_height * CONFIG_T::out_width] = + pool_op(pool); + // If the pool op is Average, the zero-padding needs to be removed from the results + if (CONFIG_T::pool_op == Average) { + data_T rescale = + static_cast(CONFIG_T::pool_height) * static_cast(CONFIG_T::pool_width) / img_overlap; + res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) + + 
ff * CONFIG_T::out_height * CONFIG_T::out_width] *= rescale; + } + } + } + } +} + +template +void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_T::n_filt], + res_T res[CONFIG_T::n_filt]) { + assert(CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0); + assert(CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0); + assert(CONFIG_T::pool_width == CONFIG_T::stride_width); + assert(CONFIG_T::pool_height == CONFIG_T::stride_height); + + #pragma HLS PIPELINE II=CONFIG_T::reuse_factor + + const int limit = pool_op_limit(); + #pragma HLS ALLOCATION instances=pool_op limit=limit function + +FiltLoop: + for (int filt = 0; filt < CONFIG_T::n_filt; filt++) { + data_T pool[CONFIG_T::in_height * CONFIG_T::in_width]; + + InputLoop: + for (int i = 0; i < CONFIG_T::in_height * CONFIG_T::in_width; i++) { + pool[i] = data[i * CONFIG_T::n_filt + filt]; + } + + res[filt] = static_cast( + pool_op(pool)); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h b/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h index 3e1ebb225d..f68d80663b 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_recr_activations.h @@ -1,56 +1,56 @@ -#ifndef NNET_RECR_ACTIVATION_H_ -#define NNET_RECR_ACTIVATION_H_ - -#include "hls_stream.h" -#include "nnet_activation.h" -#include "nnet_common.h" -#include "nnet_helpers.h" -#include - -namespace nnet { - -namespace activation { - -template class Activation { - public: - // ************************************************* - // Blank Activation - // ************************************************* - static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {} // Nothing to do here -}; - -template class relu : public Activation { - public: - // ************************************************* - // Relu Activation - // ************************************************* - static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - nnet::relu(data, res); - } -}; - -template class sigmoid : public Activation { - public: - // ************************************************* - // Sigmoid Activation - // ************************************************* - static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - nnet::sigmoid(data, res); - } -}; - -template class tanh : public Activation { - public: - // ************************************************* - // TanH Activation - // ************************************************* - static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { - nnet::tanh(data, res); - } -}; - -} // namespace activation - -} // namespace nnet - -#endif +#ifndef NNET_RECR_ACTIVATION_H_ +#define NNET_RECR_ACTIVATION_H_ + +#include "hls_stream.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_helpers.h" +#include + +namespace nnet { + +namespace activation { + +template class Activation { + public: + // ************************************************* + // Blank Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {} // Nothing to do here +}; + +template class relu : public Activation { + public: + // ************************************************* + // Relu Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T 
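
[Review note] These wrappers exist so a recurrent config can carry its activations as types; lstm_config in the next file declares exactly such aliases (relu by default there, with sigmoid/tanh being the conventional LSTM choices). A sketch of how a config plugs them in; the struct name is illustrative and the invocation's type names are schematic:

    struct my_lstm_config /* plus the other lstm_config fields */ {
        template <class x_T, class y_T, class config_T>
        using activation_recr = nnet::activation::sigmoid<x_T, y_T, config_T>;  // gates
        template <class x_T, class y_T, class config_T>
        using activation = nnet::activation::tanh<x_T, y_T, config_T>;          // candidate/state
    };
    // Invoked inside the kernel as:
    //   CONFIG_T::template activation_recr<in_T, out_T, ACT_CONFIG_T>::activation(in, out);
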
res[CONFIG_T::n_in]) { + nnet::relu(data, res); + } +}; + +template class sigmoid : public Activation { + public: + // ************************************************* + // Sigmoid Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::sigmoid(data, res); + } +}; + +template class tanh : public Activation { + public: + // ************************************************* + // TanH Activation + // ************************************************* + static void activation(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + nnet::tanh(data, res); + } +}; + +} // namespace activation + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h index bd8c0e05a9..d3b96ba5fb 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_recurrent.h @@ -1,586 +1,586 @@ -#ifndef NNET_RECURSIVE_H_ -#define NNET_RECURSIVE_H_ - -#include "hls_stream.h" -#include "nnet_activation.h" -#include "nnet_common.h" -#include "nnet_dense.h" -#include "nnet_recr_activations.h" - -namespace nnet { - -struct lstm_config { - // Internal data type definitions - typedef float weight_t; - typedef float bias_t; - - // Layer Sizes - static const unsigned n_in = 2; - static const unsigned n_parts = 20; - static const unsigned n_out = 2; - static const unsigned n_state = 2; - static const unsigned n_4state = 8; - static const unsigned table_size = 1024; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const unsigned n_zeros = 0; - static const bool store_weights_in_bram = false; - static const bool use_static = true; - - template using activation_recr = nnet::activation::relu; - template using activation = nnet::activation::relu; -}; -// Long Short term Memory NN (LSTM) -// Resources: -// https://github.com/nicodjimenez/lstm/blob/master/lstm.py -// https://github.com/llSourcell/LSTM_Networks/blob/master/LSTM%20Demo.ipynb -// https://en.wikipedia.org/wiki/Long_short-term_memory -// Notes: -// - LSTM naming conventions adopted from the above links -// - s_newstate = activation(U*input + W*state) -// - h_output = activation(U*input + W*state)*activation(s_newstate) -// - If softmax is needed on output, perform *outside* this operations -// Originall had a version allows for the state in each layer to be saved, moved this to above (this requires are LARGE -// dense network at the end) -template -void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - res_T s_newstate[CONFIG_T::n_state], typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { - // Initialize the state variable -- will maintain state between function calls - - typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; - typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; - typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) - typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) - typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras 
notation) - typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) - typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; - - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - #pragma HLS ARRAY_PARTITION variable=s_newstate complete - #pragma HLS ARRAY_PARTITION variable=tmpres complete - #pragma HLS ARRAY_PARTITION variable=tmpres_state complete - #pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete - #pragma HLS ARRAY_PARTITION variable=tmpres_c complete - #pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete - #pragma HLS ARRAY_PARTITION variable=inputacc_c complete - #pragma HLS ARRAY_PARTITION variable=s_actstate complete - - nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_newstate, tmpres_state, param_r, param_br); - - for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc; - if (iacc > 2 * CONFIG_T::n_state - 1) - index = iacc + CONFIG_T::n_state; - inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; - } - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc + CONFIG_T::n_state * 2; - inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; - } - - CONFIG_T::template activation_recr::activation( - inputacc_ifo, tmpres_ifo); - - // Now for the confusion matrix - CONFIG_T::template activation::activation( - inputacc_c, tmpres_c); - - // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - s_newstate[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_newstate[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; - } - // Operation: h=act(s)*o - CONFIG_T::template activation::activation( - s_newstate, s_actstate); - - for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { - #pragma HLS UNROLL - h_newstate[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; - } -} - -template -void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - res_T s_newstate[CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { - static res_T h_state[CONFIG_T::n_state]; - static res_T s_state[CONFIG_T::n_state]; - // Initialize the state variable -- will maintain state between function calls - typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; - typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; - typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) - typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) - typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) - typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) - typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; - - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - #pragma HLS ARRAY_PARTITION variable=s_newstate complete - #pragma HLS ARRAY_PARTITION variable=h_state complete - #pragma HLS ARRAY_PARTITION variable=s_state complete - #pragma HLS ARRAY_PARTITION variable=tmpres complete - #pragma HLS ARRAY_PARTITION variable=tmpres_state complete - #pragma HLS ARRAY_PARTITION variable=tmpres_ifo 
complete - #pragma HLS ARRAY_PARTITION variable=tmpres_c complete - #pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete - #pragma HLS ARRAY_PARTITION variable=inputacc_c complete - #pragma HLS ARRAY_PARTITION variable=s_actstate complete - - if (reset_state) { - for (int i_state = 0; i_state < (CONFIG_T::n_state); i_state++) { - #pragma HLS UNROLL - s_state[i_state] = 0; - h_state[i_state] = 0; - } - } - - nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_state, tmpres_state, param_r, - param_br); - - for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc; - if (iacc > 2 * CONFIG_T::n_state - 1) - index = iacc + CONFIG_T::n_state; - inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; - } - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc + CONFIG_T::n_state * 2; - inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; - } - - CONFIG_T::template activation_recr::activation( - inputacc_ifo, tmpres_ifo); - - // Now for the confusion matrix - CONFIG_T::template activation::activation( - inputacc_c, tmpres_c); - - // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - s_state[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_state[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; - s_newstate[iacc] = s_state[iacc]; - } - // Operation: h=act(s)*o - CONFIG_T::template activation::activation( - s_state, s_actstate); - - for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { - #pragma HLS UNROLL - h_state[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; - h_newstate[iacc] = h_state[iacc]; - } -} - -template -void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { - - res_T h_newstate[CONFIG_T::n_state]; - res_T s_newstate[CONFIG_T::n_state]; - data_T data_in[CONFIG_T::n_in]; - bool reset_state = true; - - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - #pragma HLS ARRAY_PARTITION variable=s_newstate complete - - for (int ii = 0; ii < CONFIG_T::n_state; ii++) { - #pragma HLS UNROLL - h_newstate[ii] = 0; - s_newstate[ii] = 0; - } - for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { - for (int j = 0; j < CONFIG_T::n_in; j++) { - #pragma HLS UNROLL - data_in[j] = data[j + iloop * CONFIG_T::n_in]; - } - if (CONFIG_T::use_static) - nnet::lstm_static(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, - param_br); - else - nnet::lstm(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, - param_br); - if (CONFIG_T::n_sequence_out > 1) - for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { - #pragma HLS UNROLL - res[i] = h_newstate[j]; - } - reset_state = false; - } - if (CONFIG_T::n_sequence_out == 1) - for (int i = 0; i < (CONFIG_T::n_state); i++) { - #pragma HLS UNROLL - res[i] = h_newstate[i]; - } -} - -template -void lstm_stack(hls::stream &data_stream, hls::stream &res_stream, - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * 
CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { - - typename res_T::value_type h_newstate[CONFIG_T::n_state]; - typename res_T::value_type s_newstate[CONFIG_T::n_state]; - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - #pragma HLS ARRAY_PARTITION variable=s_newstate complete - - for (int ii = 0; ii < CONFIG_T::n_state; ii++) { - #pragma HLS UNROLL - h_newstate[ii] = 0; - s_newstate[ii] = 0; - } - - typename data_T::value_type data_in[CONFIG_T::n_in]; - bool reset_state = true; - -DataPropagation: - for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { - if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { - // #pragma HLS PIPELINE - } - data_T data_pack = data_stream.read(); - DataPack: - for (int i_pack = 0; i_pack < data_T::size; i_pack++) { - #pragma HLS UNROLL - data_in[i_pack] = data_pack[i_pack]; - } - if (CONFIG_T::use_static) - nnet::lstm_static( - reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); - else - nnet::lstm( - reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); - if (CONFIG_T::n_sequence_out > 1) { - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - ResPack_sequences: - for (int i_pack = 0; i_pack < res_T::size; i_pack++) { - #pragma HLS UNROLL - res_pack[i_pack] = h_newstate[i_pack]; - } - res_stream.write(res_pack); - } - reset_state = false; - } - - if (CONFIG_T::n_sequence_out == 1) { - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - ResPack: - for (int i_pack = 0; i_pack < res_T::size; i_pack++) { - #pragma HLS UNROLL - res_pack[i_pack] = h_newstate[i_pack]; - } - res_stream.write(res_pack); - } -} - -// Struct for the GRU template - -struct gru_config { - // Internal data type definitions - typedef float weight_t; - typedef float bias_t; - typedef float accum_t; - - // Layer Sizes - static const unsigned n_in = 2; - static const unsigned n_out = 2; - static const unsigned n_state = 2; - static const unsigned n_sequence = 2; - static const unsigned n_4state = 8; - static const unsigned table_size = 1024; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static const bool use_static = true; - static const bool pytorch_order = false; - static const unsigned n_zeros = 0; - - template using activation_recr = nnet::activation::relu; - template using activation = nnet::activation::relu; -}; - -template -void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], // TODO - Check the layout of the param - // weights - refer page in copy!! 
- typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { - // Initialize the state variable -- will maintain state between function calls - typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; - typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; - typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; - typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated i,f,o matrices (keras notation) - typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated c-matrix (keras notation) - typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // i,f,o matrices (keras notation) - typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // c-matrix (keras notation) - - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - #pragma HLS ARRAY_PARTITION variable=tmpres complete - #pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete - #pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete - #pragma HLS ARRAY_PARTITION variable=tmpres_zr complete - #pragma HLS ARRAY_PARTITION variable=tmpres_h complete - #pragma HLS ARRAY_PARTITION variable=inputacc_zr complete - #pragma HLS ARRAY_PARTITION variable=inputacc_h complete - - nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_newstate, tmpres_state_zr, param_zr, - param_br); - - // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres - // initialized with biases -- DONE - for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc; - inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; - } - - // Activation function Sub layer -- START - CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); - - // Activation function Sub layer -- END - - // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - if (CONFIG_T::pytorch_order) - tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; - else - tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; - } - - // Assuming reset_after is false - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc + CONFIG_T::n_state * 2; - inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; - } - - // Now run the activation on this guy - CONFIG_T::template activation::activation(inputacc_h, tmpres_h); - - // Mix the stat with the previous state - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - if (CONFIG_T::pytorch_order) - h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) + - h_newstate[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]); - else - h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_newstate[iacc] * tmpres_zr[iacc]); - } -} - -template -void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { - // Initialize the state 
variable -- will maintain state between function calls - - static res_T h_state[CONFIG_T::n_state]; - typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; - typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; - typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; - typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated i,f,o matrices (keras notation) - typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated c-matrix (keras notation) - typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // i,f,o matrices (keras notation) - typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // c-matrix (keras notation) - - #pragma HLS ARRAY_PARTITION variable=h_state complete - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - #pragma HLS ARRAY_PARTITION variable=tmpres complete - #pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete - #pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete - #pragma HLS ARRAY_PARTITION variable=tmpres_zr complete - #pragma HLS ARRAY_PARTITION variable=tmpres_h complete - #pragma HLS ARRAY_PARTITION variable=inputacc_zr complete - #pragma HLS ARRAY_PARTITION variable=inputacc_h complete - - if (reset_state) { - for (int i_h_state = 0; i_h_state < (CONFIG_T::n_state); i_h_state++) { - #pragma HLS UNROLL - h_state[i_h_state] = 0; - } - } - - nnet::dense(data, tmpres, param, param_b); - nnet::dense(h_state, tmpres_state_zr, param_zr, - param_br); - - // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres - // initialized with biases -- DONE - for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc; - inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; - } - - // Activation function Sub layer -- START - CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); - - // Activation function Sub layer -- END - - // Hadamrd product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - if (CONFIG_T::pytorch_order) - tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; - else - tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; - } - - // Assuming reset_after is false - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - int index = iacc + CONFIG_T::n_state * 2; - inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; - } - - // Now run the activation on this guy - CONFIG_T::template activation::activation(inputacc_h, tmpres_h); - - // Mix the stat with the previous state - for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { - #pragma HLS UNROLL - if (CONFIG_T::pytorch_order) - h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) + - h_state[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]); - else - h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_state[iacc] * tmpres_zr[iacc]); - h_newstate[iacc] = h_state[iacc]; - } -} - -template -void gru_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename 
CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { - - res_T h_state[CONFIG_T::n_state]; - data_T data_in[CONFIG_T::n_in]; - bool reset_state = true; - - #pragma HLS ARRAY_PARTITION variable=h_state complete - #pragma HLS ARRAY_PARTITION variable=data_in complete - - for (int ii = 0; ii < CONFIG_T::n_state; ii++) { - #pragma HLS UNROLL - h_state[ii] = 0; - } - for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { - for (int j = 0; j < CONFIG_T::n_in; j++) { - #pragma HLS UNROLL - data_in[j] = data[j + iloop * CONFIG_T::n_in]; - } - if (CONFIG_T::use_static) - nnet::gru_static(reset_state, data_in, h_state, param, param_zr, param_b, param_br); - else - nnet::gru(reset_state, data_in, h_state, param, param_zr, param_b, param_br); - if (CONFIG_T::n_sequence_out > 1) - for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { - #pragma HLS UNROLL - res[i] = h_state[j]; - } - reset_state = false; - } - if (CONFIG_T::n_sequence_out == 1) - for (int i = 0; i < (CONFIG_T::n_state); i++) { - #pragma HLS UNROLL - res[i] = h_state[i]; - } -} - -template -void gru_stack(hls::stream &data_stream, hls::stream &res_stream, - typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], - typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], - typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], - typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { - - typename res_T::value_type h_newstate[CONFIG_T::n_state]; - #pragma HLS ARRAY_PARTITION variable=h_newstate complete - for (int ii = 0; ii < CONFIG_T::n_state; ii++) { - #pragma HLS UNROLL - h_newstate[ii] = 0; - } - - typename data_T::value_type data_in[CONFIG_T::n_in]; - bool reset_state = true; - -DataPropagation: - for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { - if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { - // #pragma HLS PIPELINE - } - data_T data_pack = data_stream.read(); - DataPack: - for (int i_pack = 0; i_pack < data_T::size; i_pack++) { - #pragma HLS UNROLL - data_in[i_pack] = data_pack[i_pack]; - } - if (CONFIG_T::use_static) - nnet::gru_static( - reset_state, data_in, h_newstate, param, param_zr, param_b, param_br); - else - nnet::gru(reset_state, data_in, h_newstate, - param, param_zr, param_b, param_br); - if (CONFIG_T::n_sequence_out > 1) { - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - ResPack_sequences: - for (int i_pack = 0; i_pack < res_T::size; i_pack++) { - #pragma HLS UNROLL - res_pack[i_pack] = h_newstate[i_pack]; - } - res_stream.write(res_pack); - } - reset_state = false; - } - - if (CONFIG_T::n_sequence_out == 1) { - res_T res_pack; - PRAGMA_DATA_PACK(res_pack) - ResPack: - for (int i_pack = 0; i_pack < res_T::size; i_pack++) { - #pragma HLS UNROLL - res_pack[i_pack] = h_newstate[i_pack]; - } - res_stream.write(res_pack); - } -} - -} // namespace nnet - -#endif +#ifndef NNET_RECURSIVE_H_ +#define NNET_RECURSIVE_H_ + +#include "hls_stream.h" +#include "nnet_activation.h" +#include "nnet_common.h" +#include "nnet_dense.h" +#include "nnet_recr_activations.h" + +namespace nnet { + +struct lstm_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + + // Layer Sizes + static const unsigned n_in = 2; + static const unsigned n_parts = 20; + static const unsigned n_out = 2; + static const unsigned n_state = 2; + static const unsigned n_4state = 8; + static const unsigned table_size = 1024; + + // Resource reuse info + static const 
unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const unsigned n_zeros = 0; + static const bool store_weights_in_bram = false; + static const bool use_static = true; + + template using activation_recr = nnet::activation::relu; + template using activation = nnet::activation::relu; +}; +// Long Short-Term Memory NN (LSTM) +// Resources: +// https://github.com/nicodjimenez/lstm/blob/master/lstm.py +// https://github.com/llSourcell/LSTM_Networks/blob/master/LSTM%20Demo.ipynb +// https://en.wikipedia.org/wiki/Long_short-term_memory +// Notes: +// - LSTM naming conventions adopted from the above links +// - s_newstate = activation(U*input + W*state) +// - h_output = activation(U*input + W*state)*activation(s_newstate) +// - If softmax is needed on the output, perform it *outside* of this operation +// Originally there was a version that allowed the state in each layer to be saved; this was moved above (it requires a LARGE +// dense network at the end) +template +void lstm(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + res_T s_newstate[CONFIG_T::n_state], typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + // Initialize the state variable -- will maintain state between function calls + + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) + typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; + + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + #pragma HLS ARRAY_PARTITION variable=tmpres complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state complete + #pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete + #pragma HLS ARRAY_PARTITION variable=tmpres_c complete + #pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete + #pragma HLS ARRAY_PARTITION variable=inputacc_c complete + #pragma HLS ARRAY_PARTITION variable=s_actstate complete + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_newstate, tmpres_state, param_r, param_br); + + for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc; + if (iacc > 2 * CONFIG_T::n_state - 1) + index = iacc + CONFIG_T::n_state; + inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; + } + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; + } + + CONFIG_T::template activation_recr::activation( + inputacc_ifo, tmpres_ifo); + + // Now for the candidate (c) state + CONFIG_T::template activation::activation( + inputacc_c, tmpres_c); + + // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + s_newstate[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] +
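+ // Editorial sketch of the gate layout implied by the indexing above: tmpres and
+ // tmpres_state pack the four Keras gates as [i | f | c | o], each n_state wide.
+ // inputacc_ifo therefore holds i in [0, n), f in [n, 2n) and o in [2n, 3n), while
+ // the c block is routed through inputacc_c separately. The update being computed
+ // here is s_new[k] = c[k] * i[k] + s_old[k] * f[k].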
s_newstate[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; + } + // Operation: h=act(s)*o + CONFIG_T::template activation::activation( + s_newstate, s_actstate); + + for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { + #pragma HLS UNROLL + h_newstate[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + } +} + +template +void lstm_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + res_T s_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + static res_T h_state[CONFIG_T::n_state]; + static res_T s_state[CONFIG_T::n_state]; + // Initialize the state variable -- will maintain state between function calls + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_state[CONFIG_T::n_state * 4]; + typename CONFIG_T::accum_t tmpres_ifo[CONFIG_T::n_state * 3]; // activated i,f,o matrices (keras notation) + typename CONFIG_T::accum_t tmpres_c[CONFIG_T::n_state]; // activated c-matrix (keras notation) + typename CONFIG_T::accum_t inputacc_ifo[CONFIG_T::n_state * 3]; // i,f,o matrices (keras notation) + typename CONFIG_T::accum_t inputacc_c[CONFIG_T::n_state]; // c-matrix (keras notation) + typename CONFIG_T::accum_t s_actstate[CONFIG_T::n_state]; + + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + #pragma HLS ARRAY_PARTITION variable=h_state complete + #pragma HLS ARRAY_PARTITION variable=s_state complete + #pragma HLS ARRAY_PARTITION variable=tmpres complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state complete + #pragma HLS ARRAY_PARTITION variable=tmpres_ifo complete + #pragma HLS ARRAY_PARTITION variable=tmpres_c complete + #pragma HLS ARRAY_PARTITION variable=inputacc_ifo complete + #pragma HLS ARRAY_PARTITION variable=inputacc_c complete + #pragma HLS ARRAY_PARTITION variable=s_actstate complete + + if (reset_state) { + for (int i_state = 0; i_state < (CONFIG_T::n_state); i_state++) { + #pragma HLS UNROLL + s_state[i_state] = 0; + h_state[i_state] = 0; + } + } + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_state, tmpres_state, param_r, + param_br); + + for (int iacc = 0; iacc < (3 * CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc; + if (iacc > 2 * CONFIG_T::n_state - 1) + index = iacc + CONFIG_T::n_state; + inputacc_ifo[iacc] = tmpres[index] + tmpres_state[index]; + } + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_c[iacc] = tmpres[index] + tmpres_state[index]; + } + + CONFIG_T::template activation_recr::activation( + inputacc_ifo, tmpres_ifo); + + // Now for the candidate (c) state + CONFIG_T::template activation::activation( + inputacc_c, tmpres_c); + + // Operation: s=g*i+sold*f (update state with buffer to avoid timing issues) + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + s_state[iacc] = tmpres_c[iacc] * tmpres_ifo[iacc] + s_state[iacc] * tmpres_ifo[iacc + (CONFIG_T::n_state)]; + s_newstate[iacc] = s_state[iacc]; + } + // Operation: h=act(s)*o + CONFIG_T::template activation::activation( + s_state, s_actstate); + + for (int iacc = 0; iacc < CONFIG_T::n_state; iacc++) { + #pragma HLS UNROLL +
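+ // Editorial note: this is the output step h = o * act(s); the activated output
+ // gate o lives at offset 2 * n_state of tmpres_ifo, matching the lstm() variant above.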
h_state[iacc] = tmpres_ifo[iacc + 2 * (CONFIG_T::n_state)] * s_actstate[iacc]; + h_newstate[iacc] = h_state[iacc]; + } +} + +template +void lstm_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + res_T h_newstate[CONFIG_T::n_state]; + res_T s_newstate[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + #pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::lstm_static(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + else + nnet::lstm(reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, + param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + #pragma HLS UNROLL + res[i] = h_newstate[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + #pragma HLS UNROLL + res[i] = h_newstate[i]; + } +} + +template +void lstm_stack(hls::stream &data_stream, hls::stream &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 4 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_r[CONFIG_T::n_state * 4 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 4], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 4]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + typename res_T::value_type s_newstate[CONFIG_T::n_state]; + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=s_newstate complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_newstate[ii] = 0; + s_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // #pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::lstm_static( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + else + nnet::lstm( + reset_state, data_in, h_newstate, s_newstate, param, param_r, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack_sequences: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } + reset_state = false; + } + + if (CONFIG_T::n_sequence_out == 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack: + for (int i_pack = 0; i_pack < 
res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } +} + +// Struct for the GRU template + +struct gru_config { + // Internal data type definitions + typedef float weight_t; + typedef float bias_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 2; + static const unsigned n_out = 2; + static const unsigned n_state = 2; + static const unsigned n_sequence = 2; + static const unsigned n_4state = 8; + static const unsigned table_size = 1024; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const bool use_static = true; + static const bool pytorch_order = false; + static const unsigned n_zeros = 0; + + template using activation_recr = nnet::activation::relu; + template using activation = nnet::activation::relu; +}; + +template +void gru(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], // TODO - Check the layout of the param + // weights - refer page in copy!! + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + // Initialize the state variable -- will maintain state between function calls + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; + typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated z,r matrices (keras notation) + typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated candidate-state matrix (keras notation) + typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // z,r matrices (keras notation) + typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // candidate-state matrix (keras notation) + + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=tmpres complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete + #pragma HLS ARRAY_PARTITION variable=tmpres_zr complete + #pragma HLS ARRAY_PARTITION variable=tmpres_h complete + #pragma HLS ARRAY_PARTITION variable=inputacc_zr complete + #pragma HLS ARRAY_PARTITION variable=inputacc_h complete + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_newstate, tmpres_state_zr, param_zr, + param_br); + + // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres + // initialized with biases -- DONE + for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc; + inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; + } + + // Activation function Sub layer -- START + CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); + + // Activation function Sub layer -- END + + // Hadamard product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + if (CONFIG_T::pytorch_order) + tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + else + tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] * tmpres_state_zr[iacc + (2 *
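+ // Editorial sketch of the offsets used here (an assumption read off the code):
+ // tmpres_zr packs the activated gates as [z | r] and tmpres_state_zr packs
+ // Wh*h(t-1) as [z | r | h], so the recurrent candidate block starts at 2 * n_state.
+ // Keras weight order puts r at offset n_state; with pytorch_order the r gate sits
+ // at offset 0, since PyTorch packs the gates as [r | z].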
CONFIG_T::n_state)]; + } + + // Assuming reset_after is false + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; + } + + // Now run the activation on the candidate state + CONFIG_T::template activation::activation(inputacc_h, tmpres_h); + + // Mix the new state with the previous state + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + if (CONFIG_T::pytorch_order) + h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) + + h_newstate[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]); + else + h_newstate[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_newstate[iacc] * tmpres_zr[iacc]); + } +} + +template +void gru_static(bool reset_state, data_T data[CONFIG_T::n_in], res_T h_newstate[CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + // Initialize the state variable -- will maintain state between function calls + + static res_T h_state[CONFIG_T::n_state]; + typename CONFIG_T::accum_t tmpres[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_zr[CONFIG_T::n_state * 3]; + typename CONFIG_T::accum_t tmpres_state_h[CONFIG_T::n_state]; + typename CONFIG_T::accum_t tmpres_zr[CONFIG_T::n_state * 2]; // activated z,r matrices (keras notation) + typename CONFIG_T::accum_t tmpres_h[CONFIG_T::n_state]; // activated candidate-state matrix (keras notation) + typename CONFIG_T::accum_t inputacc_zr[CONFIG_T::n_state * 2]; // z,r matrices (keras notation) + typename CONFIG_T::accum_t inputacc_h[CONFIG_T::n_state]; // candidate-state matrix (keras notation) + + #pragma HLS ARRAY_PARTITION variable=h_state complete + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + #pragma HLS ARRAY_PARTITION variable=tmpres complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state_zr complete + #pragma HLS ARRAY_PARTITION variable=tmpres_state_h complete + #pragma HLS ARRAY_PARTITION variable=tmpres_zr complete + #pragma HLS ARRAY_PARTITION variable=tmpres_h complete + #pragma HLS ARRAY_PARTITION variable=inputacc_zr complete + #pragma HLS ARRAY_PARTITION variable=inputacc_h complete + + if (reset_state) { + for (int i_h_state = 0; i_h_state < (CONFIG_T::n_state); i_h_state++) { + #pragma HLS UNROLL + h_state[i_h_state] = 0; + } + } + + nnet::dense(data, tmpres, param, param_b); + nnet::dense(h_state, tmpres_state_zr, param_zr, + param_br); + + // Adding the individual vectors from the multiplication of tmpres = Wx*x(t); tmpres_state_zr = Wh*h(t-1); tmpres + // initialized with biases -- DONE + for (int iacc = 0; iacc < (2 * CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc; + inputacc_zr[iacc] = tmpres[index] + tmpres_state_zr[index]; + } + + // Activation function Sub layer -- START + CONFIG_T::template activation_recr::activation(inputacc_zr, tmpres_zr); + + // Activation function Sub layer -- END + + // Hadamard product of r(t) = inputacc_zr[2*n_state:n_state] and h(t-1) = h_newstate + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + if (CONFIG_T::pytorch_order) + tmpres_state_h[iacc] = tmpres_zr[iacc] * tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + else + tmpres_state_h[iacc] = tmpres_zr[iacc + (CONFIG_T::n_state)] *
tmpres_state_zr[iacc + (2 * CONFIG_T::n_state)]; + } + + // Assuming reset_after is false + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + int index = iacc + CONFIG_T::n_state * 2; + inputacc_h[iacc] = tmpres[index] + tmpres_state_h[iacc]; + } + + // Now run the activation on the candidate state + CONFIG_T::template activation::activation(inputacc_h, tmpres_h); + + // Mix the new state with the previous state + for (int iacc = 0; iacc < (CONFIG_T::n_state); iacc++) { + #pragma HLS UNROLL + if (CONFIG_T::pytorch_order) + h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc + (CONFIG_T::n_state)]) + + h_state[iacc] * tmpres_zr[iacc + (CONFIG_T::n_state)]); + else + h_state[iacc] = (res_T)(tmpres_h[iacc] * (1 - tmpres_zr[iacc]) + h_state[iacc] * tmpres_zr[iacc]); + h_newstate[iacc] = h_state[iacc]; + } +} + +template +void gru_stack(data_T data[CONFIG_T::n_sequence * CONFIG_T::n_in], res_T res[CONFIG_T::n_sequence_out * CONFIG_T::n_state], + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + + res_T h_state[CONFIG_T::n_state]; + data_T data_in[CONFIG_T::n_in]; + bool reset_state = true; + + #pragma HLS ARRAY_PARTITION variable=h_state complete + #pragma HLS ARRAY_PARTITION variable=data_in complete + + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_state[ii] = 0; + } + for (int iloop = 0; iloop < CONFIG_T::n_sequence; iloop++) { + for (int j = 0; j < CONFIG_T::n_in; j++) { + #pragma HLS UNROLL + data_in[j] = data[j + iloop * CONFIG_T::n_in]; + } + if (CONFIG_T::use_static) + nnet::gru_static(reset_state, data_in, h_state, param, param_zr, param_b, param_br); + else + nnet::gru(reset_state, data_in, h_state, param, param_zr, param_b, param_br); + if (CONFIG_T::n_sequence_out > 1) + for (int i = CONFIG_T::n_state * iloop, j = 0; i < (CONFIG_T::n_state * (iloop + 1)); i++, j++) { + #pragma HLS UNROLL + res[i] = h_state[j]; + } + reset_state = false; + } + if (CONFIG_T::n_sequence_out == 1) + for (int i = 0; i < (CONFIG_T::n_state); i++) { + #pragma HLS UNROLL + res[i] = h_state[i]; + } +} + +template +void gru_stack(hls::stream &data_stream, hls::stream &res_stream, + typename CONFIG_T::weight_t param[CONFIG_T::n_state * 3 * CONFIG_T::n_in], + typename CONFIG_T::weight_t param_zr[CONFIG_T::n_state * 3 * CONFIG_T::n_state], + typename CONFIG_T::bias_t param_b[CONFIG_T::n_state * 3], + typename CONFIG_T::bias_t param_br[CONFIG_T::n_state * 3]) { + + typename res_T::value_type h_newstate[CONFIG_T::n_state]; + #pragma HLS ARRAY_PARTITION variable=h_newstate complete + for (int ii = 0; ii < CONFIG_T::n_state; ii++) { + #pragma HLS UNROLL + h_newstate[ii] = 0; + } + + typename data_T::value_type data_in[CONFIG_T::n_in]; + bool reset_state = true; + +DataPropagation: + for (int i_in = 0; i_in < CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size; i_in++) { + if (CONFIG_T::n_sequence * CONFIG_T::n_in / data_T::size > 1) { + // #pragma HLS PIPELINE + } + data_T data_pack = data_stream.read(); + DataPack: + for (int i_pack = 0; i_pack < data_T::size; i_pack++) { + #pragma HLS UNROLL + data_in[i_pack] = data_pack[i_pack]; + } + if (CONFIG_T::use_static) + nnet::gru_static( + reset_state, data_in, h_newstate, param, param_zr, param_b, param_br); + else + nnet::gru(reset_state, data_in, h_newstate, + param, param_zr, param_b,
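+ // Editorial note: use_static selects gru_static above, which keeps h(t-1) in a
+ // function-local static array (one shared instance, cleared via reset_state);
+ // this plain nnet::gru call instead carries the state explicitly in h_newstate.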
param_br); + if (CONFIG_T::n_sequence_out > 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack_sequences: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } + reset_state = false; + } + + if (CONFIG_T::n_sequence_out == 1) { + res_T res_pack; + PRAGMA_DATA_PACK(res_pack) + ResPack: + for (int i_pack = 0; i_pack < res_T::size; i_pack++) { + #pragma HLS UNROLL + res_pack[i_pack] = h_newstate[i_pack]; + } + res_stream.write(res_pack); + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_control_s_axi.v b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_control_s_axi.v index c532d2fa14..c4a76ef0c3 100644 --- a/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_control_s_axi.v +++ b/hls4ml/templates/vivado_accelerator/alveo/krnl_rtl_src/krnl_rtl_control_s_axi.v @@ -1,422 +1,422 @@ -/** -* Copyright (C) 2019-2021 Xilinx, Inc -* -* Licensed under the Apache License, Version 2.0 (the "License"). You may -* not use this file except in compliance with the License. A copy of the -* License is located at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -* License for the specific language governing permissions and limitations -* under the License. -*/ - -`timescale 1ns/1ps -module krnl_rtl_control_s_axi -#(parameter - C_S_AXI_ADDR_WIDTH = 6, - C_S_AXI_DATA_WIDTH = 32 -)( - // axi4 lite slave signals - input wire ACLK, - input wire ARESET, - input wire ACLK_EN, - input wire [C_S_AXI_ADDR_WIDTH-1:0] AWADDR, - input wire AWVALID, - output wire AWREADY, - input wire [C_S_AXI_DATA_WIDTH-1:0] WDATA, - input wire [C_S_AXI_DATA_WIDTH/8-1:0] WSTRB, - input wire WVALID, - output wire WREADY, - output wire [1:0] BRESP, - output wire BVALID, - input wire BREADY, - input wire [C_S_AXI_ADDR_WIDTH-1:0] ARADDR, - input wire ARVALID, - output wire ARREADY, - output wire [C_S_AXI_DATA_WIDTH-1:0] RDATA, - output wire [1:0] RRESP, - output wire RVALID, - input wire RREADY, - output wire interrupt, - // user signals - output wire ap_start, - input wire ap_done, - input wire ap_ready, - input wire ap_idle, - output wire [63:0] fifo_in, - output wire [63:0] fifo_out, - output wire [31:0] length_r_in, - output wire [31:0] length_r_out -); -//------------------------Address Info------------------- -// 0x00 : Control signals -// bit 0 - ap_start (Read/Write/COH) -// bit 1 - ap_done (Read/COR) -// bit 2 - ap_idle (Read) -// bit 3 - ap_ready (Read) -// bit 7 - auto_restart (Read/Write) -// others - reserved -// 0x04 : Global Interrupt Enable Register -// bit 0 - Global Interrupt Enable (Read/Write) -// others - reserved -// 0x08 : IP Interrupt Enable Register (Read/Write) -// bit 0 - Channel 0 (ap_done) -// bit 1 - Channel 1 (ap_ready) -// others - reserved -// 0x0c : IP Interrupt Status Register (Read/TOW) -// bit 0 - Channel 0 (ap_done) -// bit 1 - Channel 1 (ap_ready) -// others - reserved -// 0x10 : Data signal of fifo_in -// bit 31~0 - a[31:0] (Read/Write) -// 0x14 : Data signal of fifo_in -// bit 31~0 - a[63:32] (Read/Write) -// 0x18 : reserved -// 0x1c : Data signal of fifo_out -// bit 31~0 - b[31:0] (Read/Write) -// 0x20 : Data signal of fifo_out -// bit 31~0 - b[63:32] (Read/Write) -// 0x24 : 
reserved -// 0x28 : Data signal of length_r_in -// bit 31~0 - length_r[31:0] (Read/Write) -// 0x2c : reserved -// 0x30 : Data signal of length_r_out -// bit 31~0 - length_r[31:0] (Read/Write) -// 0x34 : reserved -// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake) - -//------------------------Parameter---------------------- -localparam - ADDR_AP_CTRL = 6'h00, - ADDR_GIE = 6'h04, - ADDR_IER = 6'h08, - ADDR_ISR = 6'h0c, - ADDR_FIFO_IN_DATA_0 = 6'h10, - ADDR_FIFO_IN_DATA_1 = 6'h14, - ADDR_FIFO_IN_CTRL = 6'h18, - ADDR_FIFO_OUT_DATA_0 = 6'h1c, - ADDR_FIFO_OUT_DATA_1 = 6'h20, - ADDR_FIFO_OUT_CTRL = 6'h24, - ADDR_LENGTH_R_IN_DATA_0 = 6'h28, - ADDR_LENGTH_R_IN_CTRL = 6'h2c, - ADDR_LENGTH_R_OUT_DATA_0 = 6'h30, - ADDR_LENGTH_R_OUT_CTRL = 6'h34, - WRIDLE = 2'd0, - WRDATA = 2'd1, - WRRESP = 2'd2, - RDIDLE = 2'd0, - RDDATA = 2'd1, - ADDR_BITS = 6; - -//------------------------Local signal------------------- - reg [1:0] wstate = WRIDLE; - reg [1:0] wnext; - reg [ADDR_BITS-1:0] waddr; - wire [31:0] wmask; - wire aw_hs; - wire w_hs; - reg [1:0] rstate = RDIDLE; - reg [1:0] rnext; - reg [31:0] rdata; - wire ar_hs; - wire [ADDR_BITS-1:0] raddr; - // internal registers - wire int_ap_idle; - wire int_ap_ready; - reg int_ap_done = 1'b0; - reg int_ap_start = 1'b0; - reg int_auto_restart = 1'b0; - reg int_gie = 2'b0; - reg [1:0] int_ier = 2'b0; - reg [1:0] int_isr = 2'b0; - reg [63:0] int_fifo_in = 64'b0; - reg [63:0] int_fifo_out = 64'b0; - reg [63:0] int_length_r_in = 32'b0; - reg [31:0] int_length_r_out = 32'b0; - -//------------------------Instantiation------------------ - -//------------------------AXI write fsm------------------ -assign AWREADY = (~ARESET) & (wstate == WRIDLE); -assign WREADY = (wstate == WRDATA); -assign BRESP = 2'b00; // OKAY -assign BVALID = (wstate == WRRESP); -assign wmask = { {8{WSTRB[3]}}, {8{WSTRB[2]}}, {8{WSTRB[1]}}, {8{WSTRB[0]}} }; -assign aw_hs = AWVALID & AWREADY; -assign w_hs = WVALID & WREADY; - -// wstate -always @(posedge ACLK) begin - if (ARESET) - wstate <= WRIDLE; - else if (ACLK_EN) - wstate <= wnext; -end - -// wnext -always @(*) begin - case (wstate) - WRIDLE: - if (AWVALID) - wnext = WRDATA; - else - wnext = WRIDLE; - WRDATA: - if (WVALID) - wnext = WRRESP; - else - wnext = WRDATA; - WRRESP: - if (BREADY) - wnext = WRIDLE; - else - wnext = WRRESP; - default: - wnext = WRIDLE; - endcase -end - -// waddr -always @(posedge ACLK) begin - if (ACLK_EN) begin - if (aw_hs) - waddr <= AWADDR[ADDR_BITS-1:0]; - end -end - -//------------------------AXI read fsm------------------- -assign ARREADY = (~ARESET) && (rstate == RDIDLE); -assign RDATA = rdata; -assign RRESP = 2'b00; // OKAY -assign RVALID = (rstate == RDDATA); -assign ar_hs = ARVALID & ARREADY; -assign raddr = ARADDR[ADDR_BITS-1:0]; - -// rstate -always @(posedge ACLK) begin - if (ARESET) - rstate <= RDIDLE; - else if (ACLK_EN) - rstate <= rnext; -end - -// rnext -always @(*) begin - case (rstate) - RDIDLE: - if (ARVALID) - rnext = RDDATA; - else - rnext = RDIDLE; - RDDATA: - if (RREADY & RVALID) - rnext = RDIDLE; - else - rnext = RDDATA; - default: - rnext = RDIDLE; - endcase -end - -// rdata -always @(posedge ACLK) begin - if (ACLK_EN) begin - if (ar_hs) begin - rdata <= 1'b0; - case (raddr) - ADDR_AP_CTRL: begin - rdata[0] <= int_ap_start; - rdata[1] <= int_ap_done; - rdata[2] <= int_ap_idle; - rdata[3] <= int_ap_ready; - rdata[7] <= int_auto_restart; - end - ADDR_GIE: begin - rdata <= int_gie; - end - ADDR_IER: begin - rdata <= int_ier; - end - ADDR_ISR: begin - rdata <= 
int_isr; - end - ADDR_FIFO_IN_DATA_0: begin - rdata <= int_fifo_in[31:0]; - end - ADDR_FIFO_IN_DATA_1: begin - rdata <= int_fifo_in[63:32]; - end - ADDR_FIFO_OUT_DATA_0: begin - rdata <= int_fifo_out[31:0]; - end - ADDR_FIFO_OUT_DATA_1: begin - rdata <= int_fifo_out[63:32]; - end - ADDR_LENGTH_R_IN_DATA_0: begin - rdata <= int_length_r_in[31:0]; - end - ADDR_LENGTH_R_OUT_DATA_0: begin - rdata <= int_length_r_out[31:0]; - end - endcase - end - end -end - - -//------------------------Register logic----------------- -assign interrupt = int_gie & (|int_isr); -assign ap_start = int_ap_start; -assign int_ap_idle = ap_idle; -assign int_ap_ready = ap_ready; -assign fifo_in = int_fifo_in; -assign fifo_out = int_fifo_out; -assign length_r_in = int_length_r_in; -assign length_r_out = int_length_r_out; -// int_ap_start -always @(posedge ACLK) begin - if (ARESET) - int_ap_start <= 1'b0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_AP_CTRL && WSTRB[0] && WDATA[0]) - int_ap_start <= 1'b1; - else if (int_ap_ready) - int_ap_start <= int_auto_restart; // clear on handshake/auto restart - end -end - -// int_ap_done -always @(posedge ACLK) begin - if (ARESET) - int_ap_done <= 1'b0; - else if (ACLK_EN) begin - if (ap_done) - int_ap_done <= 1'b1; - else if (ar_hs && raddr == ADDR_AP_CTRL) - int_ap_done <= 1'b0; // clear on read - end -end - -// int_auto_restart -always @(posedge ACLK) begin - if (ARESET) - int_auto_restart <= 1'b0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_AP_CTRL && WSTRB[0]) - int_auto_restart <= WDATA[7]; - end -end - -// int_gie -always @(posedge ACLK) begin - if (ARESET) - int_gie <= 1'b0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_GIE && WSTRB[0]) - int_gie <= WDATA[0]; - end -end - -// int_ier -always @(posedge ACLK) begin - if (ARESET) - int_ier <= 1'b0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_IER && WSTRB[0]) - int_ier <= WDATA[1:0]; - end -end - -// int_isr[0] -always @(posedge ACLK) begin - if (ARESET) - int_isr[0] <= 1'b0; - else if (ACLK_EN) begin - if (int_ier[0] & ap_done) - int_isr[0] <= 1'b1; - else if (w_hs && waddr == ADDR_ISR && WSTRB[0]) - int_isr[0] <= int_isr[0] ^ WDATA[0]; // toggle on write - end -end - -// int_isr[1] -always @(posedge ACLK) begin - if (ARESET) - int_isr[1] <= 1'b0; - else if (ACLK_EN) begin - if (int_ier[1] & ap_ready) - int_isr[1] <= 1'b1; - else if (w_hs && waddr == ADDR_ISR && WSTRB[0]) - int_isr[1] <= int_isr[1] ^ WDATA[1]; // toggle on write - end -end - -// int_fifo_in[31:0] -always @(posedge ACLK) begin - if (ARESET) - int_fifo_in[31:0] <= 0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_FIFO_IN_DATA_0) - int_fifo_in[31:0] <= (WDATA[31:0] & wmask) | (int_fifo_in[31:0] & ~wmask); - end -end - -// int_fifo_in[63:32] -always @(posedge ACLK) begin - if (ARESET) - int_fifo_in[63:32] <= 0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_FIFO_IN_DATA_1) - int_fifo_in[63:32] <= (WDATA[31:0] & wmask) | (int_fifo_in[63:32] & ~wmask); - end -end - -// int_fifo_out[31:0] -always @(posedge ACLK) begin - if (ARESET) - int_fifo_out[31:0] <= 0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_FIFO_OUT_DATA_0) - int_fifo_out[31:0] <= (WDATA[31:0] & wmask) | (int_fifo_out[31:0] & ~wmask); - end -end - -// int_fifo_out[63:32] -always @(posedge ACLK) begin - if (ARESET) - int_fifo_out[63:32] <= 0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_FIFO_OUT_DATA_1) - int_fifo_out[63:32] <= (WDATA[31:0] & wmask) | (int_fifo_out[63:32] & ~wmask); - end -end - -// int_length_r_in[31:0] -always 
@(posedge ACLK) begin - if (ARESET) - int_length_r_in[31:0] <= 0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_LENGTH_R_IN_DATA_0) - int_length_r_in[31:0] <= (WDATA[31:0] & wmask) | (int_length_r_in[31:0] & ~wmask); - end -end - - -// int_length_r_out[31:0] -always @(posedge ACLK) begin - if (ARESET) - int_length_r_out[31:0] <= 0; - else if (ACLK_EN) begin - if (w_hs && waddr == ADDR_LENGTH_R_OUT_DATA_0) - int_length_r_out[31:0] <= (WDATA[31:0] & wmask) | (int_length_r_out[31:0] & ~wmask); - end -end - - -//------------------------Memory logic------------------- - -endmodule +/** +* Copyright (C) 2019-2021 Xilinx, Inc +* +* Licensed under the Apache License, Version 2.0 (the "License"). You may +* not use this file except in compliance with the License. A copy of the +* License is located at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +* License for the specific language governing permissions and limitations +* under the License. +*/ + +`timescale 1ns/1ps +module krnl_rtl_control_s_axi +#(parameter + C_S_AXI_ADDR_WIDTH = 6, + C_S_AXI_DATA_WIDTH = 32 +)( + // axi4 lite slave signals + input wire ACLK, + input wire ARESET, + input wire ACLK_EN, + input wire [C_S_AXI_ADDR_WIDTH-1:0] AWADDR, + input wire AWVALID, + output wire AWREADY, + input wire [C_S_AXI_DATA_WIDTH-1:0] WDATA, + input wire [C_S_AXI_DATA_WIDTH/8-1:0] WSTRB, + input wire WVALID, + output wire WREADY, + output wire [1:0] BRESP, + output wire BVALID, + input wire BREADY, + input wire [C_S_AXI_ADDR_WIDTH-1:0] ARADDR, + input wire ARVALID, + output wire ARREADY, + output wire [C_S_AXI_DATA_WIDTH-1:0] RDATA, + output wire [1:0] RRESP, + output wire RVALID, + input wire RREADY, + output wire interrupt, + // user signals + output wire ap_start, + input wire ap_done, + input wire ap_ready, + input wire ap_idle, + output wire [63:0] fifo_in, + output wire [63:0] fifo_out, + output wire [31:0] length_r_in, + output wire [31:0] length_r_out +); +//------------------------Address Info------------------- +// 0x00 : Control signals +// bit 0 - ap_start (Read/Write/COH) +// bit 1 - ap_done (Read/COR) +// bit 2 - ap_idle (Read) +// bit 3 - ap_ready (Read) +// bit 7 - auto_restart (Read/Write) +// others - reserved +// 0x04 : Global Interrupt Enable Register +// bit 0 - Global Interrupt Enable (Read/Write) +// others - reserved +// 0x08 : IP Interrupt Enable Register (Read/Write) +// bit 0 - Channel 0 (ap_done) +// bit 1 - Channel 1 (ap_ready) +// others - reserved +// 0x0c : IP Interrupt Status Register (Read/TOW) +// bit 0 - Channel 0 (ap_done) +// bit 1 - Channel 1 (ap_ready) +// others - reserved +// 0x10 : Data signal of fifo_in +// bit 31~0 - a[31:0] (Read/Write) +// 0x14 : Data signal of fifo_in +// bit 31~0 - a[63:32] (Read/Write) +// 0x18 : reserved +// 0x1c : Data signal of fifo_out +// bit 31~0 - b[31:0] (Read/Write) +// 0x20 : Data signal of fifo_out +// bit 31~0 - b[63:32] (Read/Write) +// 0x24 : reserved +// 0x28 : Data signal of length_r_in +// bit 31~0 - length_r[31:0] (Read/Write) +// 0x2c : reserved +// 0x30 : Data signal of length_r_out +// bit 31~0 - length_r[31:0] (Read/Write) +// 0x34 : reserved +// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake) + +//------------------------Parameter---------------------- +localparam + ADDR_AP_CTRL = 6'h00, + 
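+// Editorial sketch of a host-side bring-up sequence implied by the address map
+// above (assuming a generic AXI4-Lite master; not part of the original source):
+//   write the fifo_in pointer to 0x10 (low word) / 0x14 (high word),
+//   write the fifo_out pointer to 0x1c (low word) / 0x20 (high word),
+//   write the lengths to 0x28 (in) and 0x30 (out),
+//   then set bit 0 of 0x00 to assert ap_start and poll bit 1 (ap_done, clear-on-read).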
ADDR_GIE = 6'h04, + ADDR_IER = 6'h08, + ADDR_ISR = 6'h0c, + ADDR_FIFO_IN_DATA_0 = 6'h10, + ADDR_FIFO_IN_DATA_1 = 6'h14, + ADDR_FIFO_IN_CTRL = 6'h18, + ADDR_FIFO_OUT_DATA_0 = 6'h1c, + ADDR_FIFO_OUT_DATA_1 = 6'h20, + ADDR_FIFO_OUT_CTRL = 6'h24, + ADDR_LENGTH_R_IN_DATA_0 = 6'h28, + ADDR_LENGTH_R_IN_CTRL = 6'h2c, + ADDR_LENGTH_R_OUT_DATA_0 = 6'h30, + ADDR_LENGTH_R_OUT_CTRL = 6'h34, + WRIDLE = 2'd0, + WRDATA = 2'd1, + WRRESP = 2'd2, + RDIDLE = 2'd0, + RDDATA = 2'd1, + ADDR_BITS = 6; + +//------------------------Local signal------------------- + reg [1:0] wstate = WRIDLE; + reg [1:0] wnext; + reg [ADDR_BITS-1:0] waddr; + wire [31:0] wmask; + wire aw_hs; + wire w_hs; + reg [1:0] rstate = RDIDLE; + reg [1:0] rnext; + reg [31:0] rdata; + wire ar_hs; + wire [ADDR_BITS-1:0] raddr; + // internal registers + wire int_ap_idle; + wire int_ap_ready; + reg int_ap_done = 1'b0; + reg int_ap_start = 1'b0; + reg int_auto_restart = 1'b0; + reg int_gie = 2'b0; + reg [1:0] int_ier = 2'b0; + reg [1:0] int_isr = 2'b0; + reg [63:0] int_fifo_in = 64'b0; + reg [63:0] int_fifo_out = 64'b0; + reg [63:0] int_length_r_in = 32'b0; + reg [31:0] int_length_r_out = 32'b0; + +//------------------------Instantiation------------------ + +//------------------------AXI write fsm------------------ +assign AWREADY = (~ARESET) & (wstate == WRIDLE); +assign WREADY = (wstate == WRDATA); +assign BRESP = 2'b00; // OKAY +assign BVALID = (wstate == WRRESP); +assign wmask = { {8{WSTRB[3]}}, {8{WSTRB[2]}}, {8{WSTRB[1]}}, {8{WSTRB[0]}} }; +assign aw_hs = AWVALID & AWREADY; +assign w_hs = WVALID & WREADY; + +// wstate +always @(posedge ACLK) begin + if (ARESET) + wstate <= WRIDLE; + else if (ACLK_EN) + wstate <= wnext; +end + +// wnext +always @(*) begin + case (wstate) + WRIDLE: + if (AWVALID) + wnext = WRDATA; + else + wnext = WRIDLE; + WRDATA: + if (WVALID) + wnext = WRRESP; + else + wnext = WRDATA; + WRRESP: + if (BREADY) + wnext = WRIDLE; + else + wnext = WRRESP; + default: + wnext = WRIDLE; + endcase +end + +// waddr +always @(posedge ACLK) begin + if (ACLK_EN) begin + if (aw_hs) + waddr <= AWADDR[ADDR_BITS-1:0]; + end +end + +//------------------------AXI read fsm------------------- +assign ARREADY = (~ARESET) && (rstate == RDIDLE); +assign RDATA = rdata; +assign RRESP = 2'b00; // OKAY +assign RVALID = (rstate == RDDATA); +assign ar_hs = ARVALID & ARREADY; +assign raddr = ARADDR[ADDR_BITS-1:0]; + +// rstate +always @(posedge ACLK) begin + if (ARESET) + rstate <= RDIDLE; + else if (ACLK_EN) + rstate <= rnext; +end + +// rnext +always @(*) begin + case (rstate) + RDIDLE: + if (ARVALID) + rnext = RDDATA; + else + rnext = RDIDLE; + RDDATA: + if (RREADY & RVALID) + rnext = RDIDLE; + else + rnext = RDDATA; + default: + rnext = RDIDLE; + endcase +end + +// rdata +always @(posedge ACLK) begin + if (ACLK_EN) begin + if (ar_hs) begin + rdata <= 1'b0; + case (raddr) + ADDR_AP_CTRL: begin + rdata[0] <= int_ap_start; + rdata[1] <= int_ap_done; + rdata[2] <= int_ap_idle; + rdata[3] <= int_ap_ready; + rdata[7] <= int_auto_restart; + end + ADDR_GIE: begin + rdata <= int_gie; + end + ADDR_IER: begin + rdata <= int_ier; + end + ADDR_ISR: begin + rdata <= int_isr; + end + ADDR_FIFO_IN_DATA_0: begin + rdata <= int_fifo_in[31:0]; + end + ADDR_FIFO_IN_DATA_1: begin + rdata <= int_fifo_in[63:32]; + end + ADDR_FIFO_OUT_DATA_0: begin + rdata <= int_fifo_out[31:0]; + end + ADDR_FIFO_OUT_DATA_1: begin + rdata <= int_fifo_out[63:32]; + end + ADDR_LENGTH_R_IN_DATA_0: begin + rdata <= int_length_r_in[31:0]; + end + ADDR_LENGTH_R_OUT_DATA_0: begin + rdata <= 
int_length_r_out[31:0]; + end + endcase + end + end +end + + +//------------------------Register logic----------------- +assign interrupt = int_gie & (|int_isr); +assign ap_start = int_ap_start; +assign int_ap_idle = ap_idle; +assign int_ap_ready = ap_ready; +assign fifo_in = int_fifo_in; +assign fifo_out = int_fifo_out; +assign length_r_in = int_length_r_in; +assign length_r_out = int_length_r_out; +// int_ap_start +always @(posedge ACLK) begin + if (ARESET) + int_ap_start <= 1'b0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_AP_CTRL && WSTRB[0] && WDATA[0]) + int_ap_start <= 1'b1; + else if (int_ap_ready) + int_ap_start <= int_auto_restart; // clear on handshake/auto restart + end +end + +// int_ap_done +always @(posedge ACLK) begin + if (ARESET) + int_ap_done <= 1'b0; + else if (ACLK_EN) begin + if (ap_done) + int_ap_done <= 1'b1; + else if (ar_hs && raddr == ADDR_AP_CTRL) + int_ap_done <= 1'b0; // clear on read + end +end + +// int_auto_restart +always @(posedge ACLK) begin + if (ARESET) + int_auto_restart <= 1'b0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_AP_CTRL && WSTRB[0]) + int_auto_restart <= WDATA[7]; + end +end + +// int_gie +always @(posedge ACLK) begin + if (ARESET) + int_gie <= 1'b0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_GIE && WSTRB[0]) + int_gie <= WDATA[0]; + end +end + +// int_ier +always @(posedge ACLK) begin + if (ARESET) + int_ier <= 1'b0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_IER && WSTRB[0]) + int_ier <= WDATA[1:0]; + end +end + +// int_isr[0] +always @(posedge ACLK) begin + if (ARESET) + int_isr[0] <= 1'b0; + else if (ACLK_EN) begin + if (int_ier[0] & ap_done) + int_isr[0] <= 1'b1; + else if (w_hs && waddr == ADDR_ISR && WSTRB[0]) + int_isr[0] <= int_isr[0] ^ WDATA[0]; // toggle on write + end +end + +// int_isr[1] +always @(posedge ACLK) begin + if (ARESET) + int_isr[1] <= 1'b0; + else if (ACLK_EN) begin + if (int_ier[1] & ap_ready) + int_isr[1] <= 1'b1; + else if (w_hs && waddr == ADDR_ISR && WSTRB[0]) + int_isr[1] <= int_isr[1] ^ WDATA[1]; // toggle on write + end +end + +// int_fifo_in[31:0] +always @(posedge ACLK) begin + if (ARESET) + int_fifo_in[31:0] <= 0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_FIFO_IN_DATA_0) + int_fifo_in[31:0] <= (WDATA[31:0] & wmask) | (int_fifo_in[31:0] & ~wmask); + end +end + +// int_fifo_in[63:32] +always @(posedge ACLK) begin + if (ARESET) + int_fifo_in[63:32] <= 0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_FIFO_IN_DATA_1) + int_fifo_in[63:32] <= (WDATA[31:0] & wmask) | (int_fifo_in[63:32] & ~wmask); + end +end + +// int_fifo_out[31:0] +always @(posedge ACLK) begin + if (ARESET) + int_fifo_out[31:0] <= 0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_FIFO_OUT_DATA_0) + int_fifo_out[31:0] <= (WDATA[31:0] & wmask) | (int_fifo_out[31:0] & ~wmask); + end +end + +// int_fifo_out[63:32] +always @(posedge ACLK) begin + if (ARESET) + int_fifo_out[63:32] <= 0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_FIFO_OUT_DATA_1) + int_fifo_out[63:32] <= (WDATA[31:0] & wmask) | (int_fifo_out[63:32] & ~wmask); + end +end + +// int_length_r_in[31:0] +always @(posedge ACLK) begin + if (ARESET) + int_length_r_in[31:0] <= 0; + else if (ACLK_EN) begin + if (w_hs && waddr == ADDR_LENGTH_R_IN_DATA_0) + int_length_r_in[31:0] <= (WDATA[31:0] & wmask) | (int_length_r_in[31:0] & ~wmask); + end +end + + +// int_length_r_out[31:0] +always @(posedge ACLK) begin + if (ARESET) + int_length_r_out[31:0] <= 0; + else if (ACLK_EN) begin + if (w_hs && waddr == 
ADDR_LENGTH_R_OUT_DATA_0) + int_length_r_out[31:0] <= (WDATA[31:0] & wmask) | (int_length_r_out[31:0] & ~wmask); + end +end + + +//------------------------Memory logic------------------- + +endmodule diff --git a/hls4ml/templates/vivado_accelerator/alveo/python_drivers/axi_stream_driver.py b/hls4ml/templates/vivado_accelerator/alveo/python_drivers/axi_stream_driver.py index b823a7a2e7..c589bcf057 100644 --- a/hls4ml/templates/vivado_accelerator/alveo/python_drivers/axi_stream_driver.py +++ b/hls4ml/templates/vivado_accelerator/alveo/python_drivers/axi_stream_driver.py @@ -1,101 +1,101 @@ -from datetime import datetime - -import numpy as np -from pynq import Overlay, allocate - - -class NeuralNetworkOverlay(Overlay): - def __init__(self, xclbin_name, dtbo=None, download=True, ignore_version=False, device=None): - super().__init__(xclbin_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device) - self.input_buffer = None - self.output_buffer = None - - def allocate_mem(self, X_shape, y_shape, dtype=np.float32, trg_in=None, trg_out=None): - """Buffer allocation in the accelerator's memory. - - Args: - X_shape (list): Input buffer shape. - y_shape (list): Output buffer shape. - dtype (dtype, optional): The data type of the elements of the input/output tensors. Must be an instance of - numpy dtype. Defaults to np.float32. - - It should be set depending on the interface of the accelerator; if it uses 'float' - data type for the 'data' AXI-Stream field, 'np.float32' dtype must be used. Instead if it uses - 'ap_fixed', 'np.intA' is the correct dtype to use. Note that A cannot any integer value, but it can - assume power of 2 values, i.e., {..., 8, 16, 32, ...}. Check `numpy` documentation for more information. - In this case the encoding/decoding has to be computed by the host machine. For example for - 'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode - 'float' -> 'ap_fixed<16,6>':: - - def encode(xi): - return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B) - def decode(yi): - return yi * 2**-10 - encode_v = np.vectorize(encode) # to apply them element-wise - decode_v = np.vectorize(decode) - - trg_in (optional): Input buffer target memory. By default the v++ command set it to HBM[0] for - alveo-u50. Defaults to None. - trg_out (optional): Output buffer target memory. By default the v++ command set it to HBM[0] for - alveo-u50. Defaults to None. - """ - self.input_buffer = allocate(shape=X_shape, dtype=dtype, target=trg_in) - self.output_buffer = allocate(shape=y_shape, dtype=dtype, target=trg_out) - - def predict(self, X, y_shape, dtype=np.float32, debug=False, profile=False, encode=None, decode=None): - """Obtain the predictions of the NN implemented in the FPGA. - - Args: - X (ndarray): The input tensor. - y_shape (list): The shape of the output tensor, needed by the accelerator to set the TLAST bit properly. - dtype (dtype, optional): The data type of the elements of the input/output tensors. Must be an instance of - numpy dtype. Defaults to np.float32. - debug (bool, optional): If set, the function will print information about the data transfers status. - Defaults to False. - profile (bool, optional): If set, the function will print the performance of the algorithm in terms of - inference/s. Defaults to False. - encode (Callable, optional): Function to transform the input tensor. Defaults to None. - decode (Callable, optional): Function to transform the output tensor. Defaults to None. 
-
-        Returns:
-            _type_: A ``np.ndarray`` with a shape equal of ``y_shape`` and ``dtype`` data type.
-        """
-        self.allocate_mem(X_shape=X.shape, y_shape=y_shape, dtype=dtype)
-        if profile:
-            timea = datetime.now()
-        if encode is not None:
-            X = encode(X)
-        in_size = np.prod(X.shape)
-        out_size = np.prod(y_shape)
-        self.input_buffer[:] = X
-        self.input_buffer.sync_to_device()
-        if debug:
-            print("Send OK")
-        self.krnl_rtl_1.call(self.input_buffer, self.output_buffer, in_size, out_size)
-        if debug:
-            print("Kernel call OK")
-        self.output_buffer.sync_from_device()
-        if debug:
-            print("Recieve OK")
-        result = self.output_buffer.copy()
-        if profile:
-            timeb = datetime.now()
-            dts, rate = self._print_dt(timea, timeb, len(X))
-            self.input_buffer.flush()
-            self.output_buffer.flush()
-            self.free()
-            return result, dts, rate
-        self.input_buffer.flush()
-        self.output_buffer.flush()
-        return result
-
-    def free_overlay(self):
-        self.free()
-
-    def _print_dt(self, timea, timeb, N):
-        dt = timeb - timea
-        dts = dt.seconds + dt.microseconds * 10**-6
-        rate = N / dts
-        print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)")
-        print(f"Or {1 / rate * 1e6} us / inferences")
-        return dts, rate
+from datetime import datetime
+
+import numpy as np
+from pynq import Overlay, allocate
+
+
+class NeuralNetworkOverlay(Overlay):
+    def __init__(self, xclbin_name, dtbo=None, download=True, ignore_version=False, device=None):
+        super().__init__(xclbin_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
+        self.input_buffer = None
+        self.output_buffer = None
+
+    def allocate_mem(self, X_shape, y_shape, dtype=np.float32, trg_in=None, trg_out=None):
+        """Buffer allocation in the accelerator's memory.
+
+        Args:
+            X_shape (list): Input buffer shape.
+            y_shape (list): Output buffer shape.
+            dtype (dtype, optional): The data type of the elements of the input/output tensors. Must be an instance of
+                numpy dtype. Defaults to np.float32.
+
+                It should be set depending on the interface of the accelerator; if it uses 'float'
+                data type for the 'data' AXI-Stream field, 'np.float32' dtype must be used. Instead, if it uses
+                'ap_fixed', 'np.intA' is the correct dtype to use. Note that A cannot be any integer value, but it can
+                assume power of 2 values, i.e., {..., 8, 16, 32, ...}. Check `numpy` documentation for more information.
+                In this case the encoding/decoding has to be computed by the host machine. For example for
+                'ap_fixed<16,6>' type the following 2 functions are the correct ones to use for encode/decode
+                'float' -> 'ap_fixed<16,6>'::
+
+                    def encode(xi):
+                        return np.int16(round(xi * 2**10))  # note 2**10 = 2**(A-B)
+                    def decode(yi):
+                        return yi * 2**-10
+                    encode_v = np.vectorize(encode)  # to apply them element-wise
+                    decode_v = np.vectorize(decode)
+
+            trg_in (optional): Input buffer target memory. By default the v++ command sets it to HBM[0] for
+                alveo-u50. Defaults to None.
+            trg_out (optional): Output buffer target memory. By default the v++ command sets it to HBM[0] for
+                alveo-u50. Defaults to None.
+        """
+        self.input_buffer = allocate(shape=X_shape, dtype=dtype, target=trg_in)
+        self.output_buffer = allocate(shape=y_shape, dtype=dtype, target=trg_out)
+
+    def predict(self, X, y_shape, dtype=np.float32, debug=False, profile=False, encode=None, decode=None):
+        """Obtain the predictions of the NN implemented in the FPGA.
+
+        Args:
+            X (ndarray): The input tensor.
+            y_shape (list): The shape of the output tensor, needed by the accelerator to set the TLAST bit properly.
+            dtype (dtype, optional): The data type of the elements of the input/output tensors. Must be an instance of
+                numpy dtype. Defaults to np.float32.
+            debug (bool, optional): If set, the function will print information about the data transfers status.
+                Defaults to False.
+            profile (bool, optional): If set, the function will print the performance of the algorithm in terms of
+                inference/s. Defaults to False.
+            encode (Callable, optional): Function to transform the input tensor. Defaults to None.
+            decode (Callable, optional): Function to transform the output tensor. Defaults to None.
+
+        Returns:
+            _type_: A ``np.ndarray`` with a shape equal to ``y_shape`` and ``dtype`` data type.
+        """
+        self.allocate_mem(X_shape=X.shape, y_shape=y_shape, dtype=dtype)
+        if profile:
+            timea = datetime.now()
+        if encode is not None:
+            X = encode(X)
+        in_size = np.prod(X.shape)
+        out_size = np.prod(y_shape)
+        self.input_buffer[:] = X
+        self.input_buffer.sync_to_device()
+        if debug:
+            print("Send OK")
+        self.krnl_rtl_1.call(self.input_buffer, self.output_buffer, in_size, out_size)
+        if debug:
+            print("Kernel call OK")
+        self.output_buffer.sync_from_device()
+        if debug:
+            print("Receive OK")
+        result = self.output_buffer.copy()
+        if profile:
+            timeb = datetime.now()
+            dts, rate = self._print_dt(timea, timeb, len(X))
+            self.input_buffer.flush()
+            self.output_buffer.flush()
+            self.free()
+            return result, dts, rate
+        self.input_buffer.flush()
+        self.output_buffer.flush()
+        return result
+
+    def free_overlay(self):
+        self.free()
+
+    def _print_dt(self, timea, timeb, N):
+        dt = timeb - timea
+        dts = dt.seconds + dt.microseconds * 10**-6
+        rate = N / dts
+        print(f"Classified {N} samples in {dts} seconds ({rate} inferences / s)")
+        print(f"Or {1 / rate * 1e6} us / inference")
+        return dts, rate
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl
index b704c2e0a8..c14aafb8cb 100644
--- a/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl
+++ b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_lite_design.tcl
@@ -1,26 +1,26 @@
-set tcldir [file dirname [info script]]
-source [file join $tcldir project.tcl]
-
-create_project project_1 ${project_name}_vivado_accelerator -part xc7z020clg400-1 -force
-
-set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
-set_property ip_repo_paths ${project_name}_prj [current_project]
-update_ip_catalog
-
-# Create Block Designer design
-create_bd_design "design_1"
-create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
-apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0]
-create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0
-apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/${project_name}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins ${project_name}_axi_0/s_axi_AXILiteS]
-
-make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
-add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
-
-reset_run impl_1
-reset_run
synth_1 -launch_runs impl_1 -to_step write_bitstream -jobs 6 -wait_on_run -timeout 360 impl_1 - -open_run impl_1 -report_utilization -file util.rpt -hierarchical -hierarchical_percentages +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${project_name}_vivado_accelerator -part xc7z020clg400-1 -force + +set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] +set_property ip_repo_paths ${project_name}_prj [current_project] +update_ip_catalog + +# Create Block Designer design +create_bd_design "design_1" +create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0 +apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] +create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/${project_name}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins ${project_name}_axi_0/s_axi_AXILiteS] + +make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top +add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl index de86ff4b74..c5549dc256 100644 --- a/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_stream_design.tcl @@ -1,59 +1,59 @@ -#@todo: try to remove startgroup and endgroup and see if it work -set tcldir [file dirname [info script]] -source [file join $tcldir project.tcl] - -create_project project_1 ${project_name}_vivado_accelerator -part xc7z020clg400-1 -force - -set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] -set_property ip_repo_paths ${project_name}_prj [current_project] -update_ip_catalog - -create_bd_design "design_1" - -startgroup -create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0 -endgroup - -apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] - -startgroup -set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0] -endgroup - -startgroup -create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 -endgroup - -set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] -set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width 
${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] - -startgroup -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] - -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] -endgroup - -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] - -startgroup -create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 -endgroup - -connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] -connect_bd_intf_net [get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] - -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] - -group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] - -make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top - -add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v - -reset_run impl_1 -reset_run synth_1 -launch_runs impl_1 -to_step write_bitstream -jobs 6 -wait_on_run -timeout 360 impl_1 - -open_run impl_1 -report_utilization -file util.rpt -hierarchical -hierarchical_percentages +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${project_name}_vivado_accelerator -part xc7z020clg400-1 -force + +set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] +set_property ip_repo_paths ${project_name}_prj [current_project] +update_ip_catalog + +create_bd_design "design_1" + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0 +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] + +startgroup +set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0] +endgroup + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup + +set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width 
${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 +endgroup + +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] +connect_bd_intf_net [get_bd_intf_pins ${project_name}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] + +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] + +group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] + +make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top + +add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vivado_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl index 033a12d913..5d886c6f25 100644 --- a/hls4ml/templates/vivado_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vivado_accelerator/zcu102/tcl_scripts/axi_stream_design.tcl @@ -1,58 +1,58 @@ -#@todo: try to remove startgroup and endgroup and see if it work -set tcldir [file dirname [info script]] -source [file join $tcldir project.tcl] - -create_project project_1 ${project_name}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force - -set_property board_part xilinx.com:zcu102:part0:3.3 [current_project] -set_property ip_repo_paths ${project_name}_prj [current_project] -update_ip_catalog - -create_bd_design "design_1" -set_property ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_project] -update_ip_catalog - -startgroup -create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 -endgroup - -apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0] - -set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] - -startgroup -create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 -endgroup -set_property -dict [list 
CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] -set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] - -startgroup -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] -endgroup - -startgroup -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] -apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] -endgroup - -startgroup -create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 -endgroup -connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] -connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r] - -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] -group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] - -make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top - -add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v - -reset_run impl_1 -reset_run synth_1 -launch_runs impl_1 -to_step write_bitstream -jobs 6 -wait_on_run -timeout 360 impl_1 - -open_run impl_1 -report_utilization -file util.rpt -hierarchical -hierarchical_percentages +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${project_name}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force + +set_property board_part xilinx.com:zcu102:part0:3.3 [current_project] +set_property ip_repo_paths ${project_name}_prj [current_project] +update_ip_catalog + +create_bd_design "design_1" +set_property ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_project] +update_ip_catalog + +startgroup +create_bd_cell -type ip -vlnv 
xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0] + +set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup +set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] +endgroup + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] +endgroup + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 +endgroup +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${project_name}_axi_0/in_r] +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r] + +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] +group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] + +make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top + +add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/utils/plot.py b/hls4ml/utils/plot.py index 24dd4f1cb3..e3424bb1ad 100644 --- a/hls4ml/utils/plot.py +++ b/hls4ml/utils/plot.py @@ 
-1,224 +1,224 @@
-# Heavily inspired by Keras's plot_model
-"""Utilities related to model visualization."""
-
-import os
-import sys
-
-try:
-    import pydot
-except ImportError:
-    pydot = None
-
-
-def check_pydot():
-    """Returns True if PyDot and Graphviz are available."""
-    if pydot is None:
-        return False
-    try:
-        # Attempt to create an image of a blank graph
-        # to check the pydot/graphviz installation.
-        pydot.Dot.create(pydot.Dot())
-        return True
-    except OSError:
-        return False
-
-
-def add_edge(dot, src, dst):
-    if not dot.get_edge(src, dst):
-        dot.add_edge(pydot.Edge(src, dst))
-
-
-def model_to_dot(
-    model, show_shapes=False, show_layer_names=True, show_precision=False, rankdir='TB', dpi=96, subgraph=False
-):
-    """Convert a HLS model to dot format.
-
-    Arguments:
-        model: A HLS model instance.
-        show_shapes: whether to display shape information.
-        show_layer_names: whether to display layer names.
-        show_precision: whether to display precision of layer's variables.
-        rankdir: `rankdir` argument passed to PyDot,
-            a string specifying the format of the plot:
-            'TB' creates a vertical plot;
-            'LR' creates a horizontal plot.
-        dpi: Dots per inch.
-        subgraph: whether to return a `pydot.Cluster` instance.
-
-    Returns:
-        A `pydot.Dot` instance representing the HLS model or
-        a `pydot.Cluster` instance representing nested model if
-        `subgraph=True`.
-
-    Raises:
-        ImportError: if graphviz or pydot are not available.
-    """
-
-    if not check_pydot():
-        if 'IPython.core.magics.namespace' in sys.modules:
-            # We don't raise an exception here in order to avoid crashing notebook
-            # tests where graphviz is not available.
-            print('Failed to import pydot. You must install pydot' ' and graphviz for `pydotprint` to work.')
-            return
-        else:
-            raise ImportError('Failed to import pydot. You must install pydot' ' and graphviz for `pydotprint` to work.')
-
-    if subgraph:
-        dot = pydot.Cluster(style='dashed', graph_name=model.name)
-        dot.set('label', model.name)
-        dot.set('labeljust', 'l')
-    else:
-        dot = pydot.Dot()
-        dot.set('rankdir', rankdir)
-        dot.set('concentrate', True)
-        dot.set('dpi', dpi)
-        dot.set_node_defaults(shape='record')
-
-    layers = model.get_layers()
-
-    # Create graph nodes.
-    for i, layer in enumerate(layers):
-        # layer_id = str(id(layer))
-        layer_id = str(layer.index)
-
-        # Append a wrapped layer's label to node's label, if it exists.
-        layer_name = layer.name
-        class_name = layer.class_name
-
-        # Create node's label.
-        if show_layer_names:
-            # label = '{}: {}'.format(class_name, layer_name)
-            # label = '{}\\l{}\\l'.format(class_name, layer_name)
-            label = f'{class_name}<br/>{layer_name}'
-        else:
-            label = class_name
-
-        # Rebuild the label as a table including input/output shapes.
-        if show_shapes:
-
-            def format_shape(shape):
-                return str(tuple(shape)).replace(str(None), '?')
-
-            input_labels = '?'
-            try:
-                output_labels = format_shape(layer.get_output_variable().shape)
-            except AttributeError:
-                output_labels = '?'
-            if class_name != 'Input':
-                if len(layer.inputs) > 1:
-                    input_shapes = []
-                    for i in layer.inputs:
-                        input_layer = layer.get_input_variable(i)
-                        if input_layer is not None:
-                            input_shapes.append(input_layer.shape)
-                        else:
-                            input_shapes.append('?')
-                    formatted_shapes = [format_shape(ishape) for ishape in input_shapes]
-                    input_labels = ', '.join(formatted_shapes)
-                else:
-                    input_layer = layer.get_input_variable()
-                    if input_layer is not None:
-                        input_labels = format_shape(input_layer.shape)
-            label = f'{label}\n|{{input: {input_labels}|output: {output_labels}}}'
-
-        # Rebuild the label as a table including tensor precision.
-        if show_precision:
-
-            def format_precision(precision):
-                return str(precision).replace('<', '&lt;').replace('>', '&gt;')
-
-            precision_labels = []
-            tensors = {}
-            tensors.update(layer.weights)
-            if len(layer.variables) == 1:
-                # A bit cleaner output
-                tensors['output'] = layer.get_output_variable()
-            else:
-                tensors.update(layer.variables)
-            for tensor_name, var in tensors.items():
-                if show_shapes:
-                    # tensor_label = '{} {}: {}'.format(tensor_name,
-                    tensor_label = '{} {}:{}'.format(
-                        tensor_name, format_shape(var.shape), format_precision(var.type.precision)
-                    )
-                else:
-                    # tensor_label = '{}: {}'.format(tensor_name,
-                    tensor_label = '{}:{}'.format(
-                        tensor_name, format_precision(var.type.precision)
-                    )
-                precision_labels.append(tensor_label)
-            # precision_label = '<br/>'.join(precision_labels)
-            precision_label = '<br/>'.join(precision_labels)
-            precision_label = '<sub>' + precision_label + '</sub>
' - label = f'{label}|{{{precision_label}}}' - - label = '<' + label + '>' - node = pydot.Node(layer_id, label=label) - dot.add_node(node) - - # Connect nodes with edges. - for layer in layers: - layer_id = str(layer.index) - for input_name in layer.inputs: - input_layer = layer.get_input_node(input_name) - if input_layer is not None: - input_layer_id = str(input_layer.index) - add_edge(dot, input_layer_id, layer_id) - - return dot - - -def plot_model( - model, to_file='model.png', show_shapes=False, show_layer_names=True, show_precision=False, rankdir='TB', dpi=96 -): - """Converts a HLS model to dot format and save to a file. - - Arguments: - model: A HLS model instance - to_file: File name of the plot image. - show_shapes: whether to display shape information. - show_layer_names: whether to display layer names. - show_precision: whether to display precision of layer's variables. - rankdir: `rankdir` argument passed to PyDot, - a string specifying the format of the plot: - 'TB' creates a vertical plot; - 'LR' creates a horizontal plot. - dpi: Dots per inch. - - Returns: - A Jupyter notebook Image object if Jupyter is installed. - This enables in-line display of the model plots in notebooks. - """ - dot = model_to_dot( - model, - show_shapes=show_shapes, - show_layer_names=show_layer_names, - show_precision=show_precision, - rankdir=rankdir, - dpi=dpi, - ) - if dot is None: - return - - if to_file is not None: - _, extension = os.path.splitext(to_file) - if not extension: - extension = 'png' - else: - extension = extension[1:] - # Save image to disk. - dot.write(to_file, format=extension) - else: - # Return the image as a Jupyter Image object, to be displayed in-line. - # Note that we cannot easily detect whether the code is running in a - # notebook, and thus we always return the Image if Jupyter is available. - try: - import tempfile - - from IPython import display - - temp = tempfile.NamedTemporaryFile(suffix='.png') - dot.write(temp.name, format='png') - return display.Image(filename=temp.name) - except ImportError: - pass +# Heavily inspired by Keras's plot_model +"""Utilities related to model visualization.""" + +import os +import sys + +try: + import pydot +except ImportError: + pydot = None + + +def check_pydot(): + """Returns True if PyDot and Graphviz are available.""" + if pydot is None: + return False + try: + # Attempt to create an image of a blank graph + # to check the pydot/graphviz installation. + pydot.Dot.create(pydot.Dot()) + return True + except OSError: + return False + + +def add_edge(dot, src, dst): + if not dot.get_edge(src, dst): + dot.add_edge(pydot.Edge(src, dst)) + + +def model_to_dot( + model, show_shapes=False, show_layer_names=True, show_precision=False, rankdir='TB', dpi=96, subgraph=False +): + """Convert a HLS model to dot format. + + Arguments: + model: A HLS model instance. + show_shapes: whether to display shape information. + show_layer_names: whether to display layer names. + show_precision: whether to display precision of layer's variables. + rankdir: `rankdir` argument passed to PyDot, + a string specifying the format of the plot: + 'TB' creates a vertical plot; + 'LR' creates a horizontal plot. + dpi: Dots per inch. + subgraph: whether to return a `pydot.Cluster` instance. + + Returns: + A `pydot.Dot` instance representing the HLS model or + a `pydot.Cluster` instance representing nested model if + `subgraph=True`. + + Raises: + ImportError: if graphviz or pydot are not available. 
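+
+    Example (a minimal sketch; assumes ``hls_model`` is an existing hls4ml model instance)::
+
+        dot = model_to_dot(hls_model, show_shapes=True, show_precision=True)
+        dot.write('hls_model.png', format='png')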
+    """
+
+    if not check_pydot():
+        if 'IPython.core.magics.namespace' in sys.modules:
+            # We don't raise an exception here in order to avoid crashing notebook
+            # tests where graphviz is not available.
+            print('Failed to import pydot. You must install pydot' ' and graphviz for `pydotprint` to work.')
+            return
+        else:
+            raise ImportError('Failed to import pydot. You must install pydot' ' and graphviz for `pydotprint` to work.')
+
+    if subgraph:
+        dot = pydot.Cluster(style='dashed', graph_name=model.name)
+        dot.set('label', model.name)
+        dot.set('labeljust', 'l')
+    else:
+        dot = pydot.Dot()
+        dot.set('rankdir', rankdir)
+        dot.set('concentrate', True)
+        dot.set('dpi', dpi)
+        dot.set_node_defaults(shape='record')
+
+    layers = model.get_layers()
+
+    # Create graph nodes.
+    for i, layer in enumerate(layers):
+        # layer_id = str(id(layer))
+        layer_id = str(layer.index)
+
+        # Append a wrapped layer's label to node's label, if it exists.
+        layer_name = layer.name
+        class_name = layer.class_name
+
+        # Create node's label.
+        if show_layer_names:
+            # label = '{}: {}'.format(class_name, layer_name)
+            # label = '{}\\l{}\\l'.format(class_name, layer_name)
+            label = f'{class_name}<br/>{layer_name}'
+        else:
+            label = class_name
+
+        # Rebuild the label as a table including input/output shapes.
+        if show_shapes:
+
+            def format_shape(shape):
+                return str(tuple(shape)).replace(str(None), '?')
+
+            input_labels = '?'
+            try:
+                output_labels = format_shape(layer.get_output_variable().shape)
+            except AttributeError:
+                output_labels = '?'
+            if class_name != 'Input':
+                if len(layer.inputs) > 1:
+                    input_shapes = []
+                    for i in layer.inputs:
+                        input_layer = layer.get_input_variable(i)
+                        if input_layer is not None:
+                            input_shapes.append(input_layer.shape)
+                        else:
+                            input_shapes.append('?')
+                    formatted_shapes = [format_shape(ishape) for ishape in input_shapes]
+                    input_labels = ', '.join(formatted_shapes)
+                else:
+                    input_layer = layer.get_input_variable()
+                    if input_layer is not None:
+                        input_labels = format_shape(input_layer.shape)
+            label = f'{label}\n|{{input: {input_labels}|output: {output_labels}}}'
+
+        # Rebuild the label as a table including tensor precision.
+        if show_precision:
+
+            def format_precision(precision):
+                return str(precision).replace('<', '&lt;').replace('>', '&gt;')
+
+            precision_labels = []
+            tensors = {}
+            tensors.update(layer.weights)
+            if len(layer.variables) == 1:
+                # A bit cleaner output
+                tensors['output'] = layer.get_output_variable()
+            else:
+                tensors.update(layer.variables)
+            for tensor_name, var in tensors.items():
+                if show_shapes:
+                    # tensor_label = '{} {}: {}'.format(tensor_name,
+                    tensor_label = '{} {}:{}'.format(
+                        tensor_name, format_shape(var.shape), format_precision(var.type.precision)
+                    )
+                else:
+                    # tensor_label = '{}: {}'.format(tensor_name,
+                    tensor_label = '{}:{}'.format(
+                        tensor_name, format_precision(var.type.precision)
+                    )
+                precision_labels.append(tensor_label)
+            # precision_label = '<br/>'.join(precision_labels)
+            precision_label = '<br/>'.join(precision_labels)
+            precision_label = '<sub>' + precision_label + '</sub>
' + label = f'{label}|{{{precision_label}}}' + + label = '<' + label + '>' + node = pydot.Node(layer_id, label=label) + dot.add_node(node) + + # Connect nodes with edges. + for layer in layers: + layer_id = str(layer.index) + for input_name in layer.inputs: + input_layer = layer.get_input_node(input_name) + if input_layer is not None: + input_layer_id = str(input_layer.index) + add_edge(dot, input_layer_id, layer_id) + + return dot + + +def plot_model( + model, to_file='model.png', show_shapes=False, show_layer_names=True, show_precision=False, rankdir='TB', dpi=96 +): + """Converts a HLS model to dot format and save to a file. + + Arguments: + model: A HLS model instance + to_file: File name of the plot image. + show_shapes: whether to display shape information. + show_layer_names: whether to display layer names. + show_precision: whether to display precision of layer's variables. + rankdir: `rankdir` argument passed to PyDot, + a string specifying the format of the plot: + 'TB' creates a vertical plot; + 'LR' creates a horizontal plot. + dpi: Dots per inch. + + Returns: + A Jupyter notebook Image object if Jupyter is installed. + This enables in-line display of the model plots in notebooks. + """ + dot = model_to_dot( + model, + show_shapes=show_shapes, + show_layer_names=show_layer_names, + show_precision=show_precision, + rankdir=rankdir, + dpi=dpi, + ) + if dot is None: + return + + if to_file is not None: + _, extension = os.path.splitext(to_file) + if not extension: + extension = 'png' + else: + extension = extension[1:] + # Save image to disk. + dot.write(to_file, format=extension) + else: + # Return the image as a Jupyter Image object, to be displayed in-line. + # Note that we cannot easily detect whether the code is running in a + # notebook, and thus we always return the Image if Jupyter is available. 
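+        # Render the graph to a temporary PNG on disk and hand the file to
+        # IPython's Image widget, which is constructed from a file path.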
+ try: + import tempfile + + from IPython import display + + temp = tempfile.NamedTemporaryFile(suffix='.png') + dot.write(temp.name, format='png') + return display.Image(filename=temp.name) + except ImportError: + pass From afbe00b16527134af0a4ad304af03c0efeff7394 Mon Sep 17 00:00:00 2001 From: Rian Flynn Date: Tue, 17 Sep 2024 17:28:41 -0400 Subject: [PATCH 36/55] trying to clean the diff --- .../backends/vivado/passes/core_templates.py | 23 ++- .../vivado/passes/resource_strategy.py | 20 ++- hls4ml/backends/vivado/vivado_backend.py | 38 ++--- hls4ml/model/graph.py | 5 +- hls4ml/model/layers.py | 153 +++++++++--------- hls4ml/templates/vivado/#vivado_synth.tcl# | 6 - 6 files changed, 131 insertions(+), 114 deletions(-) delete mode 100644 hls4ml/templates/vivado/#vivado_synth.tcl# diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index 41e5796917..c6f39efedb 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -1,6 +1,15 @@ from hls4ml.backends.backend import get_backend from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate -from hls4ml.model.layers import Activation, BatchNormalization, LayerNormalization, Dense, HardActivation, ParametrizedActivation, PReLU, Softmax +from hls4ml.model.layers import ( + Activation, + BatchNormalization, + Dense, + HardActivation, + LayerNormalization, + ParametrizedActivation, + PReLU, + Softmax, +) from hls4ml.model.optimizer.passes.hgq_proxy_model import UnaryLUT # Dense templates @@ -98,7 +107,7 @@ class BatchNormalizationFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(BatchNormalization, include_header=batchnorm_include_list) self.template = batchnorm_function_template - + def format(self, node): params = self._default_function_params(node) params['scale'] = node.get_weights('scale').name @@ -128,24 +137,28 @@ def format(self, node): layernorm_include_list = ['nnet_utils/nnet_layernorm.h'] + class LayerNormalizationConfigTemplate(LayerConfigTemplate): def __init__(self): super().__init__(LayerNormalization) self.template = layernorm_config_template - + def format(self, node): params = self._default_config_params(node) params['n_in'] = node.get_input_variable().size_cpp() params['seq_len'] = node.get_attr('seq_len') - params['product_type'] = get_backend('vivado').product_type(node.get_input_variable().type.precision, node.get_weights('scale').type.precision) + params['product_type'] = get_backend('vivado').product_type( + node.get_input_variable().type.precision, node.get_weights('scale').type.precision + ) return self.template.format(**params) + class LayerNormalizationFunctionTemplate(FunctionCallTemplate): def __init__(self): super().__init__(LayerNormalization, include_header=layernorm_include_list) self.template = layernorm_function_template - + def format(self, node): params = self._default_function_params(node) params['scale'] = node.get_weights('scale').name diff --git a/hls4ml/backends/vivado/passes/resource_strategy.py b/hls4ml/backends/vivado/passes/resource_strategy.py index daba61a6b0..4ed028fa32 100644 --- a/hls4ml/backends/vivado/passes/resource_strategy.py +++ b/hls4ml/backends/vivado/passes/resource_strategy.py @@ -1,14 +1,17 @@ import numpy as np -from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, SeparableConv1D, SeparableConv2D, MultiHeadAttention +from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, MultiHeadAttention, 
SeparableConv1D, SeparableConv2D from hls4ml.model.optimizer import OptimizerPass + class ApplyResourceStrategy(OptimizerPass): '''Transposes the weights to use the dense_resource matrix multiply routine''' def match(self, node): - - node_matches = isinstance(node, (Dense, Conv1D, SeparableConv1D, Conv2D, SeparableConv2D, LSTM, GRU, MultiHeadAttention)) + + node_matches = isinstance( + node, (Dense, Conv1D, SeparableConv1D, Conv2D, SeparableConv2D, LSTM, GRU, MultiHeadAttention) + ) is_resource_strategy = node.get_attr('strategy', '').lower() == 'resource' already_transformed = node.get_attr('_weights_transposed', False) is True @@ -40,11 +43,12 @@ def transform(self, model, node): elif isinstance(node, (LSTM, GRU)): node.weights['weight'].data = np.transpose(node.weights['weight'].data) node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) - elif isinstance(node, (MultiHeadAttention)): - # node.weights['key_weight'].data = np.transpose(node.weights['key_weight'].data, axes=[0, 2, 1]) - # node.weights['query_weight'].data = np.transpose(node.weights['query_weight'].data, axes=[0, 2, 1]) - # node.weights['value_weight'].data = np.transpose(node.weights['value_weight'].data, axes=[0, 2, 1]) - # node.weights['attention_output_weight'].data = np.transpose(node.weights['attention_output_weight'].data, axes=[2, 0, 1]) + elif isinstance(node, (MultiHeadAttention)): + # node.weights['key_weight'].data = np.transpose(node.weights['key_weight'].data, axes=[0, 2, 1]) + # node.weights['query_weight'].data = np.transpose(node.weights['query_weight'].data, axes=[0, 2, 1]) + # node.weights['value_weight'].data = np.transpose(node.weights['value_weight'].data, axes=[0, 2, 1]) + # node.weights['attention_output_weight'].data = + # np.transpose(node.weights['attention_output_weight'].data, axes=[2, 0, 1]) print("not transpose") else: raise Exception(f'Unexpected layer {node.class_name} with resource strategy') diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 20f77334f1..a680dce8cb 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -21,14 +21,14 @@ GarNet, GarNetStack, Layer, + LayerNormalization, + MultiHeadAttention, Pooling1D, Pooling2D, SeparableConv1D, SeparableConv2D, SimpleRNN, Softmax, - LayerNormalization, - MultiHeadAttention ) from hls4ml.model.optimizer import get_backend_passes, layer_optimizer from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType @@ -500,19 +500,19 @@ def init_softmax(self, layer): if 'inv_table_t' not in layer.attributes: layer.set_attr('inv_table_t', layer.get_attr('table_t')) if 'accum_t' not in layer.attributes: - layer.set_attr('accum_t', FixedPrecisionType(width=18, integer=8)) + layer.set_attr('accum_t', FixedPrecisionType(width=18, integer=8)) if 'inv_range' not in layer.attributes: - layer.set_attr('inv_range', 128) + layer.set_attr('inv_range', 128) if 'exp_range' not in layer.attributes: - layer.set_attr('exp_range', 8) + layer.set_attr('exp_range', 8) if layer.model.config.is_resource_strategy(layer): # 'resource' strategy = 'latency' for Softmax # layer.set_attr('implementation', 'latency') - layer.set_attr('implementation', 'legacy') # latency legacy stable - + layer.set_attr('implementation', 'legacy') # latency legacy stable + else: # layer.set_attr('implementation', layer.model.config.get_strategy(layer).lower()) - layer.set_attr('implementation', 'legacy') # latency legacy 
stable + layer.set_attr('implementation', 'legacy') # latency legacy stable if layer.model.config.get_config_value('IOType') == 'io_parallel': assert ( @@ -522,12 +522,13 @@ def init_softmax(self, layer): @layer_optimizer(LayerNormalization) def init_layernormalization(self, layer): if 'table_t' not in layer.attributes: - layer.set_attr('table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=32, integer=8))) + layer.set_attr( + 'table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=32, integer=8)) + ) if 'table_size' not in layer.attributes: - layer.set_attr('table_size', 2048) #table size + layer.set_attr('table_size', 2048) # table size if 'table_range' not in layer.attributes: - layer.set_attr('table_range', 1.0) #table range - + layer.set_attr('table_range', 1.0) # table range @layer_optimizer(Embedding) def init_embed(self, layer): @@ -602,16 +603,15 @@ def init_mha(self, layer): index_t = IntegerPrecisionType(width=1, signed=False) layer.set_attr('index_t', index_t) if 'table_t' not in layer.attributes: - layer.set_attr('table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=24, integer=8))) + layer.set_attr( + 'table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=24, integer=8)) + ) if 'table_size' not in layer.attributes: layer.set_attr('table_size', 2048) if 'accum_t' not in layer.attributes: - layer.set_attr('accum_t', FixedPrecisionType(width= 24, integer=8)) + layer.set_attr('accum_t', FixedPrecisionType(width=24, integer=8)) if 'inv_range' not in layer.attributes: - layer.set_attr('inv_range', 128) + layer.set_attr('inv_range', 128) if 'exp_range' not in layer.attributes: layer.set_attr('exp_range', 8) - layer.set_attr('strategy', 'resource') #latency - - - + layer.set_attr('strategy', 'resource') # latency diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index 8138d91bb3..b8d84228b3 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -588,8 +588,9 @@ def remove_node(self, node, rewire=True): next_node.inputs[i] = prev_node.outputs[0] break else: - # if not node.outputs[0] in self.outputs: ## would this be the key output_vars? because the self.outputs is the model final output - if not node.outputs[0] in self.output_vars.keys(): ## my change + # if not node.outputs[0] in self.outputs: + # would this be the key output_vars? 
because the self.outputs is the model final output + if not node.outputs[0] in self.output_vars.keys(): # my change raise Exception('Cannot rewire a node without child') else: raise Exception('Cannot rewire a node without a parent') diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 742d03a5ac..aff09073c6 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -400,7 +400,6 @@ class Dense(Layer): Attribute('n_in'), Attribute('n_out'), Attribute('seq_len'), - WeightAttribute('weight'), WeightAttribute('bias'), TypeAttribute('weight'), @@ -928,6 +927,7 @@ def initialize(self): self.add_weights_variable(name='scale', var_name='s{index}', data=scale) self.add_weights_variable(name='bias', var_name='b{index}', data=bias) + class LayerNormalization(Layer): _expected_attributes = [ Attribute('n_in'), @@ -935,7 +935,6 @@ class LayerNormalization(Layer): Attribute('seq_len'), WeightAttribute('scale'), WeightAttribute('bias'), - TypeAttribute('scale'), TypeAttribute('bias'), ] @@ -956,7 +955,6 @@ def initialize(self): self.add_weights_variable(name='bias', var_name='b{index}', data=bias) - class Merge(Layer): def initialize(self): assert len(self.inputs) == 2 @@ -1450,7 +1448,6 @@ class MultiHeadAttention(Layer): Attribute('head_dim_value'), Attribute('feature_dim'), Attribute('seq_len'), - WeightAttribute('attention_output_weight'), WeightAttribute('attention_output_bias'), WeightAttribute('key_weight'), @@ -1459,7 +1456,6 @@ class MultiHeadAttention(Layer): WeightAttribute('query_bias'), WeightAttribute('value_weight'), WeightAttribute('value_bias'), - TypeAttribute('attention_output_weight'), TypeAttribute('attention_output_bias'), TypeAttribute('key_weight'), @@ -1472,18 +1468,18 @@ class MultiHeadAttention(Layer): def initialize(self): weights_source = [ - ('attention_output', 'kernel'), - ('attention_output', 'bias'), - ('key', 'kernel'), - ('key', 'bias'), - ('query', 'kernel'), - ('query', 'bias'), - ('value', 'kernel'), - ('value', 'bias'), - ] - - for lname, wtype in weights_source: - data = self.model.get_weights_data(self.name, '{lname}/{wtype}'.format(lname=lname, wtype=wtype)) + ('attention_output', 'kernel'), + ('attention_output', 'bias'), + ('key', 'kernel'), + ('key', 'bias'), + ('query', 'kernel'), + ('query', 'bias'), + ('value', 'kernel'), + ('value', 'bias'), + ] + + for lname, wtype in weights_source: + data = self.model.get_weights_data(self.name, f'{lname}/{wtype}') if wtype == 'kernel': vtype = 'weight' if lname in ['key', 'query', 'value']: @@ -1494,68 +1490,77 @@ def initialize(self): else: vtype = 'bias' - name = '{}_{}'.format(lname, vtype) - var_name = '{}_{}{{index}}'.format(lname, vtype) + name = f'{lname}_{vtype}' + var_name = f'{lname}_{vtype}{{index}}' self.add_weights_variable(name=name, var_name=var_name, data=data) - + shape = self.attributes['query_shape'][1:] - dims = ['seq_out_{}'.format(self.index), 'feature_out_{}'.format(self.index)] + dims = [f'seq_out_{self.index}', f'feature_out_{self.index}'] self.add_output_variable(shape, dims) -layer_map = { - 'Input' : Input, - 'InputLayer' : Input, - 'Activation' : Activation, - 'QActivation' : Activation, - 'LeakyReLU' : ParametrizedActivation, - 'ThresholdedReLU' : ParametrizedActivation, - 'ELU' : ParametrizedActivation, - 'PReLU' : PReLU, - 'Softmax' : Softmax, - 'TernaryTanh' : TernaryTanh, - 'Reshape' : Reshape, - 'Dense' : Dense, - 'BinaryDense' : Dense, - 'TernaryDense' : Dense, - 'QDense' : Dense, - 'Conv1D' : Conv1D, - 'QConv1D' : Conv1D, - 'Conv2D' : Conv2D, - 'BinaryConv2D' 
: Conv2D, - 'QConv2D' : Conv2D, - 'QConv2DBatchnorm' : Conv2DBatchnorm, - 'SeparableConv1D' : SeparableConv1D, - 'SeparableConv2D' : SeparableConv2D, - 'DepthwiseConv2D' : DepthwiseConv2D, - 'BatchNormalization' : BatchNormalization, - 'QBatchNormalization' : BatchNormalization, - 'MaxPooling1D' : Pooling1D, - 'AveragePooling1D' : Pooling1D, - 'MaxPooling2D' : Pooling2D, - 'AveragePooling2D' : Pooling2D, - 'GlobalMaxPooling1D' : GlobalPooling1D, - 'GlobalAveragePooling1D' : GlobalPooling1D, - 'GlobalMaxPooling2D' : GlobalPooling2D, - 'GlobalAveragePooling2D' : GlobalPooling2D, - 'ZeroPadding1D' : ZeroPadding1D, - 'ZeroPadding2D' : ZeroPadding2D, - 'Merge' : Merge, - 'Dot' : Dot, - 'Concatenate' : Concatenate, - 'Resize' : Resize, - 'UpSampling1D' : Resize, - 'UpSampling2D' : Resize, - 'Transpose' : Transpose, - 'Embedding' : Embedding, - 'SimpleRNN' : SimpleRNN, - 'LSTM' : LSTM, - 'GRU' : GRU, - 'GarNet' : GarNet, - 'GarNetStack' : GarNetStack, - 'MultiHeadAttention' : MultiHeadAttention, - 'LayerNormalization' : LayerNormalization, - +layer_map = { + 'Input': Input, + 'InputLayer': Input, + 'Activation': Activation, + 'QActivation': Activation, + 'LeakyReLU': ParametrizedActivation, + 'ThresholdedReLU': ParametrizedActivation, + 'ELU': ParametrizedActivation, + 'PReLU': PReLU, + 'Softmax': Softmax, + 'TernaryTanh': TernaryTanh, + 'HardActivation': HardActivation, + 'Reshape': Reshape, + 'Dense': Dense, + 'BinaryDense': Dense, + 'TernaryDense': Dense, + 'QDense': Dense, + 'Conv1D': Conv1D, + 'QConv1D': Conv1D, + 'Conv2D': Conv2D, + 'BinaryConv2D': Conv2D, + 'QConv2D': Conv2D, + 'QConv2DBatchnorm': Conv2DBatchnorm, + 'SeparableConv1D': SeparableConv1D, + 'QSeparableConv1D': SeparableConv1D, + 'DepthwiseConv1D': DepthwiseConv1D, + 'SeparableConv2D': SeparableConv2D, + 'QSeparableConv2D': SeparableConv2D, + 'DepthwiseConv2D': DepthwiseConv2D, + 'QDepthwiseConv2D': DepthwiseConv2D, + 'BatchNormalization': BatchNormalization, + 'QBatchNormalization': BatchNormalization, + 'MaxPooling1D': Pooling1D, + 'AveragePooling1D': Pooling1D, + 'MaxPooling2D': Pooling2D, + 'AveragePooling2D': Pooling2D, + 'GlobalMaxPooling1D': GlobalPooling1D, + 'GlobalAveragePooling1D': GlobalPooling1D, + 'GlobalMaxPooling2D': GlobalPooling2D, + 'GlobalAveragePooling2D': GlobalPooling2D, + 'ZeroPadding1D': ZeroPadding1D, + 'ZeroPadding2D': ZeroPadding2D, + 'Merge': Merge, + 'Dot': Dot, + 'Concatenate': Concatenate, + 'Resize': Resize, + 'UpSampling1D': Resize, + 'UpSampling2D': Resize, + 'Transpose': Transpose, + 'Embedding': Embedding, + 'SimpleRNN': SimpleRNN, + 'LSTM': LSTM, + 'GRU': GRU, + 'QSimpleRNN': SimpleRNN, + 'QLSTM': LSTM, + 'QGRU': GRU, + 'GarNet': GarNet, + 'GarNetStack': GarNetStack, + 'LayerGroup': LayerGroup, + 'SymbolicExpression': SymbolicExpression, + 'MultiHeadAttention': MultiHeadAttention, + 'LayerNormalization': LayerNormalization, # TensorFlow-specific layers: 'BiasAdd': BiasAdd, } diff --git a/hls4ml/templates/vivado/#vivado_synth.tcl# b/hls4ml/templates/vivado/#vivado_synth.tcl# deleted file mode 100644 index fba1387c5a..0000000000 --- a/hls4ml/templates/vivado/#vivado_synth.tcl# +++ /dev/null @@ -1,6 +0,0 @@ -set tcldir [file dirname [info script]] -source [file join $tcldir project.tcl] - -add_files ${project_name}_prj/solution1/syn/verilog -synth_design -top ${project_name} -part $part -report_utilization -file vivado_synth.rpt From dedf96c73bb3f4ba1c91abc218e95f3652b15090 Mon Sep 17 00:00:00 2001 From: Rian Flynn Date: Tue, 17 Sep 2024 17:31:34 -0400 Subject: [PATCH 37/55] trying to clean 
the diff --- .../templates/vivado/ap_types/ap_shift_reg.h | 276 +++++++++--------- 1 file changed, 138 insertions(+), 138 deletions(-) diff --git a/hls4ml/templates/vivado/ap_types/ap_shift_reg.h b/hls4ml/templates/vivado/ap_types/ap_shift_reg.h index 1539ba5e61..94dba51e46 100644 --- a/hls4ml/templates/vivado/ap_types/ap_shift_reg.h +++ b/hls4ml/templates/vivado/ap_types/ap_shift_reg.h @@ -1,138 +1,138 @@ -/* -#- (c) Copyright 2011-2019 Xilinx, Inc. All rights reserved. -#- -#- This file contains confidential and proprietary information -#- of Xilinx, Inc. and is protected under U.S. and -#- international copyright and other intellectual property -#- laws. -#- -#- DISCLAIMER -#- This disclaimer is not a license and does not grant any -#- rights to the materials distributed herewith. Except as -#- otherwise provided in a valid license issued to you by -#- Xilinx, and to the maximum extent permitted by applicable -#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND -#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES -#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING -#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- -#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and -#- (2) Xilinx shall not be liable (whether in contract or tort, -#- including negligence, or under any other theory of -#- liability) for any loss or damage of any kind or nature -#- related to, arising under or in connection with these -#- materials, including for any direct, or any indirect, -#- special, incidental, or consequential loss or damage -#- (including loss of data, profits, goodwill, or any type of -#- loss or damage suffered as a result of any action brought -#- by a third party) even if such damage or loss was -#- reasonably foreseeable or Xilinx had been advised of the -#- possibility of the same. -#- -#- CRITICAL APPLICATIONS -#- Xilinx products are not designed or intended to be fail- -#- safe, or for use in any application requiring fail-safe -#- performance, such as life-support or safety devices or -#- systems, Class III medical devices, nuclear facilities, -#- applications related to the deployment of airbags, or any -#- other applications that could lead to death, personal -#- injury, or severe property or environmental damage -#- (individually and collectively, "Critical -#- Applications"). Customer assumes the sole risk and -#- liability of any use of Xilinx products in Critical -#- Applications, subject only to applicable laws and -#- regulations governing limitations on product liability. -#- -#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS -#- PART OF THIS FILE AT ALL TIMES. -#- ************************************************************************ - - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -#ifndef __SIM_AP_SHIFT_REG_H__ -#define __SIM_AP_SHIFT_REG_H__ - - -/* - * This file contains a C++ model of shift register. - * It defines C level simulation model. 
- */ -#ifndef __cplusplus -#error C++ is required to include this header file -#else - -#include - -////////////////////////////////////////////// -// C level simulation model for ap_shift_reg -////////////////////////////////////////////// -template -class ap_shift_reg -{ - public: - /// Constructors - ap_shift_reg() { } - ap_shift_reg(const char* name) { } - /// Destructor - virtual ~ap_shift_reg() { } - - private: - /// Make copy constructor and assignment operator private - ap_shift_reg(const ap_shift_reg< __SHIFT_T__, __SHIFT_DEPTH__ >& shreg) - { - for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i) - Array[i] = shreg.Array[i]; - } - - ap_shift_reg& operator = (const ap_shift_reg< __SHIFT_T__, - __SHIFT_DEPTH__ >& shreg) - { - for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i) - Array[i] = shreg.Array[i]; - return *this; - } - - public: - // Shift the queue, push to back and read from a given address. - __SHIFT_T__ shift(__SHIFT_T__ DataIn, - unsigned int Addr = __SHIFT_DEPTH__ - 1, bool Enable = true) - { - assert(Addr < __SHIFT_DEPTH__ && - "Out-of-bound shift is found in ap_shift_reg."); - __SHIFT_T__ ret = Array[Addr]; - if (Enable) { - for (unsigned int i = __SHIFT_DEPTH__ - 1; i > 0; --i) - Array[i] = Array[i-1]; - Array[0] = DataIn; - } - return ret; - } - - // Read from a given address. - __SHIFT_T__ read(unsigned int Addr = __SHIFT_DEPTH__ - 1) const - { - assert(Addr < __SHIFT_DEPTH__ && - "Out-of-bound read is found in ap_shift_reg."); - return Array[Addr]; - } - - protected: - __SHIFT_T__ Array[__SHIFT_DEPTH__]; -}; - -#endif //__cplusplus - -#endif //__SIM_AP_SHIFT_REG_H__ - - +/* +#- (c) Copyright 2011-2019 Xilinx, Inc. All rights reserved. +#- +#- This file contains confidential and proprietary information +#- of Xilinx, Inc. and is protected under U.S. and +#- international copyright and other intellectual property +#- laws. +#- +#- DISCLAIMER +#- This disclaimer is not a license and does not grant any +#- rights to the materials distributed herewith. Except as +#- otherwise provided in a valid license issued to you by +#- Xilinx, and to the maximum extent permitted by applicable +#- law: (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND +#- WITH ALL FAULTS, AND XILINX HEREBY DISCLAIMS ALL WARRANTIES +#- AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY, INCLUDING +#- BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON- +#- INFRINGEMENT, OR FITNESS FOR ANY PARTICULAR PURPOSE; and +#- (2) Xilinx shall not be liable (whether in contract or tort, +#- including negligence, or under any other theory of +#- liability) for any loss or damage of any kind or nature +#- related to, arising under or in connection with these +#- materials, including for any direct, or any indirect, +#- special, incidental, or consequential loss or damage +#- (including loss of data, profits, goodwill, or any type of +#- loss or damage suffered as a result of any action brought +#- by a third party) even if such damage or loss was +#- reasonably foreseeable or Xilinx had been advised of the +#- possibility of the same. 
+#- +#- CRITICAL APPLICATIONS +#- Xilinx products are not designed or intended to be fail- +#- safe, or for use in any application requiring fail-safe +#- performance, such as life-support or safety devices or +#- systems, Class III medical devices, nuclear facilities, +#- applications related to the deployment of airbags, or any +#- other applications that could lead to death, personal +#- injury, or severe property or environmental damage +#- (individually and collectively, "Critical +#- Applications"). Customer assumes the sole risk and +#- liability of any use of Xilinx products in Critical +#- Applications, subject only to applicable laws and +#- regulations governing limitations on product liability. +#- +#- THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS +#- PART OF THIS FILE AT ALL TIMES. +#- ************************************************************************ + + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#ifndef __SIM_AP_SHIFT_REG_H__ +#define __SIM_AP_SHIFT_REG_H__ + + +/* + * This file contains a C++ model of shift register. + * It defines C level simulation model. + */ +#ifndef __cplusplus +#error C++ is required to include this header file +#else + +#include + +////////////////////////////////////////////// +// C level simulation model for ap_shift_reg +////////////////////////////////////////////// +template +class ap_shift_reg +{ + public: + /// Constructors + ap_shift_reg() { } + ap_shift_reg(const char* name) { } + /// Destructor + virtual ~ap_shift_reg() { } + + private: + /// Make copy constructor and assignment operator private + ap_shift_reg(const ap_shift_reg< __SHIFT_T__, __SHIFT_DEPTH__ >& shreg) + { + for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i) + Array[i] = shreg.Array[i]; + } + + ap_shift_reg& operator = (const ap_shift_reg< __SHIFT_T__, + __SHIFT_DEPTH__ >& shreg) + { + for (unsigned i = 0; i < __SHIFT_DEPTH__; ++i) + Array[i] = shreg.Array[i]; + return *this; + } + + public: + // Shift the queue, push to back and read from a given address. + __SHIFT_T__ shift(__SHIFT_T__ DataIn, + unsigned int Addr = __SHIFT_DEPTH__ - 1, bool Enable = true) + { + assert(Addr < __SHIFT_DEPTH__ && + "Out-of-bound shift is found in ap_shift_reg."); + __SHIFT_T__ ret = Array[Addr]; + if (Enable) { + for (unsigned int i = __SHIFT_DEPTH__ - 1; i > 0; --i) + Array[i] = Array[i-1]; + Array[0] = DataIn; + } + return ret; + } + + // Read from a given address. 
+    __SHIFT_T__ read(unsigned int Addr = __SHIFT_DEPTH__ - 1) const
+    {
+        assert(Addr < __SHIFT_DEPTH__ &&
+               "Out-of-bound read is found in ap_shift_reg.");
+        return Array[Addr];
+    }
+
+  protected:
+    __SHIFT_T__ Array[__SHIFT_DEPTH__];
+};
+
+#endif //__cplusplus
+
+#endif //__SIM_AP_SHIFT_REG_H__
+
+

From a9de9cbd835532bd776fd0d4ddb64400e0da1c35 Mon Sep 17 00:00:00 2001
From: Rian Flynn
Date: Tue, 17 Sep 2024 20:38:19 -0400
Subject: [PATCH 38/55] undo vhdl -> verilog change

---
 hls4ml/templates/vivado/vivado_synth.tcl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/hls4ml/templates/vivado/vivado_synth.tcl b/hls4ml/templates/vivado/vivado_synth.tcl
index fba1387c5a..4634b166f6 100644
--- a/hls4ml/templates/vivado/vivado_synth.tcl
+++ b/hls4ml/templates/vivado/vivado_synth.tcl
@@ -1,6 +1,6 @@
-set tcldir [file dirname [info script]]
-source [file join $tcldir project.tcl]
-
-add_files ${project_name}_prj/solution1/syn/verilog
-synth_design -top ${project_name} -part $part
-report_utilization -file vivado_synth.rpt
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+add_files ${project_name}_prj/solution1/syn/vhdl
+synth_design -top ${project_name} -part $part
+report_utilization -file vivado_synth.rpt

From 49313d358b97bec3ebbcab841c1c897b172bdfc7 Mon Sep 17 00:00:00 2001
From: Rian Flynn
Date: Wed, 18 Sep 2024 20:38:45 +0800
Subject: [PATCH 39/55] Add a first, partially working LayerNormalization
 implementation and test

---
 hls4ml/converters/keras/core.py               |   5 +-
 hls4ml/converters/keras_to_hls.py             |   2 +-
 hls4ml/converters/pytorch/core.py             |   4 +
 hls4ml/model/layers.py                        |  13 +-
 hls4ml/model/profiling.py                     |   1 +
 .../vivado/nnet_utils/nnet_layernorm.h        | 273 +-----------------
 test/pytest/test_layernorm.py                 |  47 +++
 7 files changed, 66 insertions(+), 279 deletions(-)
 create mode 100644 test/pytest/test_layernorm.py

diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py
index 003eb111d9..9705a2da66 100644
--- a/hls4ml/converters/keras/core.py
+++ b/hls4ml/converters/keras/core.py
@@ -125,7 +125,7 @@ def parse_batchnorm_layer(keras_layer, input_names, input_shapes, data_reader):
 
 
 @keras_handler('LayerNormalization')
-def parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader, config):
+def parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader):
     assert 'LayerNormalization' in keras_layer['class_name']
 
     layer = parse_default_keras_layer(keras_layer, input_names)
@@ -147,6 +147,9 @@ def parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader, c
     layer['n_in'] = in_size
     layer['n_out'] = layer['n_in']
 
+    layer['gamma_data'] = get_weights_data(data_reader, layer['name'], 'gamma')
+    layer['beta_data'] = get_weights_data(data_reader, layer['name'], 'beta')
+
     return layer, [shape for shape in input_shapes[0]]
 
 
diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py
index 4d008dba9f..b652b5b724 100644
--- a/hls4ml/converters/keras_to_hls.py
+++ b/hls4ml/converters/keras_to_hls.py
@@ -339,5 +339,5 @@ def keras_to_hls(config):
     model_arch, reader = get_model_arch(config)
     layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader)
     print('Creating HLS model')
-    hls_model = ModelGraph(config, reader, layer_list, input_layers, output_layers)
+    hls_model = ModelGraph(config, layer_list, input_layers, output_layers)
    return hls_model
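As a point of reference for the converter and kernel changes in this patch: the computation a LayerNormalization layer has to reproduce fits in a few lines of NumPy. The sketch below is illustrative only and is not part of the patch; layernorm_ref and its argument names are invented here, and epsilon uses the Keras default of 1e-3.

import numpy as np


def layernorm_ref(x, gamma, beta, epsilon=1e-3):
    # x: (batch, seq_len, features); normalize over the last (feature) axis
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    # 1 / sqrt(var + epsilon) is the factor the HLS kernel approximates with a lookup table
    return gamma * (x - mean) / np.sqrt(var + epsilon) + beta

Comparing such a reference against hls_model.predict() is essentially what the test added later in this patch does, with atol = 5e-3.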
diff --git a/hls4ml/converters/pytorch/core.py b/hls4ml/converters/pytorch/core.py
index d3ba470bf5..89f330ca7f 100644
--- a/hls4ml/converters/pytorch/core.py
+++ b/hls4ml/converters/pytorch/core.py
@@ -31,6 +31,10 @@ def parse_linear_layer(operation, layer_name, input_names, input_shapes, node, c
 
     output_shape = input_shapes[0][:]
     output_shape[-1] = layer['n_out']
 
+    if len(input_shapes[0]) == 3:
+        layer['seq_len'] = output_shape[-2]
+    else:
+        layer['seq_len'] = 1
 
     return layer, output_shape
 
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index aff09073c6..9c8ed954ba 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -931,7 +931,6 @@ def initialize(self):
 class LayerNormalization(Layer):
     _expected_attributes = [
         Attribute('n_in'),
-        # Attribute('axis', default=-1),
         Attribute('seq_len'),
         WeightAttribute('scale'),
         WeightAttribute('bias'),
@@ -945,14 +944,14 @@ def initialize(self):
         dims = inp.dim_names
         self.add_output_variable(shape, dims)
 
-        gamma = self.model.get_weights_data(self.name, 'gamma')
-        beta = self.model.get_weights_data(self.name, 'beta')
+        scale = self.get_attr('gamma_data')
+        bias = self.get_attr('beta_data')
 
-        scale = gamma
-        bias = beta
+        scale_precision = self.get_attr('scale_t', default=FixedPrecisionType(width=32, integer=4, signed=True))
+        bias_precision = self.get_attr('bias_t', default=FixedPrecisionType(width=32, integer=4, signed=True))
 
-        self.add_weights_variable(name='scale', var_name='s{index}', data=scale)
-        self.add_weights_variable(name='bias', var_name='b{index}', data=bias)
+        self.add_weights_variable(name='scale', var_name='s{index}', precision=scale_precision, data=scale)
+        self.add_weights_variable(name='bias', var_name='b{index}', precision=bias_precision, data=bias)
 
 
 class Merge(Layer):
diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py
index 2cbb4916cd..92749705e4 100644
--- a/hls4ml/model/profiling.py
+++ b/hls4ml/model/profiling.py
@@ -295,6 +295,7 @@ def _keras_lstm(layer):
     {
         'BatchNormalization': _keras_batchnorm,
         'QBatchNormalization': _keras_batchnorm,
+        'LayerNormalization': _keras_layernorm,
         'LSTM': _keras_lstm,
         'QLSTM': _keras_lstm,
     },
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h
index afed57802b..3417500a05 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h
@@ -1,33 +1,12 @@
-//
-// rfnoc-hls-neuralnet: Vivado HLS code for neural-net building blocks
-//
-// Copyright (C) 2017 EJ Kreinar
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program.  If not, see <http://www.gnu.org/licenses/>.
-// - #ifndef NNET_LAYERNORM_H_ #define NNET_LAYERNORM_H_ #include "hls_stream.h" #include "nnet_common.h" #include "nnet_dense.h" -#include #include #include "hls_math.h" -// #include "ap_fixed.h" namespace nnet { @@ -65,85 +44,6 @@ template void init_invert_sqr_table(typename CO } } -template void init_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) { - float inv_range = 0.5; /// if not acurrate increase this - // Inversion function: - // result = 1/sqrt(x) - for (int ii = 0; ii < N_TABLE; ii++) { - // First, convert from table index to X-value (signed 8-bit, range 0 to +0.01) - float in_val = inv_range * ii / float(N_TABLE); - // Next, compute lookup table function - if (in_val > 0.0) - table_out[ii] = sqrt(in_val); - else - table_out[ii] = 0.0; - } -} - -// template -// void layernorm_1d( -// data_T data[CONFIG_T::n_in/CONFIG_T::seq_len], -// res_T res[CONFIG_T::n_in/CONFIG_T::seq_len], -// typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len], -// typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len] -// ) -// { -// #pragma HLS PIPELINE II=CONFIG_T::reuse_factor -// #pragma HLS ARRAY_PARTITION variable=data complete -// #pragma HLS ARRAY_PARTITION variable=res complete - -// int inv_range_inv = (int) 1/ 0.5; -// typename CONFIG_T::table_t sqr = 0; -// #ifdef __HLS_SYN__ -// bool initialized = false; -// typename CONFIG_T::table_t sqr_table[CONFIG_T::table_size]; -// #else -// static bool initialized = false; -// static typename CONFIG_T::table_t sqr_table[CONFIG_T::table_size]; -// #endif -// if (!initialized) { -// init_sqr_table(sqr_table); -// initialized = true; -// } - -// static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len; -// data_T sum_cache = 0; -// data_T sum_cache2 = 0; -// data_T var, mean, diff, inv_sqr; -// data_T data_diff[dim]; -// data_T data_norm[dim]; - -// #pragma HLS ARRAY_PARTITION variable=data_diff complete -// #pragma HLS ARRAY_PARTITION variable=data_diff complete - -// const data_T k_inv = 1.0/dim; -// for (int i = 0; i < dim; ++i){ -// sum_cache += data[i]; -// } -// mean = CONFIG_T::template product::product(sum_cache, k_inv); - -// for (int i = 0; i < dim; ++i){ -// data_diff[i] = data[i] - mean; -// diff = data_diff[i]*data_diff[i]; -// sum_cache2 += diff; -// } -// var = CONFIG_T::template product::product(sum_cache2, k_inv); - -// int index = var*(CONFIG_T::table_size)*inv_range_inv; -// if (index < 0) index = 0; -// if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; -// sqr = (typename CONFIG_T::table_t) sqr_table[index]; -// inv_sqr = 1 / sqr; - -// for (int i = 0; i < dim; ++i){ -// res[i] = data_diff[i] * inv_sqr * scale[i] + bias[i]; -// } - -// } - -////////////////////// -// Dennis's version // -////////////////////// template void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CONFIG_T::n_in / CONFIG_T::seq_len], typename CONFIG_T::scale_t scale[CONFIG_T::n_in / CONFIG_T::seq_len], @@ -170,17 +70,7 @@ void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CON typename CONFIG_T::mean_t sum_cache2 = 0; typename CONFIG_T::mean_t var, mean, diff; typename CONFIG_T::mean_t data_diff[dim]; - typename CONFIG_T::mean_t data_norm[dim]; - // data_T sum_cache = 0; - // data_T sum_cache2 = 0; - // data_T var, mean, diff; - //// typename CONFIG_T::mean_t mean; - //// typename CONFIG_T::var_t var; - //// typename CONFIG_T::diff_t diff; - // data_T data_diff[dim]; - // data_T data_norm[dim]; - #pragma HLS ARRAY_PARTITION variable=data_diff complete 
#pragma HLS ARRAY_PARTITION variable=data_diff complete const typename CONFIG_T::mean_t k_inv = 1.0 / dim; @@ -188,177 +78,28 @@ void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CON sum_cache += static_cast(data[i]); } mean = CONFIG_T::template product::product(sum_cache, k_inv); - // std::cout << "mean: " << std::endl; - // std::cout << mean << std::endl; for (int i = 0; i < dim; ++i) { data_diff[i] = static_cast(data[i]) - mean; diff = data_diff[i] * data_diff[i]; sum_cache2 += diff; - // std::cout << "data_diff: " << std::endl; - // std::cout << data_diff[i] << std::endl; - // std::cout << " " << std::endl; } var = CONFIG_T::template product::product(sum_cache2, k_inv); - // std::cout << "var: " << std::endl; - // std::cout << var << std::endl; - // std::cout << " " << std::endl; - int index = var * (CONFIG_T::table_size)*inv_range_inv; + int index = var * (CONFIG_T::table_size) * inv_range_inv; if (CONFIG_T::table_range > 1) index = var * (CONFIG_T::table_size) / (int)CONFIG_T::table_range; - if (index < 0) - index = 0; + if (index < 1) // Avoid division by zero + index = 1; if (index > CONFIG_T::table_size - 1) index = CONFIG_T::table_size - 1; deno_inver = (typename CONFIG_T::table_t)invert_sqr_table[index]; - // std::cout << "deno_inver: " << std::endl; - // std::cout << deno_inver << std::endl; - // std::cout << " " << std::endl; - - // std::cout << "index: " << std::endl; - // std::cout << index << std::endl; - // std::cout << " " << std::endl; for (int i = 0; i < dim; ++i) { res[i] = data_diff[i] * deno_inver * scale[i] + bias[i]; } } -//////////////////////// -// Original One Ethan's// -//////////////////////// -// template -// void layernorm_1d( -// data_T data[CONFIG_T::n_in/CONFIG_T::seq_len], -// res_T res[CONFIG_T::n_in/CONFIG_T::seq_len], -// typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len], -// typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len] -//) -//{ -//#pragma HLS PIPELINE II=CONFIG_T::reuse_factor -//#pragma HLS ARRAY_PARTITION variable=data complete -//#pragma HLS ARRAY_PARTITION variable=res complete -// -// int inv_range_inv = (int) 1/ CONFIG_T::table_range; -// typename CONFIG_T::table_t deno_inver = 0; -//#ifdef __HLS_SYN__ -// bool initialized = false; -// typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size]; -//#else -// static bool initialized = false; -// static typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size]; -//#endif -// if (!initialized) { -// init_invert_sqr_table(invert_sqr_table); -// initialized = true; -// } -// -// static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len; -// data_T sum_cache = 0; -// data_T sum_cache2 = 0; -// data_T var, mean, diff; -// data_T data_diff[dim]; -// data_T data_norm[dim]; -// -// #pragma HLS ARRAY_PARTITION variable=data_diff complete -// #pragma HLS ARRAY_PARTITION variable=data_diff complete -// -// const data_T k_inv = 1.0/dim; -// for (int i = 0; i < dim; ++i){ -// sum_cache += data[i]; -// } -//// mean = CONFIG_T::template product::product(sum_cache, k_inv); -//// std::cout << "mean: " << std::endl; -//// std::cout << mean << std::endl; -// -// for (int i = 0; i < dim; ++i){ -// data_diff[i] = data[i] - mean; -// diff = data_diff[i]*data_diff[i]; -// sum_cache2 += diff; -//// std::cout << "data_diff: " << std::endl; -//// std::cout << data_diff[i] << std::endl; -//// std::cout << " " << std::endl; -// } -// var = CONFIG_T::template product::product(sum_cache2, k_inv); -//// std::cout << "var: " << std::endl; -//// 
std::cout << var << std::endl; -//// std::cout << " " << std::endl; -// -// int index = var*(CONFIG_T::table_size)*inv_range_inv; -// if (CONFIG_T::table_range > 1) index = var*(CONFIG_T::table_size)/ (int)CONFIG_T::table_range; -// -// if (index < 0) index = 0; -// if (index > CONFIG_T::table_size-1) index = CONFIG_T::table_size-1; -// deno_inver = (typename CONFIG_T::table_t) invert_sqr_table[index]; -//// std::cout << "deno_inver: " << std::endl; -//// std::cout << deno_inver << std::endl; -//// std::cout << " " << std::endl; -// -//// std::cout << "index: " << std::endl; -//// std::cout << index << std::endl; -//// std::cout << " " << std::endl; -// -// for (int i = 0; i < dim; ++i){ -// res[i] = data_diff[i] * deno_inver * scale[i] + bias[i]; -// } -// -//} - -// template -// void layernorm_1d( -// data_T data[CONFIG_T::n_in/CONFIG_T::seq_len], -// res_T res[CONFIG_T::n_in/CONFIG_T::seq_len], -// typename CONFIG_T::scale_t scale[CONFIG_T::n_in/CONFIG_T::seq_len], -// typename CONFIG_T::bias_t bias[CONFIG_T::n_in/CONFIG_T::seq_len] -// ) -// { -// #pragma HLS PIPELINE -// #pragma HLS ARRAY_PARTITION variable=data complete -// #pragma HLS ARRAY_PARTITION variable=res complete - -// static const unsigned dim = CONFIG_T::n_in/CONFIG_T::seq_len; -// data_T sum_cache = 0; -// data_T sum_cache2 = 0; -// data_T var, mean, diff_squares, diff, var_eps_inv; -// data_T data_diff[dim]; -// float sqrt_var_eps; - -// #pragma HLS ARRAY_PARTITION variable=data_diff complete - -// const data_T k_inv = 1.0/dim; -// for (int i = 0; i < dim; ++i){ -// sum_cache += data[i]; -// } -// mean = CONFIG_T::template product::product(sum_cache, k_inv); -// // std::cout << "mean: " << std::endl; -// // std::cout << mean << std::endl; - -// for (int i = 0; i < dim; ++i){ -// diff = data[i] - mean; -// data_diff[i] = diff; -// diff_squares = diff*diff; -// sum_cache2 += diff_squares; -// // std::cout << "data_diff: " << std::endl; -// // std::cout << data_diff[i] << std::endl; -// // std::cout << " " << std::endl; -// } -// var = CONFIG_T::template product::product(sum_cache2, k_inv); -// float var_f = (float)var; -// // std::cout << "var: "; -// // std::cout << var << std::endl; - -// sqrt_var_eps = sqrt(var_f); -// var_eps_inv = (data_T) (1 / (sqrt_var_eps)); -// // std::cout << "var_eps_inv: " << std::endl; -// // std::cout << var_eps_inv << std::endl; -// // std::cout << " " << std::endl; - -// for (int i = 0; i < dim; ++i){ -// res[i] = data_diff[i] * var_eps_inv * scale[i] + bias[i]; -// } - -// } template void layernormalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], @@ -370,15 +111,11 @@ void layernormalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases #pragma HLS function_instantiate variable=scale,bias - // #pragma HLS ARRAY_PARTITION variable=weights complete // remove this line for now, it breaks compression sometimes #pragma HLS ARRAY_PARTITION variable=scale complete #pragma HLS ARRAY_PARTITION variable=bias complete #pragma HLS ARRAY_PARTITION variable=in_val complete #pragma HLS ARRAY_PARTITION variable=outval complete - // std::cout << "one seq norm layer: " << std::endl; - // std::cout << " " << std::endl; - for (int j = 0; j < CONFIG_T::seq_len; ++j) { #pragma HLS PIPELINE load: @@ -393,10 +130,6 @@ void layernormalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], res[j * dim + i] = outval[i]; } } - - // std::cout << "out Dense: " << std::endl; - // 
nnet::print_result(res, std::cout); - // std::cout << " " << std::endl; } } // namespace nnet diff --git a/test/pytest/test_layernorm.py b/test/pytest/test_layernorm.py new file mode 100644 index 0000000000..cd6d86b57c --- /dev/null +++ b/test/pytest/test_layernorm.py @@ -0,0 +1,47 @@ +from pathlib import Path + +import numpy as np +import pytest +import tensorflow as tf +from tensorflow.keras.layers import Input, LayerNormalization +from tensorflow.keras.models import Sequential + +import hls4ml + +test_root_path = Path(__file__).parent + +in_shape = (4, 5) +atol = 5e-3 + + +@pytest.fixture(scope='module') +def data(): + np.random.seed(0) + X = np.random.rand(1, *in_shape) + return X + + +@pytest.fixture(scope='module') +def model(): + model = Sequential() + model.add(Input(shape=(*in_shape,))) + model.add(LayerNormalization(dtype=tf.float32)) + model.compile() + model.layers[0].set_weights([np.ones((in_shape[1],)), np.zeros((in_shape[1],))]) + return model + +# Currently only Vivado in io_parallel mode is supported +def test_layernorm(model, data): + config = hls4ml.utils.config_from_keras_model( + model, granularity='name', backend='Vivado' + ) + output_dir = str(test_root_path / f'hls4mlprj_layernorm_Vivado_io_parallel') + hls_model = hls4ml.converters.convert_from_keras_model( + model, backend='Vivado', hls_config=config, io_type='io_parallel', output_dir=output_dir + ) + hls_model.compile() + + # Predict + y_keras = model.predict(data).flatten() + y_hls = hls_model.predict(data) + np.testing.assert_allclose(y_keras, y_hls, rtol=0, atol=atol, verbose=True) From 1156ba544358142267a5e4ded6e0a22c83366ff4 Mon Sep 17 00:00:00 2001 From: Rian Flynn Date: Thu, 19 Sep 2024 00:02:17 +0800 Subject: [PATCH 40/55] layernorm is now pretty functional --- .../backends/vivado/passes/core_templates.py | 1 + hls4ml/converters/keras/core.py | 4 ++ hls4ml/model/layers.py | 5 +- .../vivado/nnet_utils/nnet_layernorm.h | 67 +++++++++++++------ test/pytest/test_layernorm.py | 6 +- 5 files changed, 57 insertions(+), 26 deletions(-) diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index c6f39efedb..68ef21b882 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -126,6 +126,7 @@ def format(self, node): static const unsigned io_type = nnet::{iotype}; static const unsigned reuse_factor = {reuse}; static const bool store_weights_in_bram = false; + static constexpr double epsilon = {epsilon}; typedef {bias_t.name} bias_t; typedef {scale_t.name} scale_t; typedef {table_t.name} table_t; diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index 9705a2da66..3a55f70231 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -150,6 +150,10 @@ def parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader): layer['gamma_data'] = get_weights_data(data_reader, layer['name'], 'gamma') layer['beta_data'] = get_weights_data(data_reader, layer['name'], 'beta') + layer['epsilon'] = keras_layer['config']['epsilon'] + if layer['epsilon'] <= 0: + raise Exception('epsilon must be positive') + return layer, [shape for shape in input_shapes[0]] diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 9c8ed954ba..5308129748 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -932,6 +932,7 @@ class LayerNormalization(Layer): _expected_attributes = [ Attribute('n_in'), Attribute('seq_len'), + Attribute('epsilon', 
value_type=float, default=1e-3), WeightAttribute('scale'), WeightAttribute('bias'), TypeAttribute('scale'), @@ -947,8 +948,8 @@ def initialize(self): scale = self.get_attr('gamma_data') bias = self.get_attr('beta_data') - scale_precision = self.get_attr('scale_t', default=FixedPrecisionType(width=32, integer=4, signed=True)) - bias_precision = self.get_attr('bias_t', default=FixedPrecisionType(width=32, integer=4, signed=True)) + scale_precision = self.get_attr('scale_t', default=FixedPrecisionType(width=16, integer=6, signed=True)) + bias_precision = self.get_attr('bias_t', default=FixedPrecisionType(width=16, integer=6, signed=True)) self.add_weights_variable(name='scale', var_name='s{index}', precision=scale_precision, data=scale) self.add_weights_variable(name='bias', var_name='b{index}', precision=bias_precision, data=bias) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h index 3417500a05..fe142c6ce3 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h @@ -14,7 +14,7 @@ struct layernorm_config { // Internal data type definitions typedef float bias_t; typedef float scale_t; - typedef ap_fixed<16, 8> mean_t; + typedef ap_fixed<24, 6> mean_t; // Layer Sizes static const unsigned n_in = 20; @@ -29,21 +29,52 @@ struct layernorm_config { template using product = nnet::product::mult; }; -template void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) { - float inv_range = CONFIG_T::table_range; +template void init_invert_sqr_table( + typename CONFIG_T::table_t table_in[N_TABLE], + typename CONFIG_T::table_t table_out[N_TABLE]) { // Inversion function: // result = 1/sqrt(x) + // Use log spacing to get more precision at lower values + float log_min = log(CONFIG_T::epsilon); + float log_max = log(CONFIG_T::table_range); + float log_step = (log_max - log_min) / (float)(N_TABLE - 1); + float log_val = log_min; for (int ii = 0; ii < N_TABLE; ii++) { - // First, convert from table index to X-value (signed 8-bit, range 0 to +0.01) - float in_val = inv_range * ii / float(N_TABLE); - // Next, compute lookup table function - if (in_val > 0.0) - table_out[ii] = 1.0 / sqrt(in_val); - else - table_out[ii] = 0.0; + float in_val = exp(log_val); + table_in[ii] = (typename CONFIG_T::table_t)in_val; + table_out[ii] = (typename CONFIG_T::table_t)(1.0 / sqrt(in_val)); + log_val += log_step; } } +template void lookup_invert_sqr( + typename CONFIG_T::mean_t x, + typename CONFIG_T::table_t &res, + typename CONFIG_T::table_t table_in[CONFIG_T::table_size], + typename CONFIG_T::table_t table_out[CONFIG_T::table_size]) { + if (x <= table_in[0]) { + res = table_out[0]; + return; + } else if (x >= table_in[CONFIG_T::table_size - 1]) { + res = table_out[CONFIG_T::table_size - 1]; + return; + } + + // Binary search + int low = 0; + int high = CONFIG_T::table_size - 1; + while (high - low > 1) { + int mid = (low + high) / 2; + if (x > table_in[mid]) { + low = mid; + } else { + high = mid; + } + } + + res = table_out[low]; +} + template void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CONFIG_T::n_in / CONFIG_T::seq_len], typename CONFIG_T::scale_t scale[CONFIG_T::n_in / CONFIG_T::seq_len], @@ -56,12 +87,14 @@ void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CON #ifdef __HLS_SYN__ bool initialized = false; typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size]; + typename CONFIG_T::table_t 
index_table[CONFIG_T::table_size]; #else static bool initialized = false; static typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size]; + static typename CONFIG_T::table_t index_table[CONFIG_T::table_size]; #endif if (!initialized) { - init_invert_sqr_table(invert_sqr_table); + init_invert_sqr_table(index_table, invert_sqr_table); initialized = true; } @@ -70,6 +103,7 @@ void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CON typename CONFIG_T::mean_t sum_cache2 = 0; typename CONFIG_T::mean_t var, mean, diff; typename CONFIG_T::mean_t data_diff[dim]; + typename CONFIG_T::mean_t var_epsilon = (typename CONFIG_T::mean_t)CONFIG_T::epsilon; #pragma HLS ARRAY_PARTITION variable=data_diff complete @@ -85,16 +119,7 @@ void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CON sum_cache2 += diff; } var = CONFIG_T::template product::product(sum_cache2, k_inv); - - int index = var * (CONFIG_T::table_size) * inv_range_inv; - if (CONFIG_T::table_range > 1) - index = var * (CONFIG_T::table_size) / (int)CONFIG_T::table_range; - - if (index < 1) // Avoid division by zero - index = 1; - if (index > CONFIG_T::table_size - 1) - index = CONFIG_T::table_size - 1; - deno_inver = (typename CONFIG_T::table_t)invert_sqr_table[index]; + lookup_invert_sqr(var + var_epsilon, deno_inver, index_table, invert_sqr_table); for (int i = 0; i < dim; ++i) { res[i] = data_diff[i] * deno_inver * scale[i] + bias[i]; diff --git a/test/pytest/test_layernorm.py b/test/pytest/test_layernorm.py index cd6d86b57c..1deebe5135 100644 --- a/test/pytest/test_layernorm.py +++ b/test/pytest/test_layernorm.py @@ -11,13 +11,13 @@ test_root_path = Path(__file__).parent in_shape = (4, 5) -atol = 5e-3 +atol = 1e-2 @pytest.fixture(scope='module') def data(): np.random.seed(0) - X = np.random.rand(1, *in_shape) + X = np.random.rand(100, *in_shape) return X @@ -43,5 +43,5 @@ def test_layernorm(model, data): # Predict y_keras = model.predict(data).flatten() - y_hls = hls_model.predict(data) + y_hls = hls_model.predict(data).flatten() np.testing.assert_allclose(y_keras, y_hls, rtol=0, atol=atol, verbose=True) From 17e0048fcaaf03cd59d2618c61b86873962282b1 Mon Sep 17 00:00:00 2001 From: Rian Flynn Date: Thu, 19 Sep 2024 15:47:54 +0800 Subject: [PATCH 41/55] layernorm on pytorch also --- hls4ml/converters/keras/core.py | 15 ++---- hls4ml/converters/pytorch/core.py | 29 ++++++++++++ .../passes/convert_to_channels_last.py | 27 ++++++++++- test/pytest/test_layernorm.py | 4 +- test/pytest/test_layernorm_pytorch.py | 46 +++++++++++++++++++ 5 files changed, 107 insertions(+), 14 deletions(-) create mode 100644 test/pytest/test_layernorm_pytorch.py diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index 3a55f70231..4ccf119cb3 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -133,19 +133,14 @@ def parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader): in_size = 1 for dim in input_shapes[0][1:]: in_size *= dim - - layer['axis'] = keras_layer['config']['axis'] if (keras_layer['config']['axis'][0] == 2) else False - if layer['axis'] is False: - raise Exception('assigning the axis is not currently supported by hls4ml, only axis 2 is supported') + layer['n_in'] = layer['n_out'] = in_size if not ((len(input_shapes[0])) == 3): raise Exception('input size is not currently supported by hls4ml, only dim3 is supported') - if len(input_shapes[0]) == 3: - layer['seq_len'] = input_shapes[0][-2] - else: - layer['seq_len'] = 1 - 
layer['n_in'] = in_size - layer['n_out'] = layer['n_in'] + layer['seq_len'] = input_shapes[0][-2] + + if not (keras_layer['config']['axis'][0] == 2): + raise Exception('assigning the axis is not currently supported by hls4ml, only axis 2 is supported') layer['gamma_data'] = get_weights_data(data_reader, layer['name'], 'gamma') layer['beta_data'] = get_weights_data(data_reader, layer['name'], 'beta') diff --git a/hls4ml/converters/pytorch/core.py b/hls4ml/converters/pytorch/core.py index 89f330ca7f..d0f415d643 100644 --- a/hls4ml/converters/pytorch/core.py +++ b/hls4ml/converters/pytorch/core.py @@ -133,3 +133,32 @@ def parse_batchnorm_layer(operation, layer_name, input_names, input_shapes, node layer['n_filt'] = input_shapes[0][1] # Always channel first for Pytorch return layer, [shape for shape in input_shapes[0]] + + +@pytorch_handler('LayerNorm') +def parse_layernorm_layer(operation, layer_name, input_names, input_shapes, node, class_object, data_reader, config): + assert 'LayerNorm' in operation + + layer = {} + + layer['class_name'] = 'LayerNormalization' + layer['name'] = layer_name + layer['inputs'] = input_names + + in_size = 1 + for dim in input_shapes[0][1:]: + in_size *= dim + layer['n_in'] = layer['n_out'] = in_size + + if not ((len(input_shapes[0])) == 3): + raise Exception('input size is not currently supported by hls4ml, only dim3 is supported') + layer['seq_len'] = input_shapes[0][-2] + + layer['gamma_data'] = class_object.weight.data.numpy() + layer['beta_data'] = class_object.bias.data.numpy() + + layer['epsilon'] = class_object.eps + if layer['epsilon'] <= 0: + raise Exception('epsilon must be positive') + + return layer, [shape for shape in input_shapes[0]] diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index a3b861ddfe..cbd9cf2a44 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -2,7 +2,7 @@ # Based on https://github.com/fastmachinelearning/qonnx/blob/ # 12c96a3ded06beacab08e0f554e4ed014476c0aa/src/qonnx/transformation/channels_last.py -from hls4ml.model.layers import Concatenate, Dense, Input, Reshape, Transpose +from hls4ml.model.layers import Concatenate, Dense, Input, LayerNormalization, Reshape, Transpose from hls4ml.model.optimizer import OptimizerPass from hls4ml.model.types import WeightVariable @@ -45,6 +45,31 @@ def transform(self, model, node): node.get_output_variable().shape = input_shape dim_names = [f'N_INPUT_{i}_{node.index}' for i in range(1, len(input_shape) + 1)] node.get_output_variable().dim_names = dim_names + elif isinstance(node, LayerNormalization): + # LayerNorm only works on the last dimension in PyTorch + perm = [1, 0] + pre_transpose = model.make_node( + 'Transpose', + f'pre_transpose_for_{node.get_attr('name')}', + {'perm': perm}, + [node.get_input_node().name] + ) + pre_transpose.channels_last_converted = True + model.insert_node(pre_transpose) + + # If not the output layer, transpose again + if not ( + node.get_attr('name') in model.outputs + and model.config.config['HLSConfig']['Model']['TransposeOutputs'] + ): + post_transpose = model.make_node( + 'Transpose', + f'post_transpose_for_{node.get_attr('name')}', + {'perm': perm}, + [node.name] + ) + post_transpose.channels_last_converted = True + model.insert_node(post_transpose) else: # Transpose weight tensors tensors = ['weight', 'depthwise', 'pointwise', 'zero_bias', 'scale', 'recurrent_weight'] diff --git 
a/test/pytest/test_layernorm.py b/test/pytest/test_layernorm.py index 1deebe5135..820338a533 100644 --- a/test/pytest/test_layernorm.py +++ b/test/pytest/test_layernorm.py @@ -24,10 +24,8 @@ def data(): @pytest.fixture(scope='module') def model(): model = Sequential() - model.add(Input(shape=(*in_shape,))) - model.add(LayerNormalization(dtype=tf.float32)) + model.add(LayerNormalization(input_shape=in_shape)) model.compile() - model.layers[0].set_weights([np.ones((in_shape[1],)), np.zeros((in_shape[1],))]) return model # Currently only Vivado in io_parallel mode is supported diff --git a/test/pytest/test_layernorm_pytorch.py b/test/pytest/test_layernorm_pytorch.py new file mode 100644 index 0000000000..471b2fb99e --- /dev/null +++ b/test/pytest/test_layernorm_pytorch.py @@ -0,0 +1,46 @@ +from pathlib import Path + +import numpy as np +import pytest +import torch +from torch import nn + +import hls4ml + +test_root_path = Path(__file__).parent + +in_shape = (4, 5) +atol = 1e-2 + + +@pytest.fixture(scope='module') +def data(): + np.random.seed(0) + X = np.random.rand(100, *in_shape) + print("X: ", X) + return X + + +@pytest.fixture(scope='module') +def model(): + model = nn.Sequential( + nn.LayerNorm(in_shape[-1]), + ).to() + model.eval() + return model + +# Currently only Vivado in io_parallel mode is supported +def test_layernorm(model, data): + config = hls4ml.utils.config_from_pytorch_model( + model, in_shape, granularity='name', backend='Vivado' + ) + output_dir = str(test_root_path / f'hls4mlprj_layernorm_pytorch_Vivado_io_parallel') + hls_model = hls4ml.converters.convert_from_pytorch_model( + model, backend='Vivado', hls_config=config, io_type='io_parallel', output_dir=output_dir + ) + hls_model.compile() + + # Predict + y_pytorch = model(torch.Tensor(data)).detach().numpy().flatten() + y_hls = hls_model.predict(data).flatten() + np.testing.assert_allclose(y_pytorch, y_hls, rtol=0, atol=atol, verbose=True) From 63891fddd751bd0fabc31626f513e735d8ca4add Mon Sep 17 00:00:00 2001 From: Rian Flynn Date: Thu, 19 Sep 2024 15:56:19 +0800 Subject: [PATCH 42/55] minor cleanup --- hls4ml/model/optimizer/passes/convert_to_channels_last.py | 4 ++-- test/pytest/test_layernorm_pytorch.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index cbd9cf2a44..ad59f56245 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -50,7 +50,7 @@ def transform(self, model, node): perm = [1, 0] pre_transpose = model.make_node( 'Transpose', - f'pre_transpose_for_{node.get_attr('name')}', + f'pre_transpose_for_{node.get_attr("name")}', {'perm': perm}, [node.get_input_node().name] ) @@ -64,7 +64,7 @@ def transform(self, model, node): ): post_transpose = model.make_node( 'Transpose', - f'post_transpose_for_{node.get_attr('name')}', + f'post_transpose_for_{node.get_attr("name")}', {'perm': perm}, [node.name] ) diff --git a/test/pytest/test_layernorm_pytorch.py b/test/pytest/test_layernorm_pytorch.py index 471b2fb99e..24bbc6c32a 100644 --- a/test/pytest/test_layernorm_pytorch.py +++ b/test/pytest/test_layernorm_pytorch.py @@ -17,7 +17,6 @@ def data(): np.random.seed(0) X = np.random.rand(100, *in_shape) - print("X: ", X) return X From 8dccac6ff7b3393f76f1317d6b893abc123d93ef Mon Sep 17 00:00:00 2001 From: Rian Flynn Date: Thu, 19 Sep 2024 16:36:47 +0800 Subject: [PATCH 43/55] more cleanup, pre-commit --- 
hls4ml/backends/vivado/passes/resource_strategy.py | 14 ++------------ hls4ml/converters/pytorch/core.py | 2 +- .../optimizer/passes/convert_to_channels_last.py | 13 +++---------- .../templates/vivado/nnet_utils/nnet_layernorm.h | 14 ++++++-------- test/pytest/test_layernorm.py | 10 ++++------ test/pytest/test_layernorm_pytorch.py | 7 +++---- 6 files changed, 19 insertions(+), 41 deletions(-) diff --git a/hls4ml/backends/vivado/passes/resource_strategy.py b/hls4ml/backends/vivado/passes/resource_strategy.py index 4ed028fa32..63e6e0b4db 100644 --- a/hls4ml/backends/vivado/passes/resource_strategy.py +++ b/hls4ml/backends/vivado/passes/resource_strategy.py @@ -1,6 +1,6 @@ import numpy as np -from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, MultiHeadAttention, SeparableConv1D, SeparableConv2D +from hls4ml.model.layers import GRU, LSTM, Conv1D, Conv2D, Dense, SeparableConv1D, SeparableConv2D from hls4ml.model.optimizer import OptimizerPass @@ -8,10 +8,7 @@ class ApplyResourceStrategy(OptimizerPass): '''Transposes the weights to use the dense_resource matrix multiply routine''' def match(self, node): - - node_matches = isinstance( - node, (Dense, Conv1D, SeparableConv1D, Conv2D, SeparableConv2D, LSTM, GRU, MultiHeadAttention) - ) + node_matches = isinstance(node, (Dense, Conv1D, SeparableConv1D, Conv2D, SeparableConv2D, LSTM, GRU)) is_resource_strategy = node.get_attr('strategy', '').lower() == 'resource' already_transformed = node.get_attr('_weights_transposed', False) is True @@ -43,13 +40,6 @@ def transform(self, model, node): elif isinstance(node, (LSTM, GRU)): node.weights['weight'].data = np.transpose(node.weights['weight'].data) node.weights['recurrent_weight'].data = np.transpose(node.weights['recurrent_weight'].data) - elif isinstance(node, (MultiHeadAttention)): - # node.weights['key_weight'].data = np.transpose(node.weights['key_weight'].data, axes=[0, 2, 1]) - # node.weights['query_weight'].data = np.transpose(node.weights['query_weight'].data, axes=[0, 2, 1]) - # node.weights['value_weight'].data = np.transpose(node.weights['value_weight'].data, axes=[0, 2, 1]) - # node.weights['attention_output_weight'].data = - # np.transpose(node.weights['attention_output_weight'].data, axes=[2, 0, 1]) - print("not transpose") else: raise Exception(f'Unexpected layer {node.class_name} with resource strategy') diff --git a/hls4ml/converters/pytorch/core.py b/hls4ml/converters/pytorch/core.py index d0f415d643..2eaa7a76f7 100644 --- a/hls4ml/converters/pytorch/core.py +++ b/hls4ml/converters/pytorch/core.py @@ -156,7 +156,7 @@ def parse_layernorm_layer(operation, layer_name, input_names, input_shapes, node layer['gamma_data'] = class_object.weight.data.numpy() layer['beta_data'] = class_object.bias.data.numpy() - + layer['epsilon'] = class_object.eps if layer['epsilon'] <= 0: raise Exception('epsilon must be positive') diff --git a/hls4ml/model/optimizer/passes/convert_to_channels_last.py b/hls4ml/model/optimizer/passes/convert_to_channels_last.py index ad59f56245..ecfa636d42 100644 --- a/hls4ml/model/optimizer/passes/convert_to_channels_last.py +++ b/hls4ml/model/optimizer/passes/convert_to_channels_last.py @@ -49,24 +49,17 @@ def transform(self, model, node): # LayerNorm only works on the last dimension in PyTorch perm = [1, 0] pre_transpose = model.make_node( - 'Transpose', - f'pre_transpose_for_{node.get_attr("name")}', - {'perm': perm}, - [node.get_input_node().name] + 'Transpose', f'pre_transpose_for_{node.get_attr("name")}', {'perm': perm}, [node.get_input_node().name] ) 
pre_transpose.channels_last_converted = True model.insert_node(pre_transpose) # If not the output layer, transpose again if not ( - node.get_attr('name') in model.outputs - and model.config.config['HLSConfig']['Model']['TransposeOutputs'] + node.get_attr('name') in model.outputs and model.config.config['HLSConfig']['Model']['TransposeOutputs'] ): post_transpose = model.make_node( - 'Transpose', - f'post_transpose_for_{node.get_attr("name")}', - {'perm': perm}, - [node.name] + 'Transpose', f'post_transpose_for_{node.get_attr("name")}', {'perm': perm}, [node.name] ) post_transpose.channels_last_converted = True model.insert_node(post_transpose) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h index fe142c6ce3..7471ed2a74 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h @@ -29,9 +29,8 @@ struct layernorm_config { template using product = nnet::product::mult; }; -template void init_invert_sqr_table( - typename CONFIG_T::table_t table_in[N_TABLE], - typename CONFIG_T::table_t table_out[N_TABLE]) { +template +void init_invert_sqr_table(typename CONFIG_T::table_t table_in[N_TABLE], typename CONFIG_T::table_t table_out[N_TABLE]) { // Inversion function: // result = 1/sqrt(x) // Use log spacing to get more precision at lower values @@ -47,11 +46,10 @@ template void init_invert_sqr_table( } } -template void lookup_invert_sqr( - typename CONFIG_T::mean_t x, - typename CONFIG_T::table_t &res, - typename CONFIG_T::table_t table_in[CONFIG_T::table_size], - typename CONFIG_T::table_t table_out[CONFIG_T::table_size]) { +template +void lookup_invert_sqr(typename CONFIG_T::mean_t x, typename CONFIG_T::table_t &res, + typename CONFIG_T::table_t table_in[CONFIG_T::table_size], + typename CONFIG_T::table_t table_out[CONFIG_T::table_size]) { if (x <= table_in[0]) { res = table_out[0]; return; diff --git a/test/pytest/test_layernorm.py b/test/pytest/test_layernorm.py index 820338a533..7b08df9369 100644 --- a/test/pytest/test_layernorm.py +++ b/test/pytest/test_layernorm.py @@ -2,8 +2,7 @@ import numpy as np import pytest -import tensorflow as tf -from tensorflow.keras.layers import Input, LayerNormalization +from tensorflow.keras.layers import LayerNormalization from tensorflow.keras.models import Sequential import hls4ml @@ -28,12 +27,11 @@ def model(): model.compile() return model + # Currently only Vivado in io_parallel mode is supported def test_layernorm(model, data): - config = hls4ml.utils.config_from_keras_model( - model, granularity='name', backend='Vivado' - ) - output_dir = str(test_root_path / f'hls4mlprj_layernorm_Vivado_io_parallel') + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Vivado') + output_dir = str(test_root_path / 'hls4mlprj_layernorm_Vivado_io_parallel') hls_model = hls4ml.converters.convert_from_keras_model( model, backend='Vivado', hls_config=config, io_type='io_parallel', output_dir=output_dir ) diff --git a/test/pytest/test_layernorm_pytorch.py b/test/pytest/test_layernorm_pytorch.py index 24bbc6c32a..a38066ec8c 100644 --- a/test/pytest/test_layernorm_pytorch.py +++ b/test/pytest/test_layernorm_pytorch.py @@ -28,12 +28,11 @@ def model(): model.eval() return model + # Currently only Vivado in io_parallel mode is supported def test_layernorm(model, data): - config = hls4ml.utils.config_from_pytorch_model( - model, in_shape, granularity='name', backend='Vivado' - ) - output_dir = str(test_root_path / 
f'hls4mlprj_layernorm_pytorch_Vivado_io_parallel')
+    config = hls4ml.utils.config_from_pytorch_model(model, in_shape, granularity='name', backend='Vivado')
+    output_dir = str(test_root_path / 'hls4mlprj_layernorm_pytorch_Vivado_io_parallel')
     hls_model = hls4ml.converters.convert_from_pytorch_model(
         model, backend='Vivado', hls_config=config, io_type='io_parallel', output_dir=output_dir
     )

From 595cc71731038894fa13165d7ee72742d02e8542 Mon Sep 17 00:00:00 2001
From: Rian Flynn
Date: Fri, 20 Sep 2024 00:14:44 +0800
Subject: [PATCH 44/55] add initial test for MultiHeadAttention (accuracy is still rough)

---
 hls4ml/backends/fpga/fpga_backend.py | 2 +
 .../vivado/passes/transformer_templates.py | 1 -
 hls4ml/backends/vivado/vivado_backend.py | 8 +-
 hls4ml/converters/keras/multiheadattention.py | 36 +++++--
 hls4ml/converters/keras/qkeras.py | 1 -
 hls4ml/converters/keras_to_hls.py | 5 +-
 hls4ml/model/graph.py | 4 +-
 hls4ml/model/layers.py | 45 ++++-----
 .../model/optimizer/passes/infer_precision.py | 60 +++++++++++-
 .../vivado/nnet_utils/nnet_activation.h | 37 ++++----
 .../templates/vivado/nnet_utils/nnet_dense.h | 6 --
 .../vivado/nnet_utils/nnet_layernorm.h | 2 +-
 .../nnet_utils/nnet_multiheadattention.h | 93 ++++++------------
 test/pytest/test_layernorm.py | 3 +-
 test/pytest/test_layernorm_pytorch.py | 3 +-
 test/pytest/test_multiheadattention.py | 55 +++++++++++
 16 files changed, 214 insertions(+), 147 deletions(-)
 create mode 100644 test/pytest/test_multiheadattention.py

diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py
index 479af8ebf3..e541e11ee4 100644
--- a/hls4ml/backends/fpga/fpga_backend.py
+++ b/hls4ml/backends/fpga/fpga_backend.py
@@ -22,6 +22,7 @@
     GarNetStack,
     GlobalPooling1D,
     GlobalPooling2D,
+    MultiHeadAttention,
     Pooling1D,
     Pooling2D,
     SeparableConv1D,
@@ -63,6 +64,7 @@ def __init__(self, name):
             LSTM,
             GRU,
             Dot,
+            MultiHeadAttention,
         ]

         for layer in accum_layers:
diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py
index f1f4918cca..fafc10859e 100644
--- a/hls4ml/backends/vivado/passes/transformer_templates.py
+++ b/hls4ml/backends/vivado/passes/transformer_templates.py
@@ -70,7 +70,6 @@ def __init__(self):
         self.activ1_template = softmax_config_template

     def format(self, node):
-
         params = self._default_config_params(node)
         params['num_heads'] = node.get_attr('num_heads')
         params['head_dim_key'] = node.get_attr('head_dim_key')
diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py
index a680dce8cb..da5fc158fb 100644
--- a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -506,13 +506,9 @@ def init_softmax(self, layer):
         if 'exp_range' not in layer.attributes:
             layer.set_attr('exp_range', 8)
         if layer.model.config.is_resource_strategy(layer):
-            # 'resource' strategy = 'latency' for Softmax
-            # layer.set_attr('implementation', 'latency')
-            layer.set_attr('implementation', 'legacy')  # latency legacy stable
-
+            layer.set_attr('implementation', 'latency')
         else:
-            # layer.set_attr('implementation', layer.model.config.get_strategy(layer).lower())
-            layer.set_attr('implementation', 'legacy')  # latency legacy stable
+            layer.set_attr('implementation', layer.model.config.get_strategy(layer).lower())

         if layer.model.config.get_config_value('IOType') == 'io_parallel':
             assert (
diff --git a/hls4ml/converters/keras/multiheadattention.py b/hls4ml/converters/keras/multiheadattention.py
index 6af8114006..c295236561 100644
---
a/hls4ml/converters/keras/multiheadattention.py +++ b/hls4ml/converters/keras/multiheadattention.py @@ -1,8 +1,8 @@ -from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer +from hls4ml.converters.keras_to_hls import get_weights_data, keras_handler, parse_default_keras_layer @keras_handler('MultiHeadAttention') -def parse_mutiheadattention_layer(keras_layer, input_names, input_shapes, data_reader, config): +def parse_mutiheadattention_layer(keras_layer, input_names, input_shapes, data_reader): # assume input_shapes is: [[None, seq, dim]] assert 'MultiHeadAttention' in keras_layer['class_name'] assert input_shapes[0] == keras_layer['config']['query_shape'] @@ -19,22 +19,42 @@ def parse_mutiheadattention_layer(keras_layer, input_names, input_shapes, data_r layer['seq_len'] = layer['query_shape'][-2] if keras_layer['config']['output_shape']: - # output_shape = keras_layer['config']['output_shape'] - # output_shape = (layer['query_shape'][:2]).extend(out_shape) raise Exception('hls4ml does not support a defined output shape, the output shape must equal to the query shape') - else: # by default output shape in config is False, and thus select the output shape equal query shape + else: output_shape = layer['query_shape'] layer['attention_axes'] = ( keras_layer['config']['attention_axes'] if (keras_layer['config']['attention_axes'][0] == 1) else False ) if layer['attention_axes'] is False: - raise Exception('assigning the attention_axe is not currently supported by hls4ml') + raise Exception('assigning the attention_axes is not currently supported by hls4ml') - if not ((len(layer['query_shape'])) == 3 and (len(layer['query_shape'])) == 3 and (len(layer['query_shape'])) == 3): - raise Exception('muti-dimension of feature dim is not currently supported by hls4ml') + if not (len(layer['query_shape']) == 3 and len(layer['key_shape']) == 3 and len(layer['value_shape']) == 3): + raise Exception('only 3D shapes for query, key, and value are currently supported by hls4ml') attn_scores_rank = 4 layer['softmax_axis'] = list(range(attn_scores_rank - len(layer['attention_axes']), attn_scores_rank)) + weights_sources = [ + ('attention_output', 'kernel'), + ('attention_output', 'bias'), + ('key', 'kernel'), + ('key', 'bias'), + ('query', 'kernel'), + ('query', 'bias'), + ('value', 'kernel'), + ('value', 'bias'), + ] + + for lname, wtype in weights_sources: + data = get_weights_data(data_reader, layer['name'], f'{lname}/{wtype}') + if wtype == 'kernel': + vtype = 'weight' + if lname in ['key', 'query', 'value']: + data = data.transpose((1, 0, 2)) + else: + vtype = 'bias' + + layer[f'{lname}_{vtype}_data'] = data + return layer, output_shape diff --git a/hls4ml/converters/keras/qkeras.py b/hls4ml/converters/keras/qkeras.py index d101f7972f..f184b8d540 100644 --- a/hls4ml/converters/keras/qkeras.py +++ b/hls4ml/converters/keras/qkeras.py @@ -160,7 +160,6 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader) layer['class_name'] = 'Activation' layer['activation'] = activation_config['class_name'].replace('quantized_', '') if activation_config['class_name'] == 'quantized_softmax': - # activation_config['class_name'] = 'softmax' layer['class_name'] = 'Softmax' layer['axis'] = keras_layer['config'].get('axis', -1) layer['activation_quantizer'] = activation_config diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index b652b5b724..cde9d912ee 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ 
-284,9 +284,8 @@ def parse_keras_model(model_arch, reader): # Extract inbound nodes if 'inbound_nodes' in keras_layer and len(keras_layer['inbound_nodes']) > 0: input_names = [inputs_map.get(inp[0], inp[0]) for inp in keras_layer['inbound_nodes'][0]] - if keras_layer['inbound_nodes'][0][0][ - -1 - ]: # multi_head_attention has inbound: [[['input_3', 0, 0, {'value': ['dense_3', 0, 0]}]]] + if keras_layer['inbound_nodes'][0][0][-1]: + # multi_head_attention has inbound: [[['input_3', 0, 0, {'value': ['dense_3', 0, 0]}]]] inputname2 = list(keras_layer['inbound_nodes'][0][0][-1].values()) input_names += [inp[0] for inp in inputname2] else: diff --git a/hls4ml/model/graph.py b/hls4ml/model/graph.py index b8d84228b3..d29d9afa02 100644 --- a/hls4ml/model/graph.py +++ b/hls4ml/model/graph.py @@ -588,9 +588,7 @@ def remove_node(self, node, rewire=True): next_node.inputs[i] = prev_node.outputs[0] break else: - # if not node.outputs[0] in self.outputs: - # would this be the key output_vars? because the self.outputs is the model final output - if not node.outputs[0] in self.output_vars.keys(): # my change + if not node.outputs[0] in self.output_vars.keys(): raise Exception('Cannot rewire a node without child') else: raise Exception('Cannot rewire a node without a parent') diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 5308129748..77678c9ecc 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -948,11 +948,8 @@ def initialize(self): scale = self.get_attr('gamma_data') bias = self.get_attr('beta_data') - scale_precision = self.get_attr('scale_t', default=FixedPrecisionType(width=16, integer=6, signed=True)) - bias_precision = self.get_attr('bias_t', default=FixedPrecisionType(width=16, integer=6, signed=True)) - - self.add_weights_variable(name='scale', var_name='s{index}', precision=scale_precision, data=scale) - self.add_weights_variable(name='bias', var_name='b{index}', precision=bias_precision, data=bias) + self.add_weights_variable(name='scale', var_name='s{index}', data=scale) + self.add_weights_variable(name='bias', var_name='b{index}', data=bias) class Merge(Layer): @@ -1467,32 +1464,22 @@ class MultiHeadAttention(Layer): ] def initialize(self): - weights_source = [ - ('attention_output', 'kernel'), - ('attention_output', 'bias'), - ('key', 'kernel'), - ('key', 'bias'), - ('query', 'kernel'), - ('query', 'bias'), - ('value', 'kernel'), - ('value', 'bias'), + weights = [ + 'attention_output_weight', + 'attention_output_bias', + 'key_weight', + 'key_bias', + 'query_weight', + 'query_bias', + 'value_weight', + 'value_bias', ] - for lname, wtype in weights_source: - data = self.model.get_weights_data(self.name, f'{lname}/{wtype}') - if wtype == 'kernel': - vtype = 'weight' - if lname in ['key', 'query', 'value']: - data = data.transpose((1, 0, 2)) - # data = data.transpose((0, 2, 1)) ### - # if lname in ['attention_output']: - # data = data.transpose((2,0,1)) ### - else: - vtype = 'bias' - - name = f'{lname}_{vtype}' - var_name = f'{lname}_{vtype}{{index}}' - self.add_weights_variable(name=name, var_name=var_name, data=data) + for w in weights: + data_name = f'{w}_data' + var_name = f'{w}{{index}}' + data = self.get_attr(data_name) + self.add_weights_variable(name=w, var_name=var_name, data=data) shape = self.attributes['query_shape'][1:] dims = [f'seq_out_{self.index}', f'feature_out_{self.index}'] diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index bb24f2206e..80ad3d7437 100644 --- 
a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -51,7 +51,7 @@ def _infer_precision(self, node, types_to_infer): if node_class in ['Dense']: return self._infer_dense_precision(node, types_to_infer) - if node_class in ['BatchNormalization', 'ApplyAlpha']: + if node_class in ['BatchNormalization', 'ApplyAlpha', 'LayerNormalization']: return self._infer_bn_precision(node, types_to_infer) if node_class in ['Conv1D', 'Conv2D', 'PointwiseConv1D', 'PointwiseConv2D', 'Conv2DBatchnorm']: @@ -84,6 +84,9 @@ def _infer_precision(self, node, types_to_infer): if node_class in ['SimpleRNN', 'LSTM', 'GRU']: return self._infer_rnn_precision(node, types_to_infer) + if node_class in ['MultiHeadAttention']: + return self._infer_mha_precision(node, types_to_infer) + # What about quantized activation layer? Setting it to 'auto' manually will break it here. We should prevent # this in config_from_* functions @@ -557,3 +560,58 @@ def _infer_rnn_precision(self, node, types_to_infer): inferred_types.append(f'{weightvar}_t') return inferred_types + + def _infer_mha_precision(self, node, types_to_infer): + inferred_types = [] + + for weightvar in ( + 'attention_output_weight', + 'attention_output_bias', + 'key_weight', + 'key_bias', + 'query_weight', + 'query_bias', + 'value_weight', + 'value_bias', + ): + if f'{weightvar}_t' in types_to_infer: + self._infer_default_type(node, f'{weightvar}_t') + node.weights[weightvar].update_precision(node.types[f'{weightvar}_t'].precision) + inferred_types.append(f'{weightvar}_t') + + if 'result_t' in types_to_infer: + input_precision = node.get_input_variable().type.precision + weight_precision = node.types['attention_output_weight_t'].precision + bias_precision = node.types['attention_output_bias_t'].precision + + if self._all_supported_types((input_precision, weight_precision, bias_precision)): + + after_weight_width = input_precision.width + weight_precision.width + after_weight_integer = input_precision.integer + weight_precision.integer + after_weight_signed = input_precision.signed or weight_precision.signed + + out_signed = after_weight_signed or bias_precision.signed + out_integer = ( + max( + after_weight_integer + (bias_precision.signed and not after_weight_signed), + bias_precision.integer + (after_weight_signed and not bias_precision.signed), + ) + + 1 + ) + out_width = out_integer + max(after_weight_width - after_weight_integer, bias_precision.fractional) + + # Apply max precision constraints if specified in model config + max_precision = self._get_maximum_precision(node) + if max_precision is not None: + out_width = min(out_width, max_precision.width) + out_integer = min(out_integer, max_precision.integer) + + out_precision = FixedPrecisionType(out_width, out_integer, out_signed) + else: + out_precision = self._get_default_precision(node) + + node.types['result_t'].name = f'{node.name}_result_t' + node.types['result_t'].precision = out_precision + inferred_types.append('result_t') + + return inferred_types diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h index 6b4e0d4b91..c3e550460c 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h @@ -322,10 +322,6 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { int data_round; int index; - // std::cout << "input to SM: " << std::endl; ///// - // nnet::print_result(data, std::cout); ///// - // 
std::cout << " " << std::endl; ///// - #pragma HLS array_partition variable=data_cache complete typename CONFIG_T::accum_t denominator; @@ -334,41 +330,48 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { denominator = 0; for (int ii = 0; ii < CONFIG_T::n_in; ii++) { data_round = data[ii] * (CONFIG_T::table_size / (exp_range * 2)); - // std::cout << " data, round: " << data[ii] << " " << data_round << std::endl; ///// index = data_round + exp_range * (CONFIG_T::table_size / (exp_range * 2)); - // std::cout << " index: " << index; ///// if (index < 0) index = 0; if (index > CONFIG_T::table_size - 1) index = CONFIG_T::table_size - 1; denominator += exp_table[index]; - // std::cout << " denominator " << index << std::endl; ///// - // std::cout << " denominator " << denominator << std::endl; ///// data_cache[ii] = exp_table[index]; } - // std::cout << "end " << std::endl; ///// // using lookup table for inverse int exp_res_index = denominator * (CONFIG_T::table_size / inv_range); - // std::cout << " denominator: " << denominator << std::endl; ///// - // std::cout << " table_size: " << CONFIG_T::table_size << std::endl; ///// - // std::cout << " inv_range: " << inv_range << std::endl; ///// - // std::cout << " exp_res_index: " << exp_res_index << std::endl; ///// if (exp_res_index < 0) exp_res_index = 0; if (exp_res_index > CONFIG_T::table_size - 1) exp_res_index = CONFIG_T::table_size - 1; deno_inver = invert_table[exp_res_index]; - // std::cout << " deno_inver: " << deno_inver << std::endl; ///// for (int ii = 0; ii < CONFIG_T::n_in; ii++) { res[ii] = (res_T)(data_cache[ii] * deno_inver); } +} + +template +void softmax_argmax(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) { + for (int i = 0; i < CONFIG_T::n_in; i++) { + #pragma HLS UNROLL + res[i] = (res_T)0; + } + + data_T maximum = data[0]; + int idx = 0; + + for (int i = 1; i < CONFIG_T::n_in; i++) { + #pragma HLS PIPELINE + if (data[i] > maximum) { + maximum = data[i]; + idx = i; + } + } - // std::cout << "out SM: " << std::endl; - // nnet::print_result(res, std::cout); - // std::cout << " " << std::endl; + res[idx] = (res_T)1; } template diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h index 19cd5a63bb..61975bb3c4 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h @@ -47,12 +47,6 @@ void dense(data_T data[CONFIG_T::n_in * CONFIG_T::seq_len], res_T res[CONFIG_T:: dense_resource(data, res, weights, biases); } } - - // std::cout << "out Dense: " << std::endl; - // for(int i=0; i < CONFIG_T::n_out*CONFIG_T::seq_len; ++i) { - // std::cout << res[i] << " "; - // } - // std::cout << std::endl; } } // namespace nnet diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h index 7471ed2a74..6cab828178 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h @@ -130,7 +130,7 @@ void layernormalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], typename CONFIG_T::bias_t bias[CONFIG_T::n_in / CONFIG_T::seq_len]) { static const unsigned dim = CONFIG_T::n_in / CONFIG_T::seq_len; data_T in_val[dim]; - data_T outval[dim]; + res_T outval[dim]; // Use a function_instantiate in case it helps to explicitly optimize unchanging weights/biases #pragma HLS function_instantiate variable=scale,bias diff --git 
a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h index 72d90dfd0a..3543b0d007 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h @@ -34,12 +34,7 @@ struct multiheadattention_config { template using product = nnet::product::mult; }; - template struct datapack { data_T data[PackSize]; }; -template -struct datapack { - data_T data[PackSize]; -}; template void read_stream_array(hls::stream data_in[size], data_T out[size]) { for (int k = 0; k < size; ++k) { @@ -48,39 +43,17 @@ template void read_stream_array(hls::stream dat } } -////////////////// -//Dennis version// -////////////////// -//template -//struct datapack { -// typename CONFIG_T::multi_t data[PackSize]; -//}; -// -//template -//void read_stream_array( -// hls::stream data_in[size], -// typename CONFIG_T::multi_t out[size] -//) -//{ -// for (int k=0; k -void matrixmul_transpose( - hls::stream> &Q, - hls::stream> &K, - res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K +template +void matrixmul_transpose(hls::stream> &Q, + hls::stream> &K, + res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K { const data_T dk = 1.0 / sqrt(CONFIG_T::head_dim_key); data_T QK_1; typename CONFIG_T::accum_t QKij; data_T Qi[CONFIG_T::head_dim_key]; data_T Product[CONFIG_T::seq_len]; // seq_Q, seq_K - data_T qk_smout[CONFIG_T::seq_len]; + res_T qk_smout[CONFIG_T::seq_len]; data_T krow[CONFIG_T::seq_len * CONFIG_T::head_dim_key]; #pragma HLS ARRAY_PARTITION variable=Qi complete #pragma HLS ARRAY_PARTITION variable=Product complete @@ -94,8 +67,8 @@ void matrixmul_transpose( #pragma HLS DATA_PACK variable=datak_pack #pragma HLS DATA_PACK variable=dataq_pack - int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_key) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); + // int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_key) / float(CONFIG_T::reuse_factor)); + // CONFIG_T::template product::limit(multiplier_limit); prep_k: for (int i = 0; i < CONFIG_T::seq_len; ++i) { @@ -107,7 +80,6 @@ void matrixmul_transpose( } } -// for each row and column of AB row: for (int i = 0; i < CONFIG_T::seq_len; ++i) { #pragma HLS PIPELINE II=CONFIG_T::reuse_factor @@ -120,7 +92,6 @@ void matrixmul_transpose( } col: for (int j = 0; j < CONFIG_T::seq_len; ++j) { - // compute (QK)i,j QKij = 0; product: for (int k = 0; k < CONFIG_T::head_dim_key; ++k) { @@ -137,10 +108,9 @@ void matrixmul_transpose( } } -///////// template void matrixmul(data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], hls::stream> &V, - hls::stream S[CONFIG_T::head_dim_value]) // S: attention score + hls::stream S[CONFIG_T::head_dim_value]) // S: attention score { #pragma HLS DATA_PACK variable=V #pragma HLS ARRAY_PARTITION variable=QK complete dim=2 @@ -149,8 +119,8 @@ void matrixmul(data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], hls::stream datav_pack; #pragma HLS DATA_PACK variable=datav_pack - int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_value) / float(CONFIG_T::reuse_factor)); - CONFIG_T::template product::limit(multiplier_limit); + // int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_value) / float(CONFIG_T::reuse_factor)); + // CONFIG_T::template product::limit(multiplier_limit); data_T dataV[CONFIG_T::seq_len * CONFIG_T::head_dim_value]; #pragma HLS ARRAY_PARTITION variable = dataV 
complete dim = 1 @@ -164,10 +134,9 @@ void matrixmul(data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], hls::stream void lin_projection(hls::stream data_q[CONFIG_T::feature_dim], hls::stream data_vk[CONFIG_T::feature_dim], - hls::stream> &k_proj, - hls::stream> &q_proj, - hls::stream> &v_proj, + hls::stream> &k_proj, + hls::stream> &q_proj, + hls::stream> &v_proj, typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_key], typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key], typename CONFIG_T::bias_t query_bias[CONFIG_T::head_dim_key], typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value], - typename CONFIG_T::bias_t value_bias[CONFIG_T::head_dim_value]) - -{ + typename CONFIG_T::bias_t value_bias[CONFIG_T::head_dim_value]) { #pragma HLS DATA_PACK variable=k_proj #pragma HLS DATA_PACK variable=q_proj #pragma HLS DATA_PACK variable=v_proj @@ -225,9 +191,9 @@ void lin_projection(hls::stream data_q[CONFIG_T::feature_dim], hls::stre #pragma HLS ARRAY_PARTITION variable=in_q complete dim=1 #pragma HLS ARRAY_PARTITION variable=in_v complete dim=1 - datapack proj_k_pack; - datapack proj_q_pack; - datapack proj_v_pack; + datapack proj_k_pack; + datapack proj_q_pack; + datapack proj_v_pack; #pragma HLS DATA_PACK variable=proj_k_pack #pragma HLS DATA_PACK variable=proj_q_pack #pragma HLS DATA_PACK variable=proj_v_pack @@ -303,11 +269,11 @@ void multiheadattention( typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) { hls::stream d_value[CONFIG_T::num_heads][CONFIG_T::feature_dim]; hls::stream d_query[CONFIG_T::num_heads][CONFIG_T::feature_dim]; - hls::stream> q_proj[CONFIG_T::num_heads]; - hls::stream> k_proj[CONFIG_T::num_heads]; - hls::stream> v_proj[CONFIG_T::num_heads]; - data_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; - hls::stream matr_out[CONFIG_T::num_heads][CONFIG_T::head_dim_value]; + hls::stream> q_proj[CONFIG_T::num_heads]; + hls::stream> k_proj[CONFIG_T::num_heads]; + hls::stream> v_proj[CONFIG_T::num_heads]; + res_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len]; + hls::stream matr_out[CONFIG_T::num_heads][CONFIG_T::head_dim_value]; #pragma HLS DATAFLOW #pragma HLS ARRAY_PARTITION variable=d_query complete dim=1 @@ -316,9 +282,6 @@ void multiheadattention( #pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1 #pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1 #pragma HLS ARRAY_PARTITION variable=matr_out complete dim=1 - // std::cout << "input to MHA: " << std::endl; - // nnet::print_result(data_q, std::cout); - // std::cout << " " << std::endl; prepq: for (int i = 0; i < CONFIG_T::num_heads; ++i) { @@ -331,7 +294,6 @@ void multiheadattention( nnet::data_prep(data_vk, d_value[i]); } -// linear projection lin_proj: for (int i = 0; i < CONFIG_T::num_heads; ++i) { #pragma HLS UNROLL @@ -346,19 +308,16 @@ void multiheadattention( maxtrixmul1: for (int i = 0; i < CONFIG_T::num_heads; ++i) { #pragma HLS UNROLL - nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); + nnet::matrixmul_transpose(q_proj[i], k_proj[i], qk_mul[i]); } maxtrixmul2: for (int i = 0; i < CONFIG_T::num_heads; ++i) { #pragma HLS UNROLL - nnet::matrixmul(qk_mul[i], v_proj[i], matr_out[i]); // stream + nnet::matrixmul(qk_mul[i], v_proj[i], matr_out[i]); // stream } - nnet::dense_out(matr_out, res, attention_output_weight, attention_output_bias); - // std::cout << "out 
MHA: " << std::endl; - // nnet::print_result(res, std::cout); - // std::cout << " " << std::endl; + nnet::dense_out(matr_out, res, attention_output_weight, attention_output_bias); } } // namespace nnet diff --git a/test/pytest/test_layernorm.py b/test/pytest/test_layernorm.py index 7b08df9369..592558188c 100644 --- a/test/pytest/test_layernorm.py +++ b/test/pytest/test_layernorm.py @@ -16,8 +16,7 @@ @pytest.fixture(scope='module') def data(): np.random.seed(0) - X = np.random.rand(100, *in_shape) - return X + return np.random.rand(100, *in_shape) @pytest.fixture(scope='module') diff --git a/test/pytest/test_layernorm_pytorch.py b/test/pytest/test_layernorm_pytorch.py index a38066ec8c..87a99a5bf0 100644 --- a/test/pytest/test_layernorm_pytorch.py +++ b/test/pytest/test_layernorm_pytorch.py @@ -16,8 +16,7 @@ @pytest.fixture(scope='module') def data(): np.random.seed(0) - X = np.random.rand(100, *in_shape) - return X + return np.random.rand(100, *in_shape) @pytest.fixture(scope='module') diff --git a/test/pytest/test_multiheadattention.py b/test/pytest/test_multiheadattention.py new file mode 100644 index 0000000000..db03788ed3 --- /dev/null +++ b/test/pytest/test_multiheadattention.py @@ -0,0 +1,55 @@ +from pathlib import Path + +import numpy as np +import pytest +from tensorflow.keras import Model +from tensorflow.keras.layers import Input, MultiHeadAttention + +import hls4ml + +test_root_path = Path(__file__).parent + +batch_size = 50 +seq_len_q = 10 +seq_len_kv = 8 +num_heads = 2 +key_dim = 2 + +atol = 1e-1 # it's pretty bad right now + +np.random.seed(0) + + +@pytest.fixture(scope='module') +def query_data(): + return np.random.rand(batch_size, seq_len_q, num_heads * key_dim) + + +@pytest.fixture(scope='module') +def key_value_data(): + return np.random.rand(batch_size, seq_len_kv, num_heads * key_dim) + + +@pytest.fixture(scope='module') +def model(): + query_input = Input(shape=(seq_len_q, num_heads * key_dim), batch_size=batch_size) + key_value_input = Input(shape=(seq_len_kv, num_heads * key_dim), batch_size=batch_size) + mha_layer = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(query_input, key_value_input) + model = Model(inputs=[query_input, key_value_input], outputs=mha_layer) + model.compile() + return model + + +# Currently only Vivado in io_parallel mode is supported +def test_multiheadattention(model, query_data, key_value_data): + config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Vivado') + output_dir = str(test_root_path / 'hls4mlprj_multiheadattention_Vivado_io_parallel') + hls_model = hls4ml.converters.convert_from_keras_model( + model, backend='Vivado', hls_config=config, io_type='io_parallel', output_dir=output_dir + ) + hls_model.compile() + + # Predict + y_keras = model.predict([query_data, key_value_data]).flatten() + y_hls = hls_model.predict([query_data, key_value_data]).flatten() + np.testing.assert_allclose(y_keras, y_hls, rtol=0, atol=atol, verbose=True) From 5f3ec0077f22196b7a309ed2be5c7b582c951e25 Mon Sep 17 00:00:00 2001 From: Rian Flynn Date: Fri, 20 Sep 2024 22:26:08 +0800 Subject: [PATCH 45/55] multihead attention working on keras and pytorch --- .../converters/pytorch/multiheadattention.py | 54 +++++++++++++++ test/pytest/test_layernorm_pytorch.py | 4 +- test/pytest/test_multiheadattention.py | 19 +++--- .../pytest/test_multiheadattention_pytorch.py | 67 +++++++++++++++++++ 4 files changed, 130 insertions(+), 14 deletions(-) create mode 100644 hls4ml/converters/pytorch/multiheadattention.py create mode 100644 
test/pytest/test_multiheadattention_pytorch.py diff --git a/hls4ml/converters/pytorch/multiheadattention.py b/hls4ml/converters/pytorch/multiheadattention.py new file mode 100644 index 0000000000..7c53aeeb54 --- /dev/null +++ b/hls4ml/converters/pytorch/multiheadattention.py @@ -0,0 +1,54 @@ +import numpy as np + +from hls4ml.converters.pytorch_to_hls import pytorch_handler + + +@pytorch_handler('MultiheadAttention') +def parse_multiheadattention_layer( + operation, layer_name, input_names, input_shapes, node, class_object, data_reader, config +): + assert 'MultiheadAttention' in operation + assert len(input_shapes) == 3 + + layer = {} + + layer['class_name'] = 'MultiHeadAttention' + layer['name'] = layer_name + layer['inputs'] = input_names + + layer['num_heads'] = class_object.num_heads + layer['head_dim_key'] = class_object.kdim // layer['num_heads'] + layer['head_dim_value'] = class_object.vdim // layer['num_heads'] + layer['query_shape'] = input_shapes[0] + layer['key_shape'] = input_shapes[1] + layer['value_shape'] = input_shapes[2] + + if not (len(layer['query_shape']) == len(layer['key_shape']) == len(layer['value_shape']) == 3): + raise Exception('only 3D shapes for query, key, and value are currently supported by hls4ml') + + layer['feature_dim'] = class_object.embed_dim + layer['seq_len'] = layer['query_shape'][-2] + + output_shape = layer['query_shape'] + + layer['attention_axes'] = [1] + layer['softmax_axis'] = [3] + + in_proj_weights = class_object.in_proj_weight.data.numpy() + in_proj_bias = class_object.in_proj_bias.data.numpy() + + weight_data = np.split(in_proj_weights, [class_object.embed_dim, class_object.embed_dim + class_object.kdim], axis=0) + bias_data = np.split(in_proj_bias, [class_object.embed_dim, class_object.embed_dim + class_object.kdim], axis=0) + + for weight_type, weight, bias in zip(['query', 'key', 'value'], weight_data, bias_data): + layer[f'{weight_type}_weight_data'] = weight.T.reshape( + layer['feature_dim'], layer['num_heads'], layer['head_dim_key'] + ).transpose(1, 0, 2) + layer[f'{weight_type}_bias_data'] = bias.reshape(layer['num_heads'], layer['head_dim_key']) + + layer['attention_output_weight_data'] = class_object.out_proj.weight.data.numpy().T.reshape( + layer['num_heads'], layer['head_dim_key'], layer['feature_dim'] + ) + layer['attention_output_bias_data'] = class_object.out_proj.bias.data.numpy() + + return layer, output_shape diff --git a/test/pytest/test_layernorm_pytorch.py b/test/pytest/test_layernorm_pytorch.py index 87a99a5bf0..558db6f252 100644 --- a/test/pytest/test_layernorm_pytorch.py +++ b/test/pytest/test_layernorm_pytorch.py @@ -21,9 +21,7 @@ def data(): @pytest.fixture(scope='module') def model(): - model = nn.Sequential( - nn.LayerNorm(in_shape[-1]), - ).to() + model = nn.Sequential(nn.LayerNorm(in_shape[-1])) model.eval() return model diff --git a/test/pytest/test_multiheadattention.py b/test/pytest/test_multiheadattention.py index db03788ed3..1052039791 100644 --- a/test/pytest/test_multiheadattention.py +++ b/test/pytest/test_multiheadattention.py @@ -9,31 +9,28 @@ test_root_path = Path(__file__).parent -batch_size = 50 -seq_len_q = 10 -seq_len_kv = 8 +batch_size = 100 +seq_len = 10 num_heads = 2 -key_dim = 2 +key_dim = 4 -atol = 1e-1 # it's pretty bad right now - -np.random.seed(0) +atol = 2e-2 @pytest.fixture(scope='module') def query_data(): - return np.random.rand(batch_size, seq_len_q, num_heads * key_dim) + return np.random.rand(batch_size, seq_len, num_heads * key_dim) @pytest.fixture(scope='module') def 
key_value_data(): - return np.random.rand(batch_size, seq_len_kv, num_heads * key_dim) + return np.random.rand(batch_size, seq_len, num_heads * key_dim) @pytest.fixture(scope='module') def model(): - query_input = Input(shape=(seq_len_q, num_heads * key_dim), batch_size=batch_size) - key_value_input = Input(shape=(seq_len_kv, num_heads * key_dim), batch_size=batch_size) + query_input = Input(shape=(seq_len, num_heads * key_dim)) + key_value_input = Input(shape=(seq_len, num_heads * key_dim)) mha_layer = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(query_input, key_value_input) model = Model(inputs=[query_input, key_value_input], outputs=mha_layer) model.compile() diff --git a/test/pytest/test_multiheadattention_pytorch.py b/test/pytest/test_multiheadattention_pytorch.py new file mode 100644 index 0000000000..862a0784fc --- /dev/null +++ b/test/pytest/test_multiheadattention_pytorch.py @@ -0,0 +1,67 @@ +from pathlib import Path + +import numpy as np +import pytest +import torch +from torch import nn + +import hls4ml + +test_root_path = Path(__file__).parent + +batch_size = 100 +seq_len = 10 +num_heads = 2 +embed_dim = 8 + +atol = 2e-2 + + +@pytest.fixture(scope='module') +def query_data(): + return np.random.rand(batch_size, seq_len, embed_dim) + + +@pytest.fixture(scope='module') +def key_value_data(): + return np.random.rand(batch_size, seq_len, embed_dim) + + +class MultiHeadAttentionModel(nn.Module): + def __init__(self): + super().__init__() + self.mha = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True) + + def forward(self, query, key, value): + output, _ = self.mha(query, key, value) + return output + + +# Currently only Vivado in io_parallel mode is supported +def test_multiheadattention(query_data, key_value_data): + model = MultiHeadAttentionModel() + model.eval() + + config = hls4ml.utils.config_from_pytorch_model( + model, + [(seq_len, embed_dim), (seq_len, embed_dim), (seq_len, embed_dim)], + granularity='name', + backend='Vivado', + channels_last_conversion='off', + transpose_outputs=False, + ) + output_dir = str(test_root_path / 'hls4mlprj_multiheadattention_pytorch_Vivado_io_parallel') + hls_model = hls4ml.converters.convert_from_pytorch_model( + model, backend='Vivado', hls_config=config, io_type='io_parallel', output_dir=output_dir + ) + hls_model.compile() + + # Predict + y_pytorch = ( + model(torch.Tensor(query_data), torch.Tensor(key_value_data), torch.Tensor(key_value_data)) + .detach() + .numpy() + .flatten() + ) + y_hls = hls_model.predict([query_data, key_value_data, key_value_data]).flatten() + np.testing.assert_allclose(y_pytorch, y_hls, rtol=0, atol=atol, verbose=True) From 5697334cbe6070338a8b8127d6dcbe380a772815 Mon Sep 17 00:00:00 2001 From: Rian Flynn Date: Wed, 25 Sep 2024 15:24:16 +0800 Subject: [PATCH 46/55] fiddly precision / accuracy changes for layernorm --- hls4ml/backends/vivado/passes/core_templates.py | 1 + hls4ml/backends/vivado/vivado_backend.py | 8 ++++++-- hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h | 1 - test/pytest/test_layernorm.py | 2 +- test/pytest/test_layernorm_pytorch.py | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py index 68ef21b882..323b8c3312 100644 --- a/hls4ml/backends/vivado/passes/core_templates.py +++ b/hls4ml/backends/vivado/passes/core_templates.py @@ -129,6 +129,7 @@ def format(self, node): static constexpr double epsilon = {epsilon}; typedef {bias_t.name} bias_t; typedef 
{scale_t.name} scale_t; + typedef {mean_t.name} mean_t; typedef {table_t.name} table_t; template using product = nnet::product::{product_type}; diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index da5fc158fb..dd98e4cd61 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -519,12 +519,16 @@ def init_softmax(self, layer): def init_layernormalization(self, layer): if 'table_t' not in layer.attributes: layer.set_attr( - 'table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=32, integer=8)) + 'table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=30, integer=10)) ) if 'table_size' not in layer.attributes: - layer.set_attr('table_size', 2048) # table size + layer.set_attr('table_size', 1024) # table size if 'table_range' not in layer.attributes: layer.set_attr('table_range', 1.0) # table range + if 'mean_t' not in layer.attributes: + layer.set_attr( + 'mean_t', NamedType(name=layer.name + '_mean_t', precision=FixedPrecisionType(width=19, integer=6)) + ) @layer_optimizer(Embedding) def init_embed(self, layer): diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h index 6cab828178..4aa994a793 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h @@ -14,7 +14,6 @@ struct layernorm_config { // Internal data type definitions typedef float bias_t; typedef float scale_t; - typedef ap_fixed<24, 6> mean_t; // Layer Sizes static const unsigned n_in = 20; diff --git a/test/pytest/test_layernorm.py b/test/pytest/test_layernorm.py index 592558188c..f180c4f318 100644 --- a/test/pytest/test_layernorm.py +++ b/test/pytest/test_layernorm.py @@ -10,7 +10,7 @@ test_root_path = Path(__file__).parent in_shape = (4, 5) -atol = 1e-2 +atol = 5e-2 @pytest.fixture(scope='module') diff --git a/test/pytest/test_layernorm_pytorch.py b/test/pytest/test_layernorm_pytorch.py index 558db6f252..c553bb41f1 100644 --- a/test/pytest/test_layernorm_pytorch.py +++ b/test/pytest/test_layernorm_pytorch.py @@ -10,7 +10,7 @@ test_root_path = Path(__file__).parent in_shape = (4, 5) -atol = 1e-2 +atol = 5e-2 @pytest.fixture(scope='module') From a149f2ed166866b5c078d964fcd7f8c956cdf0cc Mon Sep 17 00:00:00 2001 From: Rian Brooks Flynn <112733140+rianbrooksflynn@users.noreply.github.com> Date: Tue, 22 Oct 2024 14:20:33 -0400 Subject: [PATCH 47/55] fix lookup table and label loops --- .../vivado/nnet_utils/nnet_layernorm.h | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h index 4aa994a793..f504c8875e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h @@ -57,19 +57,18 @@ void lookup_invert_sqr(typename CONFIG_T::mean_t x, typename CONFIG_T::table_t & return; } - // Binary search - int low = 0; - int high = CONFIG_T::table_size - 1; - while (high - low > 1) { - int mid = (low + high) / 2; - if (x > table_in[mid]) { - low = mid; - } else { - high = mid; + #pragma HLS PIPELINE +LAYERNORM_LOOKUP: + for (int i = 0; i < CONFIG_T::table_size - 1; i++) { + #pragma HLS UNROLL factor=4 + if (x <= table_in[i + 1] && x >= table_in[i]) { + res = table_out[i]; + return; } } - res = table_out[low]; + res = table_out[CONFIG_T::table_size - 1]; + return; } 
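As a quick cross-check of the rewritten lookup above, the computation this header performs can be sketched in NumPy. This sketch is not part of the patch: table_size=1024 and table_range=1.0 mirror the backend defaults set in PATCH 46, while the log-spaced table's lower bound (1e-6) and the epsilon default (1e-3) are illustrative assumptions.

import numpy as np

# Assumed table parameters; 1e-6 is a placeholder lower bound for the log-spaced inputs.
TABLE_SIZE = 1024
TABLE_RANGE = 1.0
table_in = np.exp(np.linspace(np.log(1e-6), np.log(TABLE_RANGE), TABLE_SIZE))
table_out = 1.0 / np.sqrt(table_in)

def lookup_invert_sqr(x):
    # Mirrors LAYERNORM_LOOKUP: return the entry whose input interval contains x,
    # clamping to the first/last entry outside the table range.
    if x <= table_in[0]:
        return table_out[0]
    idx = int(np.searchsorted(table_in, x)) - 1
    return table_out[min(idx, TABLE_SIZE - 1)]

def layernorm_1d(data, scale, bias, epsilon=1e-3):
    # Mirrors layernorm_1d: mean and variance over the feature dimension,
    # then the table lookup in place of an exact 1/sqrt(var + epsilon).
    mean = data.mean()
    var = ((data - mean) ** 2).mean()
    return (data - mean) * lookup_invert_sqr(var + epsilon) * scale + bias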
template @@ -105,11 +104,14 @@ void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CON #pragma HLS ARRAY_PARTITION variable=data_diff complete const typename CONFIG_T::mean_t k_inv = 1.0 / dim; + +LAYERNORM_1D_SUM: for (int i = 0; i < dim; ++i) { sum_cache += static_cast(data[i]); } mean = CONFIG_T::template product::product(sum_cache, k_inv); +LAYERNORM_1D_VAR: for (int i = 0; i < dim; ++i) { data_diff[i] = static_cast(data[i]) - mean; diff = data_diff[i] * data_diff[i]; @@ -118,6 +120,7 @@ void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CON var = CONFIG_T::template product::product(sum_cache2, k_inv); lookup_invert_sqr(var + var_epsilon, deno_inver, index_table, invert_sqr_table); +LAYERNORM_1D_RESULT: for (int i = 0; i < dim; ++i) { res[i] = data_diff[i] * deno_inver * scale[i] + bias[i]; } @@ -138,15 +141,16 @@ void layernormalize(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in], #pragma HLS ARRAY_PARTITION variable=in_val complete #pragma HLS ARRAY_PARTITION variable=outval complete +LAYERNORM_SEQ_LOOP: for (int j = 0; j < CONFIG_T::seq_len; ++j) { #pragma HLS PIPELINE - load: + LAYERNORM_LOAD: for (int i = 0; i < dim; ++i) { #pragma HLS UNROLL in_val[i] = data[j * dim + i]; } layernorm_1d(in_val, outval, scale, bias); - store: + LAYERNORM_STORE: for (int i = 0; i < dim; ++i) { #pragma HLS UNROLL res[j * dim + i] = outval[i]; From 552fa836c4afae5f7df8a5e5b573411842451b62 Mon Sep 17 00:00:00 2001 From: Rian Brooks Flynn <112733140+rianbrooksflynn@users.noreply.github.com> Date: Wed, 23 Oct 2024 15:26:46 -0400 Subject: [PATCH 48/55] remove dense_seq --- .../vivado/passes/transformer_templates.py | 7 +- hls4ml/backends/vivado/vivado_backend.py | 4 - hls4ml/converters/keras/core.py | 4 - hls4ml/converters/keras/qkeras.py | 35 +----- hls4ml/model/layers.py | 1 - hls4ml/model/profiling.py | 1 - .../templates/vivado/nnet_utils/nnet_dense.h | 108 +++++++++--------- .../vivado/nnet_utils/nnet_dense_seq.h | 44 ------- hls4ml/utils/config.py | 6 +- test/pytest/test_precision_parsing.py | 2 +- 10 files changed, 61 insertions(+), 151 deletions(-) delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py index fafc10859e..8f10a06f22 100644 --- a/hls4ml/backends/vivado/passes/transformer_templates.py +++ b/hls4ml/backends/vivado/passes/transformer_templates.py @@ -6,7 +6,6 @@ mult_config_template = """struct config{index}_{mNum} : nnet::dense_config {{ static const unsigned n_in = {n_in}; static const unsigned n_out = {n_out}; - static const unsigned seq_len = {seq_len}; static const unsigned strategy = nnet::{strategy}; static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; @@ -16,6 +15,8 @@ typedef {attention_output_bias_t.name} bias_t; typedef {attention_output_weight_t.name} weight_t; typedef ap_{index_t} index_t; + template + using kernel = nnet::{dense_function}; template using product = nnet::product::{product_type}; }};\n""" @@ -87,7 +88,6 @@ def format(self, node): mult_params1['mNum'] = '1' mult_params1['n_in'] = node.get_attr('feature_dim') mult_params1['n_out'] = node.get_attr('head_dim_key') - mult_params1['seq_len'] = 1 mult_params1['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('query_weight').type.precision ) @@ -95,6 +95,7 @@ def format(self, node): mult_params1['index'] = str(node.index) 
mult_params1['nzeros'] = 0 mult_params1['nonzeros'] = params['feature_dim'] * params['num_heads'] * params['head_dim_key'] + mult_params1['dense_function'] = 'DenseLatency' mult_config1 = self.mult1_template.format(**mult_params1) mult_params2 = self._default_config_params(node) @@ -102,7 +103,6 @@ def format(self, node): mult_params2['mNum'] = '2' mult_params2['n_in'] = node.get_attr('head_dim_value') * node.get_attr('num_heads') mult_params2['n_out'] = node.get_attr('feature_dim') - mult_params2['seq_len'] = 1 mult_params2['product_type'] = get_backend('vivado').product_type( node.get_input_variable().type.precision, node.get_weights('attention_output_weight').type.precision ) @@ -110,6 +110,7 @@ def format(self, node): mult_params2['index'] = str(node.index) mult_params2['nzeros'] = 0 mult_params2['nonzeros'] = params['feature_dim'] * params['num_heads'] * params['head_dim_key'] + mult_params2['dense_function'] = 'DenseLatency' mult_config2 = self.mult2_template.format(**mult_params2) act_params = self._default_config_params(node) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index 6a6a896906..53d3548bb9 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -502,10 +502,6 @@ def init_softmax(self, layer): layer.set_attr('inv_range', 128) if 'exp_range' not in layer.attributes: layer.set_attr('exp_range', 8) - if layer.model.config.is_resource_strategy(layer): - layer.set_attr('implementation', 'latency') - else: - layer.set_attr('implementation', layer.model.config.get_strategy(layer).lower()) if layer.model.config.get_config_value('IOType') == 'io_parallel': assert ( diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index 20bbabaf49..82e45b6cd5 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -47,10 +47,6 @@ def parse_dense_layer(keras_layer, input_names, input_shapes, data_reader): layer['bias_quantizer'] = None output_shape = input_shapes[0][:] output_shape[-1] = layer['n_out'] - if len(input_shapes[0]) == 3: - layer['seq_len'] = output_shape[-2] - else: - layer['seq_len'] = 1 return layer, output_shape diff --git a/hls4ml/converters/keras/qkeras.py b/hls4ml/converters/keras/qkeras.py index f184b8d540..3ff95ae168 100644 --- a/hls4ml/converters/keras/qkeras.py +++ b/hls4ml/converters/keras/qkeras.py @@ -1,8 +1,7 @@ from qkeras.quantizers import get_quantizer from hls4ml.converters.keras.convolution import parse_conv1d_layer, parse_conv2d_layer -from hls4ml.converters.keras.core import parse_batchnorm_layer, parse_dense_layer, parse_layernorm_layer -from hls4ml.converters.keras.multiheadattention import parse_mutiheadattention_layer +from hls4ml.converters.keras.core import parse_batchnorm_layer, parse_dense_layer from hls4ml.converters.keras.recurrent import parse_rnn_layer from hls4ml.converters.keras_to_hls import keras_handler, parse_default_keras_layer from hls4ml.model.quantizers import QKerasBinaryQuantizer, QKerasPO2Quantizer, QKerasQuantizer @@ -99,7 +98,6 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader) 'quantized_bits', 'binary', 'ternary', - 'quantized_softmax', ] layer = parse_default_keras_layer(keras_layer, input_names) @@ -159,9 +157,6 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader) else: layer['class_name'] = 'Activation' layer['activation'] = activation_config['class_name'].replace('quantized_', '') - if 
activation_config['class_name'] == 'quantized_softmax': - layer['class_name'] = 'Softmax' - layer['axis'] = keras_layer['config'].get('axis', -1) layer['activation_quantizer'] = activation_config return layer, [shape for shape in input_shapes[0]] @@ -186,31 +181,3 @@ def parse_qconv2dbatchnorm_layer(keras_layer, input_names, input_shapes, data_re temp_shape = intermediate_shape batch_layer, out_shape = parse_batchnorm_layer(keras_layer, input_names, temp_shape, data_reader) return {**conv_layer, **batch_layer}, out_shape - - -@keras_handler('QMultiHeadAttention') -def parse_qmultiheadattention_layer(keras_layer, input_names, input_shapes, data_reader, config): - assert 'QMultiHeadAttention' in keras_layer['class_name'] - assert input_shapes[0] == keras_layer['config']['query_shape'] - - layer, output_shape = parse_mutiheadattention_layer(keras_layer, input_names, input_shapes, data_reader, config) - - layer['weight_quantizer'] = get_quantizer_from_config(keras_layer, 'kernel') - if keras_layer['config']['bias_quantizer'] is not None: - layer['bias_quantizer'] = get_quantizer_from_config(keras_layer, 'bias') - else: - layer['bias_quantizer'] = None - - return layer, output_shape - - -@keras_handler('QLayerNormalization') -def parse_qlayernorm_layer(keras_layer, input_names, input_shapes, data_reader, config): - layer, output_shape = parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader, config) - - layer['mean_quantizer'] = get_quantizer_from_config(keras_layer, 'mean') - layer['variance_quantizer'] = get_quantizer_from_config(keras_layer, 'variance') - layer['beta_quantizer'] = get_quantizer_from_config(keras_layer, 'beta') - layer['gamma_quantizer'] = get_quantizer_from_config(keras_layer, 'gamma') - - return layer, output_shape diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 2ed2126813..b130580c71 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -399,7 +399,6 @@ class Dense(Layer): _expected_attributes = [ Attribute('n_in'), Attribute('n_out'), - Attribute('seq_len'), WeightAttribute('weight'), WeightAttribute('bias'), TypeAttribute('weight'), diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py index 92749705e4..519e8fabc7 100644 --- a/hls4ml/model/profiling.py +++ b/hls4ml/model/profiling.py @@ -232,7 +232,6 @@ def weights_hlsmodel(model, fmt='longform', plot='boxplot'): else: suffix = ['w', 'b'] name = layer.name - # print(name) for iw, weight in enumerate(layer.get_weights()): label = f'{name}/{suffix[iw]}' w = weight.data.flatten() diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h index 61975bb3c4..98f9a1c0f2 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h @@ -1,54 +1,54 @@ -#ifndef NNET_DENSE_H_ -#define NNET_DENSE_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_dense_latency.h" -#include "nnet_dense_resource.h" -#include "nnet_dense_seq.h" -#include "nnet_helpers.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -struct dense_config { - // Internal data type definitions - typedef float bias_t; - typedef float weight_t; - typedef float accum_t; - - // Layer Sizes - static const unsigned n_in = 10; - static const unsigned n_out = 10; - - // Resource reuse info - static const unsigned io_type = io_parallel; - static const unsigned strategy = latency; - static const unsigned reuse_factor = 1; - static const bool store_weights_in_bram = false; - static 
const unsigned n_zeros = 0; - // partitioning arrays cyclically to go with roll factors? - // Product function to use - template using product = nnet::product::mult; -}; - -template -void dense(data_T data[CONFIG_T::n_in * CONFIG_T::seq_len], res_T res[CONFIG_T::n_out * CONFIG_T::seq_len], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - #pragma HLS inline - if (CONFIG_T::seq_len > 1) { - dense_seq(data, res, weights, biases); - } else { - if (CONFIG_T::strategy == nnet::latency) { - dense_latency(data, res, weights, biases); - } else { - dense_resource(data, res, weights, biases); - } - } -} - -} // namespace nnet - -#endif +#ifndef NNET_DENSE_H_ +#define NNET_DENSE_H_ + +#include "hls_stream.h" +#include "nnet_common.h" +#include "nnet_dense_latency.h" +#include "nnet_dense_resource.h" +#include "nnet_function_stubs.h" +#include "nnet_helpers.h" +#include "nnet_mult.h" +#include + +namespace nnet { + +struct dense_config { + // Internal data type definitions + typedef float bias_t; + typedef float weight_t; + typedef float accum_t; + + // Layer Sizes + static const unsigned n_in = 10; + static const unsigned n_out = 10; + + // Resource reuse info + static const unsigned io_type = io_parallel; + static const unsigned strategy = latency; + static const unsigned reuse_factor = 1; + static const bool store_weights_in_bram = false; + static const unsigned n_zeros = 0; + // partitioning arrays cyclically to go with roll factors? + // Product function to use + template using product = nnet::product::mult; +}; + +template +void dense(data_T data[CONFIG_T::n_in * CONFIG_T::seq_len], res_T res[CONFIG_T::n_out * CONFIG_T::seq_len], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + #pragma HLS inline + if (CONFIG_T::seq_len > 1) { + dense_seq(data, res, weights, biases); + } else { + if (CONFIG_T::strategy == nnet::latency) { + dense_latency(data, res, weights, biases); + } else { + dense_resource(data, res, weights, biases); + } + } +} + +} // namespace nnet + +#endif diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h deleted file mode 100644 index 4b6e0d08e7..0000000000 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_seq.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef NNET_DENSE_SEQ_H_ -#define NNET_DENSE_SEQ_H_ - -#include "hls_stream.h" -#include "nnet_common.h" -#include "nnet_helpers.h" -#include "nnet_mult.h" -#include - -namespace nnet { - -template -void dense_seq(data_T data[CONFIG_T::n_in * CONFIG_T::seq_len], res_T res[CONFIG_T::n_out * CONFIG_T::seq_len], - typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], - typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { - #pragma HLS inline - - data_T in_val[CONFIG_T::n_in]; - #pragma HLS ARRAY_PARTITION variable=in_val complete - - if (CONFIG_T::strategy == nnet::latency) { - for (int j = 0; j < CONFIG_T::seq_len; ++j) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - for (int i = 0; i < CONFIG_T::n_in; ++i) { - #pragma HLS UNROLL - in_val[i] = data[j * CONFIG_T::n_in + i]; - } - dense_latency(in_val, res + (CONFIG_T::n_out * j), weights, biases); - } - } else { - for (int j = 0; j < CONFIG_T::seq_len; ++j) { - #pragma HLS PIPELINE II=CONFIG_T::reuse_factor - for (int i = 0; i < CONFIG_T::n_in; ++i) { - #pragma HLS UNROLL - in_val[i] = data[j * CONFIG_T::n_in + i]; - } - dense_resource(in_val, res + 
(CONFIG_T::n_out * j), weights, biases); - } - } -} - -} // namespace nnet - -#endif diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index 8a88493ca2..5cd17d02e9 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -72,11 +72,7 @@ def _get_precision_from_quantizer(quantizer): overflow = "AP_WRAP" if quantizer['class_name'] in supported_quantizers: - bits = quantizer['config']['bits'] - if isinstance(bits, list): - bits = int(bits[0]) - else: - bits = int(bits) + bits = int(quantizer['config']['bits']) # if integer isn't specified, it should be the same as bits integer = int(quantizer['config'].get('integer', bits - 1)) + 1 # for quantizers use the following default rounding and overflow diff --git a/test/pytest/test_precision_parsing.py b/test/pytest/test_precision_parsing.py index 5569a3a6ad..2796bece57 100644 --- a/test/pytest/test_precision_parsing.py +++ b/test/pytest/test_precision_parsing.py @@ -21,7 +21,7 @@ ], ) def test_sign_parsing(prec_pair): - '''Test that convert_precions_string determines the signedness correctly''' + '''Test that convert_precision_string determines the signedness correctly''' strprec = prec_pair[0] signed = prec_pair[1] From be5f5a4fcb7f692d9d623e1ea021c91ef29fed74 Mon Sep 17 00:00:00 2001 From: Rian Brooks Flynn <112733140+rianbrooksflynn@users.noreply.github.com> Date: Wed, 23 Oct 2024 15:58:18 -0400 Subject: [PATCH 49/55] undo qkeras changes --- hls4ml/converters/keras/qkeras.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hls4ml/converters/keras/qkeras.py b/hls4ml/converters/keras/qkeras.py index 3ff95ae168..7357d95aed 100644 --- a/hls4ml/converters/keras/qkeras.py +++ b/hls4ml/converters/keras/qkeras.py @@ -157,6 +157,7 @@ def parse_qactivation_layer(keras_layer, input_names, input_shapes, data_reader) else: layer['class_name'] = 'Activation' layer['activation'] = activation_config['class_name'].replace('quantized_', '') + layer['activation_quantizer'] = activation_config return layer, [shape for shape in input_shapes[0]] From adf7356d8b186491df7b1fb971ca0892ccbe99a0 Mon Sep 17 00:00:00 2001 From: Rian Brooks Flynn <112733140+rianbrooksflynn@users.noreply.github.com> Date: Thu, 24 Oct 2024 16:45:53 -0400 Subject: [PATCH 50/55] fix merge conflict residue --- hls4ml/templates/vivado/nnet_utils/nnet_dense.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h index dbdcd934ed..d6c7beb70e 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense.h @@ -38,7 +38,7 @@ struct dense_config { }; template -void dense(data_T data[CONFIG_T::n_in * CONFIG_T::seq_len], res_T res[CONFIG_T::n_out * CONFIG_T::seq_len], +void dense(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_out], typename CONFIG_T::weight_t weights[CONFIG_T::n_in * CONFIG_T::n_out], typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { #pragma HLS INLINE From 39ab36ce118d4b4da3ee198b77baa53cd14aa55d Mon Sep 17 00:00:00 2001 From: Rian Brooks Flynn <112733140+rianbrooksflynn@users.noreply.github.com> Date: Mon, 4 Nov 2024 10:17:16 -0500 Subject: [PATCH 51/55] remove non-layernorm changes --- hls4ml/backends/fpga/fpga_backend.py | 2 - .../backends/vivado/passes/core_templates.py | 4 - .../vivado/passes/transformer_templates.py | 149 ---- hls4ml/backends/vivado/vivado_backend.py | 33 - hls4ml/converters/keras/multiheadattention.py | 60 -- hls4ml/converters/keras_to_hls.py | 664 +++++++++--------- 
From 39ab36ce118d4b4da3ee198b77baa53cd14aa55d Mon Sep 17 00:00:00 2001
From: Rian Brooks Flynn <112733140+rianbrooksflynn@users.noreply.github.com>
Date: Mon, 4 Nov 2024 10:17:16 -0500
Subject: [PATCH 51/55] remove non-layernorm changes

---
 hls4ml/backends/fpga/fpga_backend.py          |   2 -
 .../backends/vivado/passes/core_templates.py  |   4 -
 .../vivado/passes/transformer_templates.py    | 149 ----
 hls4ml/backends/vivado/vivado_backend.py      |  33 -
 hls4ml/converters/keras/multiheadattention.py |  60 --
 hls4ml/converters/keras_to_hls.py             | 664 +++++++++--------
 hls4ml/converters/pytorch/core.py             |   4 -
 .../converters/pytorch/multiheadattention.py  |  54 --
 hls4ml/model/layers.py                        |  49 --
 .../model/optimizer/passes/infer_precision.py |  58 --
 .../vivado/nnet_utils/nnet_activation.h       |  83 +--
 .../nnet_utils/nnet_multiheadattention.h      | 324 ---------
 test/pytest/test_multiheadattention.py        |  52 --
 .../pytest/test_multiheadattention_pytorch.py |  67 --
 test/pytest/test_precision_parsing.py         |  29 -
 15 files changed, 372 insertions(+), 1260 deletions(-)
 delete mode 100644 hls4ml/backends/vivado/passes/transformer_templates.py
 delete mode 100644 hls4ml/converters/keras/multiheadattention.py
 delete mode 100644 hls4ml/converters/pytorch/multiheadattention.py
 delete mode 100644 hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
 delete mode 100644 test/pytest/test_multiheadattention.py
 delete mode 100644 test/pytest/test_multiheadattention_pytorch.py
 delete mode 100644 test/pytest/test_precision_parsing.py

diff --git a/hls4ml/backends/fpga/fpga_backend.py b/hls4ml/backends/fpga/fpga_backend.py
index c11573f77e..a9fc09b7aa 100644
--- a/hls4ml/backends/fpga/fpga_backend.py
+++ b/hls4ml/backends/fpga/fpga_backend.py
@@ -26,7 +26,6 @@
     GlobalPooling2D,
     MatMul,
     Merge,
-    MultiHeadAttention,
     Pooling1D,
     Pooling2D,
     Quant,
@@ -71,7 +70,6 @@ def __init__(self, name):
             Dot,
             Conv,
             MatMul,
-            MultiHeadAttention,
         ]

         for layer in accum_layers:
diff --git a/hls4ml/backends/vivado/passes/core_templates.py b/hls4ml/backends/vivado/passes/core_templates.py
index 19a715f101..c7f5b490a9 100644
--- a/hls4ml/backends/vivado/passes/core_templates.py
+++ b/hls4ml/backends/vivado/passes/core_templates.py
@@ -17,7 +17,6 @@
 dense_config_template = """struct config{index} : nnet::dense_config {{
     static const unsigned n_in = {n_in};
     static const unsigned n_out = {n_out};
-    static const unsigned seq_len = {seq_len};
     static const unsigned io_type = nnet::{iotype};
     static const unsigned strategy = nnet::{strategy};
     static const unsigned reuse_factor = {reuse};
@@ -220,9 +219,6 @@ def format(self, node):
     static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
     typedef {exp_table_t.name} exp_table_t;
     typedef {inv_table_t.name} inv_table_t;
-    typedef {accum_t.name} accum_t;
-    static const unsigned inv_range = {inv_range};
-    static const unsigned exp_range = {exp_range};
 }};\n"""

 activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
diff --git a/hls4ml/backends/vivado/passes/transformer_templates.py b/hls4ml/backends/vivado/passes/transformer_templates.py
deleted file mode 100644
index 8f10a06f22..0000000000
--- a/hls4ml/backends/vivado/passes/transformer_templates.py
+++ /dev/null
@@ -1,149 +0,0 @@
-from hls4ml.backends.backend import get_backend
-from hls4ml.backends.template import FunctionCallTemplate, LayerConfigTemplate
-from hls4ml.model.layers import MultiHeadAttention
-
-# dense layer template
-mult_config_template = """struct config{index}_{mNum} : nnet::dense_config {{
-    static const unsigned n_in = {n_in};
-    static const unsigned n_out = {n_out};
-    static const unsigned strategy = nnet::{strategy};
-    static const unsigned reuse_factor = {reuse};
-    static const unsigned n_zeros = {nzeros};
-    static const unsigned n_nonzeros = {nonzeros};
-    static const bool store_weights_in_bram = false;
-    typedef {accum_t.name} accum_t;
-    typedef {attention_output_bias_t.name} bias_t;
-    typedef {attention_output_weight_t.name} weight_t;
-    typedef ap_{index_t} index_t;
-    template <class data_T, class res_T, class CONFIG_T>
-    using kernel = nnet::{dense_function}<data_T, res_T, CONFIG_T>;
-    template <class x_T, class y_T>
-    using product = nnet::product::{product_type}<x_T, y_T>;
-}};\n"""
-
-# activation template
-softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
-    static const unsigned n_in = {n_in};
-    static const unsigned table_size = {table_size};
-    static const unsigned io_type = nnet::{iotype};
-    static const unsigned reuse_factor = {reuse};
-    static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
-    typedef {table_t.name} exp_table_t;
-    typedef {table_t.name} inv_table_t;
-    typedef {accum_t.name} accum_t;
-    static const unsigned inv_range = {inv_range};
-    static const unsigned exp_range = {exp_range};
-}};\n"""
-
-
-mha_config_template = """struct config{index} : nnet::multiheadattention_config {{
-    typedef {accum_t.name} accum_t;
-    typedef {attention_output_bias_t.name} bias_t;
-    typedef {attention_output_weight_t.name} weight_t;
-    typedef {config_mult_t1} config_mult1;
-    typedef {config_mult_t2} config_mult2;
-    typedef {config_activ_t1} softmax_config1;
-
-    static const unsigned num_heads = {num_heads};
-    static const unsigned head_dim_key = {head_dim_key};
-    static const unsigned head_dim_value = {head_dim_value};
-    static const unsigned feature_dim = {feature_dim};
-    static const unsigned seq_len = {seq_len};
-
-    static const unsigned io_type = nnet::{iotype};
-    static const unsigned reuse_factor = {reuse};
-    static const bool store_weights_in_bram = false;
-}};\n"""
-
-
-mha_function_template = """nnet::multiheadattention<{input_t}, {output_t}, {config}>({input_q}, {input_kv},
-    {output}, {w_o}, {b_o}, {w_k}, {b_k}, {w_q}, {b_q}, {w_v}, {b_v});"""

-mha_include_list = ['nnet_utils/nnet_multiheadattention.h']
-
-
-class MhaConfigTemplate(LayerConfigTemplate):
-    def __init__(self):
-        super().__init__(MultiHeadAttention)
-        self.template = mha_config_template
-        self.mult1_template = mult_config_template
-        self.mult2_template = mult_config_template
-        self.activ1_template = softmax_config_template
-
-    def format(self, node):
-        params = self._default_config_params(node)
-        params['num_heads'] = node.get_attr('num_heads')
-        params['head_dim_key'] = node.get_attr('head_dim_key')
-        params['head_dim_value'] = node.get_attr('head_dim_value')
-        params['feature_dim'] = node.get_attr('feature_dim')
-        params['seq_len'] = node.get_attr('seq_len')
-        params['config_mult_t1'] = f'config{node.index}_1'
-        params['config_mult_t2'] = f'config{node.index}_2'
-        params['config_activ_t1'] = '{}_config{}'.format("softmax", node.index)
-        params['strategy'] = node.get_attr('strategy')
-        mha_config = self.template.format(**params)
-
-        mult_params1 = self._default_config_params(node)
-        mult_params1['strategy'] = 'latency'
-        mult_params1['mNum'] = '1'
-        mult_params1['n_in'] = node.get_attr('feature_dim')
-        mult_params1['n_out'] = node.get_attr('head_dim_key')
-        mult_params1['product_type'] = get_backend('vivado').product_type(
-            node.get_input_variable().type.precision, node.get_weights('query_weight').type.precision
-        )
-        mult_params1['reuse'] = params['reuse']
-        mult_params1['index'] = str(node.index)
-        mult_params1['nzeros'] = 0
-        mult_params1['nonzeros'] = params['feature_dim'] * params['num_heads'] * params['head_dim_key']
-        mult_params1['dense_function'] = 'DenseLatency'
-        mult_config1 = self.mult1_template.format(**mult_params1)
-
-        mult_params2 = self._default_config_params(node)
-        mult_params2['strategy'] = 'latency'
-        mult_params2['mNum'] = '2'
-        mult_params2['n_in'] = node.get_attr('head_dim_value') * 
node.get_attr('num_heads') - mult_params2['n_out'] = node.get_attr('feature_dim') - mult_params2['product_type'] = get_backend('vivado').product_type( - node.get_input_variable().type.precision, node.get_weights('attention_output_weight').type.precision - ) - mult_params2['reuse'] = params['reuse'] - mult_params2['index'] = str(node.index) - mult_params2['nzeros'] = 0 - mult_params2['nonzeros'] = params['feature_dim'] * params['num_heads'] * params['head_dim_key'] - mult_params2['dense_function'] = 'DenseLatency' - mult_config2 = self.mult2_template.format(**mult_params2) - - act_params = self._default_config_params(node) - act_params['n_in'] = node.get_attr('seq_len') - act_params['type'] = 'softmax' - act_params['implementation'] = 'legacy' # in MHA: latency,stable not work, legacy works - act_config = self.activ1_template.format(**act_params) - - return mult_config1 + '\n' + mult_config2 + '\n' + act_config + '\n' + mha_config - - -class MhaFunctionTemplate(FunctionCallTemplate): - def __init__(self): - super().__init__(MultiHeadAttention, include_header=mha_include_list) - self.template = mha_function_template - - def format(self, node): - params = {} - params.update(node.attributes) - params['config'] = f'config{node.index}' - params['input_t'] = node.get_input_variable().type.name - params['output_t'] = node.get_output_variable().type.name - - params['input_q'] = node.model.get_layer_output_variable(node.inputs[0]).name - params['input_kv'] = node.model.get_layer_output_variable(node.inputs[1]).name - params['output'] = node.get_output_variable().name - params['w_o'] = node.get_weights('attention_output_weight').name - params['b_o'] = node.get_weights('attention_output_bias').name - params['w_k'] = node.get_weights('key_weight').name - params['b_k'] = node.get_weights('key_bias').name - params['w_q'] = node.get_weights('query_weight').name - params['b_q'] = node.get_weights('query_bias').name - params['w_v'] = node.get_weights('value_weight').name - params['b_v'] = node.get_weights('value_bias').name - - return self.template.format(**params) diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py index de81d2c7a0..452e40ddbb 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -22,7 +22,6 @@ GarNetStack, Layer, LayerNormalization, - MultiHeadAttention, Pooling1D, Pooling2D, SeparableConv1D, @@ -544,17 +543,6 @@ def init_pooling2d(self, layer): @layer_optimizer(Softmax) def init_softmax(self, layer): - if 'exp_table_t' not in layer.attributes: - layer.set_attr('exp_table_t', layer.get_attr('table_t')) - if 'inv_table_t' not in layer.attributes: - layer.set_attr('inv_table_t', layer.get_attr('table_t')) - if 'accum_t' not in layer.attributes: - layer.set_attr('accum_t', FixedPrecisionType(width=18, integer=8)) - if 'inv_range' not in layer.attributes: - layer.set_attr('inv_range', 128) - if 'exp_range' not in layer.attributes: - layer.set_attr('exp_range', 8) - if layer.model.config.get_config_value('IOType') == 'io_parallel': assert ( len(layer.get_input_variable().shape) == 1 @@ -677,24 +665,3 @@ def init_garnet(self, layer): @layer_optimizer(GarNetStack) def init_garnet_stack(self, layer): self.init_garnet(layer) - - @layer_optimizer(MultiHeadAttention) - def init_mha(self, layer): - # TODO Allow getting recurrent reuse factor from the config - reuse_factor = layer.model.config.get_reuse_factor(layer) - layer.set_attr('reuse_factor', reuse_factor) - index_t = IntegerPrecisionType(width=1, 
signed=False) - layer.set_attr('index_t', index_t) - if 'table_t' not in layer.attributes: - layer.set_attr( - 'table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=24, integer=8)) - ) - if 'table_size' not in layer.attributes: - layer.set_attr('table_size', 2048) - if 'accum_t' not in layer.attributes: - layer.set_attr('accum_t', FixedPrecisionType(width=24, integer=8)) - if 'inv_range' not in layer.attributes: - layer.set_attr('inv_range', 128) - if 'exp_range' not in layer.attributes: - layer.set_attr('exp_range', 8) - layer.set_attr('strategy', 'resource') # latency diff --git a/hls4ml/converters/keras/multiheadattention.py b/hls4ml/converters/keras/multiheadattention.py deleted file mode 100644 index c295236561..0000000000 --- a/hls4ml/converters/keras/multiheadattention.py +++ /dev/null @@ -1,60 +0,0 @@ -from hls4ml.converters.keras_to_hls import get_weights_data, keras_handler, parse_default_keras_layer - - -@keras_handler('MultiHeadAttention') -def parse_mutiheadattention_layer(keras_layer, input_names, input_shapes, data_reader): - # assume input_shapes is: [[None, seq, dim]] - assert 'MultiHeadAttention' in keras_layer['class_name'] - assert input_shapes[0] == keras_layer['config']['query_shape'] - - layer = parse_default_keras_layer(keras_layer, input_names) - - layer['num_heads'] = keras_layer['config']['num_heads'] - layer['head_dim_key'] = keras_layer['config']['key_dim'] - layer['head_dim_value'] = keras_layer['config']['value_dim'] - layer['query_shape'] = keras_layer['config']['query_shape'] - layer['key_shape'] = keras_layer['config']['key_shape'] - layer['value_shape'] = keras_layer['config']['value_shape'] - layer['feature_dim'] = layer['query_shape'][-1] - layer['seq_len'] = layer['query_shape'][-2] - - if keras_layer['config']['output_shape']: - raise Exception('hls4ml does not support a defined output shape, the output shape must equal to the query shape') - else: - output_shape = layer['query_shape'] - - layer['attention_axes'] = ( - keras_layer['config']['attention_axes'] if (keras_layer['config']['attention_axes'][0] == 1) else False - ) - if layer['attention_axes'] is False: - raise Exception('assigning the attention_axes is not currently supported by hls4ml') - - if not (len(layer['query_shape']) == 3 and len(layer['key_shape']) == 3 and len(layer['value_shape']) == 3): - raise Exception('only 3D shapes for query, key, and value are currently supported by hls4ml') - - attn_scores_rank = 4 - layer['softmax_axis'] = list(range(attn_scores_rank - len(layer['attention_axes']), attn_scores_rank)) - - weights_sources = [ - ('attention_output', 'kernel'), - ('attention_output', 'bias'), - ('key', 'kernel'), - ('key', 'bias'), - ('query', 'kernel'), - ('query', 'bias'), - ('value', 'kernel'), - ('value', 'bias'), - ] - - for lname, wtype in weights_sources: - data = get_weights_data(data_reader, layer['name'], f'{lname}/{wtype}') - if wtype == 'kernel': - vtype = 'weight' - if lname in ['key', 'query', 'value']: - data = data.transpose((1, 0, 2)) - else: - vtype = 'bias' - - layer[f'{lname}_{vtype}_data'] = data - - return layer, output_shape diff --git a/hls4ml/converters/keras_to_hls.py b/hls4ml/converters/keras_to_hls.py index 50acb27e8f..e31e2b96a9 100644 --- a/hls4ml/converters/keras_to_hls.py +++ b/hls4ml/converters/keras_to_hls.py @@ -1,334 +1,330 @@ -import json - -import h5py - -from hls4ml.model import ModelGraph - -MAXMULT = 4096 - - -class KerasReader: - def get_weights_data(self, layer_name, var_name): - raise 
NotImplementedError - - -class KerasFileReader(KerasReader): - def __init__(self, config): - self.config = config - self.h5file = h5py.File(config['KerasH5'], mode='r') - - def __del__(self): - if self.h5file: - self.h5file.close() - - def _find_data(self, layer_name, var_name): - def h5_visitor_func(name): - if var_name in name: - return name - - if 'model_weights' in list(self.h5file.keys()): # h5 file comes from model.save() - layer_path = f'model_weights/{layer_name}' - else: - layer_path = layer_name - - data_path = self.h5file[layer_path].visit(h5_visitor_func) - if data_path: - return self.h5file[f'/{layer_path}/{data_path}'] - else: - return None - - def get_weights_data(self, layer_name, var_name): - data = self._find_data(layer_name, var_name) - if data: - return data[()] - else: - return None - - -class KerasNestedFileReader(KerasFileReader): - def __init__(self, data_reader, nested_path): - super().__init__(data_reader.config) - self.nested_path = nested_path - - def _find_data(self, layer_name, var_name): - def h5_visitor_func(name): - if var_name in name: - return name - - layer_path = f'model_weights/{self.nested_path}/{layer_name}' - - data_path = self.h5file[layer_path].visit(h5_visitor_func) - if data_path: - return self.h5file[f'/{layer_path}/{data_path}'] - else: - return None - - -class KerasModelReader(KerasReader): - def __init__(self, keras_model): - self.model = keras_model - - def get_weights_data(self, layer_name, var_name): - layer = self.model.get_layer(layer_name) - for i, w in enumerate(layer.weights): - if var_name in w.name: - try: - return w.numpy() # TF 2.x - except Exception: - return layer.get_weights()[i] # TF 1.x - - return None - - -def get_weights_data(data_reader, layer_name, var_name): - if not isinstance(var_name, (list, tuple)): - var_name = [var_name] - - data = [data_reader.get_weights_data(layer_name, var) for var in var_name] - - if len(data) == 1: - return data[0] - else: - return (*data,) - - -layer_handlers = {} - - -def register_keras_layer_handler(layer_cname, handler_func): - """Register a handler function for the given layer class name. - - The handler function should have the following signature: - parse_func(keras_layer, input_names, input_shapes, data_reader, config): - - Args: - layer_cname (str): The name of Keras layer (the 'class_name' property in the layer's config) - handler_func (callable): The handler function - - Raises: - Exception: If the layer class has already been registered. - """ - if layer_cname in layer_handlers: - raise Exception(f'Layer {layer_cname} already registered') - else: - layer_handlers[layer_cname] = handler_func - - -def get_supported_keras_layers(): - """Returns the list of Keras layers that the converter can parse. - - The returned list contains all Keras layers that can be parsed into the hls4ml internal representation. Support for - computation of these layers may vary across hls4ml backends and conversion configuration. - - Returns: - list: The names of supported Keras layers. 
- """ - return list(layer_handlers.keys()) - - -def keras_handler(*args): - def decorator(function): - function.handles = [arg for arg in args] - return function - - return decorator - - -def parse_default_keras_layer(keras_layer, input_names): - layer = {} - - # Extract name for finding weights and biases - layer['name'] = keras_layer['config']['name'] - layer['class_name'] = keras_layer['class_name'] - if input_names is not None: - layer['inputs'] = input_names - - layer['data_format'] = keras_layer['config'].get('data_format', 'channels_last') - - if 'activation' in keras_layer['config']: - layer['activation'] = keras_layer['config']['activation'] - if 'epsilon' in keras_layer['config']: - layer['epsilon'] = keras_layer['config']['epsilon'] - if 'use_bias' in keras_layer['config']: - layer['use_bias'] = keras_layer['config']['use_bias'] - - return layer - - -def get_model_arch(config): - if 'KerasModel' in config: - # Model instance passed in config from API - keras_model = config['KerasModel'] - if isinstance(keras_model, str): - from tensorflow.keras.models import load_model - - keras_model = load_model(keras_model) - model_arch = json.loads(keras_model.to_json()) - reader = KerasModelReader(keras_model) - elif 'KerasJson' in config: - # Extract model architecture from json - with open(config['KerasJson']) as json_file: - model_arch = json.load(json_file) - reader = KerasFileReader(config) - elif 'KerasH5' in config: - # Model arch and weights are in H5 file (from model.save() function) - with h5py.File(config['KerasH5'], mode='r') as h5file: - # Load the configuration from h5 using json's decode - model_arch = h5file.attrs.get('model_config') - if model_arch is None: - raise ValueError('No model found in config file.') - else: - # model_arch is string by default since h5py 3.0.0, keeping this condition for compatibility. 
- if isinstance(model_arch, bytes): - model_arch = model_arch.decode('utf-8') - model_arch = json.loads(model_arch) - reader = KerasFileReader(config) - else: - raise ValueError('No model found in config file.') - - return model_arch, reader - - -def parse_keras_model(model_arch, reader): - # This is a list of dictionaries to hold all the layer info we need to generate HLS - layer_list = [] - - # Define layers to skip for conversion to HLS - skip_layers = ['Dropout'] - # Activation layers - activation_layers = [ - 'Activation', - 'LeakyReLU', - 'ThresholdedReLU', - 'ELU', - 'PReLU', - 'Softmax', - 'TernaryTanh', - 'HardActivation', - 'UnaryLUT', - 'HGQ>UnaryLUT', - ] - # Recurrent layers - recurrent_layers = ['SimpleRNN', 'LSTM', 'GRU'] - # All supported layers - supported_layers = get_supported_keras_layers() + skip_layers - - # Map inputs of skipped and split (activation) layers - inputs_map = {} - - # Loop through layers - layer_counter = 0 - - input_layers = None - output_layers = None - - layer_config = None - if model_arch['class_name'] == 'Sequential': - print('Interpreting Sequential') - layer_config = model_arch['config'] - if 'layers' in layer_config: # Newer Keras versions have 'layers' in 'config' key - layer_config = layer_config['layers'] - # Sequential doesn't have InputLayer in TF < 2.3 (Keras 2.4.0) - if layer_config[0]['class_name'] != 'InputLayer': - input_layer = {} - input_layer['name'] = 'input1' - input_layer['class_name'] = 'InputLayer' - input_layer['input_shape'] = layer_config[0]['config']['batch_input_shape'][1:] - layer_list.append(input_layer) - print('Input shape:', input_layer['input_shape']) - elif model_arch['class_name'] in ['Model', 'Functional']: # TF >= 2.3 calls it 'Functional' API - print('Interpreting Model') - layer_config = model_arch['config']['layers'] - input_layers = [inp[0] for inp in model_arch['config']['input_layers']] - output_layers = [out[0] for out in model_arch['config']['output_layers']] - - # Get input shape and check for unsupported layer type - for keras_layer in layer_config: - if keras_layer['class_name'] not in supported_layers: - raise Exception('ERROR: Unsupported layer type: {}'.format(keras_layer['class_name'])) - - output_shapes = {} - output_shape = None - - print('Topology:') - for keras_layer in layer_config: - if 'batch_input_shape' in keras_layer['config']: - if 'inbound_nodes' in keras_layer and len(keras_layer['inbound_nodes']) > 0: - input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] - else: - input_shapes = [keras_layer['config']['batch_input_shape']] - else: - if 'inbound_nodes' in keras_layer: - input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] - else: - # Sequential model, so output_shape from the previous layer is still valid - input_shapes = [output_shape] - - keras_class = keras_layer['class_name'] - - if keras_class in skip_layers: - if 'inbound_nodes' in keras_layer: - name = keras_layer['config']['name'] - # Currently supported skipped layers have only one input - parent_input = keras_layer['inbound_nodes'][0][0][0] - # Skipped layers can follow each other (e.g., Dropout -> Flatten) - inputs_map[name] = inputs_map.get(parent_input, parent_input) - - output_shapes[keras_layer['config']['name']] = input_shapes[0] - - continue - - if keras_class in supported_layers: - layer_counter = layer_counter + 1 - - # Extract inbound nodes - if 'inbound_nodes' in keras_layer and len(keras_layer['inbound_nodes']) > 0: - 
input_names = [inputs_map.get(inp[0], inp[0]) for inp in keras_layer['inbound_nodes'][0]] - if keras_layer['inbound_nodes'][0][0][-1]: - # multi_head_attention has inbound: [[['input_3', 0, 0, {'value': ['dense_3', 0, 0]}]]] - inputname2 = list(keras_layer['inbound_nodes'][0][0][-1].values()) - input_names += [inp[0] for inp in inputname2] - else: - input_names = None - - layer, output_shape = layer_handlers[keras_class](keras_layer, input_names, input_shapes, reader) - - print( - 'Layer name: {}, layer type: {}, input shapes: {}, output shape: {}'.format( - layer['name'], layer['class_name'], input_shapes, output_shape - ) - ) - layer_list.append(layer) - if 'activation' in layer and layer['class_name'] not in activation_layers + recurrent_layers: # + qkeras_layers: - act_layer = {} - act_details = layer['activation'] - # Workaround for QKeras activations passed as an argument - if isinstance(act_details, dict): - act_layer['class_name'] = 'QActivation' - act_layer['config'] = { - 'name': layer['name'] + '_' + act_details['class_name'], - 'activation': act_details, - } - else: - act_layer['class_name'] = 'Activation' - act_layer['config'] = {'name': layer['name'] + '_' + act_details, 'activation': act_details} - act_layer, output_shape = layer_handlers[act_layer['class_name']](act_layer, None, [output_shape], reader) - inputs_map[layer['name']] = act_layer['name'] - if output_layers is not None and layer['name'] in output_layers: - output_layers = [act_layer['name'] if name == layer['name'] else name for name in output_layers] - output_shapes[act_layer['name']] = output_shape - layer_list.append(act_layer) - - assert output_shape is not None - - output_shapes[layer['name']] = output_shape - - return layer_list, input_layers, output_layers, output_shapes - - -def keras_to_hls(config): - model_arch, reader = get_model_arch(config) - layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader) - print('Creating HLS model') - hls_model = ModelGraph(config, layer_list, input_layers, output_layers) - return hls_model +import json + +import h5py + +from hls4ml.model import ModelGraph + +MAXMULT = 4096 + + +class KerasReader: + def get_weights_data(self, layer_name, var_name): + raise NotImplementedError + + +class KerasFileReader(KerasReader): + def __init__(self, config): + self.config = config + self.h5file = h5py.File(config['KerasH5'], mode='r') + + def __del__(self): + if self.h5file: + self.h5file.close() + + def _find_data(self, layer_name, var_name): + def h5_visitor_func(name): + if var_name in name: + return name + + if 'model_weights' in list(self.h5file.keys()): # h5 file comes from model.save() + layer_path = f'model_weights/{layer_name}' + else: + layer_path = layer_name + + data_path = self.h5file[layer_path].visit(h5_visitor_func) + if data_path: + return self.h5file[f'/{layer_path}/{data_path}'] + else: + return None + + def get_weights_data(self, layer_name, var_name): + data = self._find_data(layer_name, var_name) + if data: + return data[()] + else: + return None + + +class KerasNestedFileReader(KerasFileReader): + def __init__(self, data_reader, nested_path): + super().__init__(data_reader.config) + self.nested_path = nested_path + + def _find_data(self, layer_name, var_name): + def h5_visitor_func(name): + if var_name in name: + return name + + layer_path = f'model_weights/{self.nested_path}/{layer_name}' + + data_path = self.h5file[layer_path].visit(h5_visitor_func) + if data_path: + return self.h5file[f'/{layer_path}/{data_path}'] + else: + return 
None + + +class KerasModelReader(KerasReader): + def __init__(self, keras_model): + self.model = keras_model + + def get_weights_data(self, layer_name, var_name): + layer = self.model.get_layer(layer_name) + for i, w in enumerate(layer.weights): + if var_name in w.name: + try: + return w.numpy() # TF 2.x + except Exception: + return layer.get_weights()[i] # TF 1.x + + return None + + +def get_weights_data(data_reader, layer_name, var_name): + if not isinstance(var_name, (list, tuple)): + var_name = [var_name] + + data = [data_reader.get_weights_data(layer_name, var) for var in var_name] + + if len(data) == 1: + return data[0] + else: + return (*data,) + + +layer_handlers = {} + + +def register_keras_layer_handler(layer_cname, handler_func): + """Register a handler function for the given layer class name. + + The handler function should have the following signature: + parse_func(keras_layer, input_names, input_shapes, data_reader, config): + + Args: + layer_cname (str): The name of Keras layer (the 'class_name' property in the layer's config) + handler_func (callable): The handler function + + Raises: + Exception: If the layer class has already been registered. + """ + if layer_cname in layer_handlers: + raise Exception(f'Layer {layer_cname} already registered') + else: + layer_handlers[layer_cname] = handler_func + + +def get_supported_keras_layers(): + """Returns the list of Keras layers that the converter can parse. + + The returned list contains all Keras layers that can be parsed into the hls4ml internal representation. Support for + computation of these layers may vary across hls4ml backends and conversion configuration. + + Returns: + list: The names of supported Keras layers. + """ + return list(layer_handlers.keys()) + + +def keras_handler(*args): + def decorator(function): + function.handles = [arg for arg in args] + return function + + return decorator + + +def parse_default_keras_layer(keras_layer, input_names): + layer = {} + + # Extract name for finding weights and biases + layer['name'] = keras_layer['config']['name'] + layer['class_name'] = keras_layer['class_name'] + if input_names is not None: + layer['inputs'] = input_names + + layer['data_format'] = keras_layer['config'].get('data_format', 'channels_last') + + if 'activation' in keras_layer['config']: + layer['activation'] = keras_layer['config']['activation'] + if 'epsilon' in keras_layer['config']: + layer['epsilon'] = keras_layer['config']['epsilon'] + if 'use_bias' in keras_layer['config']: + layer['use_bias'] = keras_layer['config']['use_bias'] + + return layer + + +def get_model_arch(config): + if 'KerasModel' in config: + # Model instance passed in config from API + keras_model = config['KerasModel'] + if isinstance(keras_model, str): + from tensorflow.keras.models import load_model + + keras_model = load_model(keras_model) + model_arch = json.loads(keras_model.to_json()) + reader = KerasModelReader(keras_model) + elif 'KerasJson' in config: + # Extract model architecture from json + with open(config['KerasJson']) as json_file: + model_arch = json.load(json_file) + reader = KerasFileReader(config) + elif 'KerasH5' in config: + # Model arch and weights are in H5 file (from model.save() function) + with h5py.File(config['KerasH5'], mode='r') as h5file: + # Load the configuration from h5 using json's decode + model_arch = h5file.attrs.get('model_config') + if model_arch is None: + raise ValueError('No model found in config file.') + else: + # model_arch is string by default since h5py 3.0.0, keeping this condition 
for compatibility. + if isinstance(model_arch, bytes): + model_arch = model_arch.decode('utf-8') + model_arch = json.loads(model_arch) + reader = KerasFileReader(config) + else: + raise ValueError('No model found in config file.') + + return model_arch, reader + + +def parse_keras_model(model_arch, reader): + # This is a list of dictionaries to hold all the layer info we need to generate HLS + layer_list = [] + + # Define layers to skip for conversion to HLS + skip_layers = ['Dropout'] + # Activation layers + activation_layers = [ + 'Activation', + 'LeakyReLU', + 'ThresholdedReLU', + 'ELU', + 'PReLU', + 'Softmax', + 'TernaryTanh', + 'HardActivation', + 'UnaryLUT', + 'HGQ>UnaryLUT', + ] + # Recurrent layers + recurrent_layers = ['SimpleRNN', 'LSTM', 'GRU'] + # All supported layers + supported_layers = get_supported_keras_layers() + skip_layers + + # Map inputs of skipped and split (activation) layers + inputs_map = {} + + # Loop through layers + layer_counter = 0 + + input_layers = None + output_layers = None + + layer_config = None + if model_arch['class_name'] == 'Sequential': + print('Interpreting Sequential') + layer_config = model_arch['config'] + if 'layers' in layer_config: # Newer Keras versions have 'layers' in 'config' key + layer_config = layer_config['layers'] + # Sequential doesn't have InputLayer in TF < 2.3 (Keras 2.4.0) + if layer_config[0]['class_name'] != 'InputLayer': + input_layer = {} + input_layer['name'] = 'input1' + input_layer['class_name'] = 'InputLayer' + input_layer['input_shape'] = layer_config[0]['config']['batch_input_shape'][1:] + layer_list.append(input_layer) + print('Input shape:', input_layer['input_shape']) + elif model_arch['class_name'] in ['Model', 'Functional']: # TF >= 2.3 calls it 'Functional' API + print('Interpreting Model') + layer_config = model_arch['config']['layers'] + input_layers = [inp[0] for inp in model_arch['config']['input_layers']] + output_layers = [out[0] for out in model_arch['config']['output_layers']] + + # Get input shape and check for unsupported layer type + for keras_layer in layer_config: + if keras_layer['class_name'] not in supported_layers: + raise Exception('ERROR: Unsupported layer type: {}'.format(keras_layer['class_name'])) + + output_shapes = {} + output_shape = None + + print('Topology:') + for keras_layer in layer_config: + if 'batch_input_shape' in keras_layer['config']: + if 'inbound_nodes' in keras_layer and len(keras_layer['inbound_nodes']) > 0: + input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] + else: + input_shapes = [keras_layer['config']['batch_input_shape']] + else: + if 'inbound_nodes' in keras_layer: + input_shapes = [output_shapes[inbound_node[0]] for inbound_node in keras_layer['inbound_nodes'][0]] + else: + # Sequential model, so output_shape from the previous layer is still valid + input_shapes = [output_shape] + + keras_class = keras_layer['class_name'] + + if keras_class in skip_layers: + if 'inbound_nodes' in keras_layer: + name = keras_layer['config']['name'] + # Currently supported skipped layers have only one input + parent_input = keras_layer['inbound_nodes'][0][0][0] + # Skipped layers can follow each other (e.g., Dropout -> Flatten) + inputs_map[name] = inputs_map.get(parent_input, parent_input) + + output_shapes[keras_layer['config']['name']] = input_shapes[0] + + continue + + if keras_class in supported_layers: + layer_counter = layer_counter + 1 + + # Extract inbound nodes + if 'inbound_nodes' in keras_layer and 
len(keras_layer['inbound_nodes']) > 0: + input_names = [inputs_map.get(inp[0], inp[0]) for inp in keras_layer['inbound_nodes'][0]] + else: + input_names = None + + layer, output_shape = layer_handlers[keras_class](keras_layer, input_names, input_shapes, reader) + + print( + 'Layer name: {}, layer type: {}, input shapes: {}, output shape: {}'.format( + layer['name'], layer['class_name'], input_shapes, output_shape + ) + ) + layer_list.append(layer) + if 'activation' in layer and layer['class_name'] not in activation_layers + recurrent_layers: # + qkeras_layers: + act_layer = {} + act_details = layer['activation'] + # Workaround for QKeras activations passed as an argument + if isinstance(act_details, dict): + act_layer['class_name'] = 'QActivation' + act_layer['config'] = { + 'name': layer['name'] + '_' + act_details['class_name'], + 'activation': act_details, + } + else: + act_layer['class_name'] = 'Activation' + act_layer['config'] = {'name': layer['name'] + '_' + act_details, 'activation': act_details} + act_layer, output_shape = layer_handlers[act_layer['class_name']](act_layer, None, [output_shape], reader) + inputs_map[layer['name']] = act_layer['name'] + if output_layers is not None and layer['name'] in output_layers: + output_layers = [act_layer['name'] if name == layer['name'] else name for name in output_layers] + output_shapes[act_layer['name']] = output_shape + layer_list.append(act_layer) + + assert output_shape is not None + + output_shapes[layer['name']] = output_shape + + return layer_list, input_layers, output_layers, output_shapes + + +def keras_to_hls(config): + model_arch, reader = get_model_arch(config) + layer_list, input_layers, output_layers, _ = parse_keras_model(model_arch, reader) + print('Creating HLS model') + hls_model = ModelGraph(config, layer_list, input_layers, output_layers) + return hls_model diff --git a/hls4ml/converters/pytorch/core.py b/hls4ml/converters/pytorch/core.py index e4cb00a310..e4d99fe286 100644 --- a/hls4ml/converters/pytorch/core.py +++ b/hls4ml/converters/pytorch/core.py @@ -31,10 +31,6 @@ def parse_linear_layer(operation, layer_name, input_names, input_shapes, node, c output_shape = input_shapes[0][:] output_shape[-1] = layer['n_out'] - if len(input_shapes[0]) == 3: - layer['seq_len'] = output_shape[-1] - else: - layer['seq_len'] = 1 return layer, output_shape diff --git a/hls4ml/converters/pytorch/multiheadattention.py b/hls4ml/converters/pytorch/multiheadattention.py deleted file mode 100644 index 7c53aeeb54..0000000000 --- a/hls4ml/converters/pytorch/multiheadattention.py +++ /dev/null @@ -1,54 +0,0 @@ -import numpy as np - -from hls4ml.converters.pytorch_to_hls import pytorch_handler - - -@pytorch_handler('MultiheadAttention') -def parse_multiheadattention_layer( - operation, layer_name, input_names, input_shapes, node, class_object, data_reader, config -): - assert 'MultiheadAttention' in operation - assert len(input_shapes) == 3 - - layer = {} - - layer['class_name'] = 'MultiHeadAttention' - layer['name'] = layer_name - layer['inputs'] = input_names - - layer['num_heads'] = class_object.num_heads - layer['head_dim_key'] = class_object.kdim // layer['num_heads'] - layer['head_dim_value'] = class_object.vdim // layer['num_heads'] - layer['query_shape'] = input_shapes[0] - layer['key_shape'] = input_shapes[1] - layer['value_shape'] = input_shapes[2] - - if not (len(layer['query_shape']) == len(layer['key_shape']) == len(layer['value_shape']) == 3): - raise Exception('only 3D shapes for query, key, and value are currently supported by 
hls4ml') - - layer['feature_dim'] = class_object.embed_dim - layer['seq_len'] = layer['query_shape'][-2] - - output_shape = layer['query_shape'] - - layer['attention_axes'] = [1] - layer['softmax_axis'] = [3] - - in_proj_weights = class_object.in_proj_weight.data.numpy() - in_proj_bias = class_object.in_proj_bias.data.numpy() - - weight_data = np.split(in_proj_weights, [class_object.embed_dim, class_object.embed_dim + class_object.kdim], axis=0) - bias_data = np.split(in_proj_bias, [class_object.embed_dim, class_object.embed_dim + class_object.kdim], axis=0) - - for weight_type, weight, bias in zip(['query', 'key', 'value'], weight_data, bias_data): - layer[f'{weight_type}_weight_data'] = weight.T.reshape( - layer['feature_dim'], layer['num_heads'], layer['head_dim_key'] - ).transpose(1, 0, 2) - layer[f'{weight_type}_bias_data'] = bias.reshape(layer['num_heads'], layer['head_dim_key']) - - layer['attention_output_weight_data'] = class_object.out_proj.weight.data.numpy().T.reshape( - layer['num_heads'], layer['head_dim_key'], layer['feature_dim'] - ) - layer['attention_output_bias_data'] = class_object.out_proj.bias.data.numpy() - - return layer, output_shape diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py index 8006586d23..8a0e19553f 100644 --- a/hls4ml/model/layers.py +++ b/hls4ml/model/layers.py @@ -1586,54 +1586,6 @@ def initialize(self): self.add_output_variable([len(self.get_attr('expression'))], [f'N_OUTPUTS_{self.index}'], var_name='y') -class MultiHeadAttention(Layer): - _expected_attributes = [ - Attribute('num_heads'), - Attribute('head_dim_key'), - Attribute('head_dim_value'), - Attribute('feature_dim'), - Attribute('seq_len'), - WeightAttribute('attention_output_weight'), - WeightAttribute('attention_output_bias'), - WeightAttribute('key_weight'), - WeightAttribute('key_bias'), - WeightAttribute('query_weight'), - WeightAttribute('query_bias'), - WeightAttribute('value_weight'), - WeightAttribute('value_bias'), - TypeAttribute('attention_output_weight'), - TypeAttribute('attention_output_bias'), - TypeAttribute('key_weight'), - TypeAttribute('key_bias'), - TypeAttribute('query_weight'), - TypeAttribute('query_bias'), - TypeAttribute('value_weight'), - TypeAttribute('value_bias'), - ] - - def initialize(self): - weights = [ - 'attention_output_weight', - 'attention_output_bias', - 'key_weight', - 'key_bias', - 'query_weight', - 'query_bias', - 'value_weight', - 'value_bias', - ] - - for w in weights: - data_name = f'{w}_data' - var_name = f'{w}{{index}}' - data = self.get_attr(data_name) - self.add_weights_variable(name=w, var_name=var_name, data=data) - - shape = self.attributes['query_shape'][1:] - dims = [f'seq_out_{self.index}', f'feature_out_{self.index}'] - self.add_output_variable(shape, dims) - - layer_map = { 'Input': Input, 'InputLayer': Input, @@ -1700,7 +1652,6 @@ def initialize(self): 'BatchNormOnnx': BatchNormOnnx, 'LayerGroup': LayerGroup, 'SymbolicExpression': SymbolicExpression, - 'MultiHeadAttention': MultiHeadAttention, 'LayerNormalization': LayerNormalization, # TensorFlow-specific layers: 'BiasAdd': BiasAdd, diff --git a/hls4ml/model/optimizer/passes/infer_precision.py b/hls4ml/model/optimizer/passes/infer_precision.py index a729667776..af97b4ccdd 100644 --- a/hls4ml/model/optimizer/passes/infer_precision.py +++ b/hls4ml/model/optimizer/passes/infer_precision.py @@ -87,9 +87,6 @@ def _infer_precision(self, node, types_to_infer): if node_class in ['ParametrizedActivation']: return self._infer_par_act_precision(node, types_to_infer) - if 
node_class in ['MultiHeadAttention']:
-            return self._infer_mha_precision(node, types_to_infer)
-
         # What about quantized activation layer? Setting it to 'auto' manually will break it here. We should prevent
         # this in config_from_* functions

@@ -576,58 +573,3 @@ def _infer_par_act_precision(self, node, types_to_infer):
             inferred_types.append('param_t')

         return inferred_types
-
-    def _infer_mha_precision(self, node, types_to_infer):
-        inferred_types = []
-
-        for weightvar in (
-            'attention_output_weight',
-            'attention_output_bias',
-            'key_weight',
-            'key_bias',
-            'query_weight',
-            'query_bias',
-            'value_weight',
-            'value_bias',
-        ):
-            if f'{weightvar}_t' in types_to_infer:
-                self._infer_default_type(node, f'{weightvar}_t')
-                node.weights[weightvar].update_precision(node.types[f'{weightvar}_t'].precision)
-                inferred_types.append(f'{weightvar}_t')
-
-        if 'result_t' in types_to_infer:
-            input_precision = node.get_input_variable().type.precision
-            weight_precision = node.types['attention_output_weight_t'].precision
-            bias_precision = node.types['attention_output_bias_t'].precision
-
-            if self._all_supported_types((input_precision, weight_precision, bias_precision)):
-
-                after_weight_width = input_precision.width + weight_precision.width
-                after_weight_integer = input_precision.integer + weight_precision.integer
-                after_weight_signed = input_precision.signed or weight_precision.signed
-
-                out_signed = after_weight_signed or bias_precision.signed
-                out_integer = (
-                    max(
-                        after_weight_integer + (bias_precision.signed and not after_weight_signed),
-                        bias_precision.integer + (after_weight_signed and not bias_precision.signed),
-                    )
-                    + 1
-                )
-                out_width = out_integer + max(after_weight_width - after_weight_integer, bias_precision.fractional)
-
-                # Apply max precision constraints if specified in model config
-                max_precision = self._get_maximum_precision(node)
-                if max_precision is not None:
-                    out_width = min(out_width, max_precision.width)
-                    out_integer = min(out_integer, max_precision.integer)
-
-                out_precision = FixedPrecisionType(out_width, out_integer, out_signed)
-            else:
-                out_precision = self._get_default_precision(node)
-
-            node.types['result_t'].name = f'{node.name}_result_t'
-            node.types['result_t'].precision = out_precision
-            inferred_types.append('result_t')
-
-        return inferred_types
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h
index d1f523fba7..4683239d85 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_activation.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_activation.h
@@ -161,7 +161,7 @@ void init_invert_table(typename CONFIG_T::inv_table_t table_out[CONFIG_T::table_size])
     // The template data_T is the data type used to address the table
     for (unsigned i = 0; i < CONFIG_T::table_size; i++) {
         float x = softmax_real_val_from_idx<data_T, CONFIG_T>(i);
-        typename CONFIG_T::inv_table_t inv_x = 1.0 / x;
+        typename CONFIG_T::inv_table_t inv_x = 1 / x;
         table_out[i] = inv_x;
     }
 }
@@ -269,24 +269,24 @@

-template <typename CONFIG_T, int N_TABLE> void init_exp_table_legacy(typename CONFIG_T::exp_table_t table_out[N_TABLE]) {
-    float exp_range = (float)CONFIG_T::exp_range;
+template <typename CONFIG_T, int N_TABLE> void init_exp_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) {
     for (int ii = 0; ii < N_TABLE; ii++) {
         // First, convert from table index to X-value (signed 8-bit, range -8 to +8)
         float in_val = 2 * 8.0 * (ii - float(N_TABLE) / 2.0) / float(N_TABLE);
         // Next, compute lookup table function
-        typename CONFIG_T::exp_table_t real_val = exp_fcn_float(in_val);
+        typename CONFIG_T::table_t real_val = exp_fcn_float(in_val);
         // std::cout << "Lookup table In Value: " << in_val << " Result: " << real_val << std::endl;
         table_out[ii] = real_val;
     }
 }

-template <typename CONFIG_T, int N_TABLE> void init_invert_table_legacy(typename CONFIG_T::inv_table_t table_out[N_TABLE]) {
-    float inv_range = (float)CONFIG_T::inv_range;
+template <typename CONFIG_T, int N_TABLE> void init_invert_table_legacy(typename CONFIG_T::table_t table_out[N_TABLE]) {
     // Inversion function:
     //   result = 1/x
     for (int ii = 0; ii < N_TABLE; ii++) {
-        float in_val = inv_range * ii / float(N_TABLE);
+        // First, convert from table index to X-value (signed 8-bit, range 0 to +64)
+        float in_val = 64.0 * ii / float(N_TABLE);
+        // Next, compute lookup table function
         if (in_val > 0.0)
             table_out[ii] = 1.0 / in_val;
         else
@@ -296,18 +296,15 @@ template <typename CONFIG_T, int N_TABLE> void init_invert_table_legacy(typename

 template <class data_T, class res_T, typename CONFIG_T>
 void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
-    #pragma HLS pipeline
-    int exp_range = CONFIG_T::exp_range;
-    int inv_range = CONFIG_T::inv_range;
     // Initialize the lookup table
#ifdef __HLS_SYN__
     bool initialized = false;
-    typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size];
-    typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size];
+    typename CONFIG_T::table_t exp_table[CONFIG_T::table_size];
+    typename CONFIG_T::table_t invert_table[CONFIG_T::table_size];
#else
     static bool initialized = false;
-    static typename CONFIG_T::exp_table_t exp_table[CONFIG_T::table_size];
-    static typename CONFIG_T::inv_table_t invert_table[CONFIG_T::table_size];
+    static typename CONFIG_T::table_t exp_table[CONFIG_T::table_size];
+    static typename CONFIG_T::table_t invert_table[CONFIG_T::table_size];
#endif
     if (!initialized) {
         init_exp_table_legacy<CONFIG_T, CONFIG_T::table_size>(exp_table);
@@ -315,41 +312,45 @@ void softmax_legacy(data_T data[CONFIG_T::n_in], res_T res[CONFIG_T::n_in]) {
         initialized = true;
     }

+    #pragma HLS PIPELINE
+
     // Index into the lookup table based on data for exponentials
-    typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision
-    typename CONFIG_T::exp_table_t exp_diff_res;            // different, independent, fixed point precision
-    typename CONFIG_T::exp_table_t data_cache[CONFIG_T::n_in];
+    typename CONFIG_T::table_t exp_res[CONFIG_T::n_in]; // different, independent, fixed point precision
+    typename CONFIG_T::table_t exp_diff_res;            // different, independent, fixed point precision
+    data_T data_cache[CONFIG_T::n_in];
     int data_round;
     int index;
-
-    #pragma HLS array_partition variable=data_cache complete
-
-    typename CONFIG_T::accum_t denominator;
-    typename CONFIG_T::inv_table_t deno_inver;
-
-    denominator = 0;
     for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
-        data_round = data[ii] * (CONFIG_T::table_size / (exp_range * 2));
-        index = data_round + exp_range * (CONFIG_T::table_size / (exp_range * 2));
-        if (index < 0)
-            index = 0;
-        if (index > CONFIG_T::table_size - 1)
-            index = CONFIG_T::table_size - 1;
-        denominator += exp_table[index];
-        data_cache[ii] = exp_table[index];
+        data_cache[ii] = data[ii];
+        exp_res[ii] = 0;
     }

-    // using lookup table for inverse
-    int exp_res_index = denominator * (CONFIG_T::table_size / inv_range);
-
-    if (exp_res_index < 0)
-        exp_res_index = 0;
-    if (exp_res_index > CONFIG_T::table_size - 1)
-        exp_res_index = CONFIG_T::table_size - 1;
-    deno_inver = invert_table[exp_res_index];
+    for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
+        for (int jj = 0; jj < CONFIG_T::n_in; jj++) {
+            if (ii == jj)
+                exp_diff_res = 1;
+            else {
+                data_round = (data_cache[jj] - data_cache[ii]) * CONFIG_T::table_size / 16;
+                index = data_round + 8 * CONFIG_T::table_size / 16;
+                if (index < 0)
+                    index = 0;
+                if (index > CONFIG_T::table_size - 1)
+                    index = CONFIG_T::table_size - 1;
+                exp_diff_res = exp_table[index];
+            }
+            exp_res[ii] += exp_diff_res;
+        }
+    }

+    // Second loop to invert
     for (int ii = 0; ii < CONFIG_T::n_in; ii++) {
-        res[ii] = (res_T)(data_cache[ii] * deno_inver);
+        int exp_res_index = exp_res[ii] * CONFIG_T::table_size / 64;
+        if (exp_res_index < 0)
+            exp_res_index = 0;
+        if (exp_res_index > CONFIG_T::table_size - 1)
+            exp_res_index = CONFIG_T::table_size - 1;
+        // typename CONFIG_T::table_t exp_res_invert = invert_table[exp_res_index];
+        res[ii] = (res_T)invert_table[exp_res_index];
     }
 }
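The softmax_legacy path restored above never forms the numerator explicitly: for each output index i it accumulates exp(x_j - x_i) over all j through the exponent table (the ii == jj diagonal contributing exp(0) = 1) and sends the sum through the inversion table, using the identity softmax_i = exp(x_i) / sum_j exp(x_j) = 1 / sum_j exp(x_j - x_i). A NumPy check of that identity, ignoring the fixed-point table indexing and clamping:

    import numpy as np

    def softmax_legacy_reference(x):
        # denom[i] = sum_j exp(x[j] - x[i]); the diagonal term is exp(0) = 1
        denom = np.exp(x[None, :] - x[:, None]).sum(axis=1)
        return 1.0 / denom

    x = np.array([0.5, -1.0, 2.0])
    assert np.allclose(softmax_legacy_reference(x), np.exp(x) / np.exp(x).sum())

Working with the differences x_j - x_i also keeps the table inputs centered around zero, which is why the code can use a fixed [-8, 8) exponent range without a separate max-subtraction pass.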
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h b/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
deleted file mode 100644
index 3543b0d007..0000000000
--- a/hls4ml/templates/vivado/nnet_utils/nnet_multiheadattention.h
+++ /dev/null
@@ -1,324 +0,0 @@
-#ifndef NNET_MHT_H_
-#define NNET_MHT_H_
-
-#include "hls_stream.h"
-#include "nnet_activation.h"
-#include "nnet_common.h"
-#include "nnet_dense.h"
-#include "nnet_mult.h"
-#include <iostream>
-#include <math.h>
-
-namespace nnet {
-
-struct multiheadattention_config {
-    // Internal data type definitions
-    typedef float bias_t;
-    typedef float weight_t;
-    typedef float accum_t;
-    typedef ap_fixed<16, 8> multi_t;
-
-    // Layer Sizes
-    static const unsigned num_heads = 10;
-    static const unsigned head_dim_key = 10;
-    static const unsigned head_dim_value = 10;
-    static const unsigned feature_dim = 20;
-    static const unsigned seq_len = 500;
-
-    // Resource reuse info
-    static const unsigned io_type = io_parallel;
-    static const unsigned strategy = latency;
-    static const unsigned reuse_factor = 1;
-    static const bool store_weights_in_bram = false;
-
-    template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>;
-};
-
-template <class data_T, int PackSize> struct datapack { data_T data[PackSize]; };
-
-template <class data_T, int size> void read_stream_array(hls::stream<data_T> data_in[size], data_T out[size]) {
-    for (int k = 0; k < size; ++k) {
-        #pragma HLS UNROLL
-        out[k] = data_in[k].read();
-    }
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void matrixmul_transpose(hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> &Q,
-                         hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> &K,
-                         res_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len]) // seq_Q, seq_K
-{
-    const data_T dk = 1.0 / sqrt(CONFIG_T::head_dim_key);
-    data_T QK_1;
-    typename CONFIG_T::accum_t QKij;
-    data_T Qi[CONFIG_T::head_dim_key];
-    data_T Product[CONFIG_T::seq_len]; // seq_Q, seq_K
-    res_T qk_smout[CONFIG_T::seq_len];
-    data_T krow[CONFIG_T::seq_len * CONFIG_T::head_dim_key];
-    #pragma HLS ARRAY_PARTITION variable=Qi complete
-    #pragma HLS ARRAY_PARTITION variable=Product complete
-    #pragma HLS ARRAY_PARTITION variable=qk_smout complete
-    #pragma HLS ARRAY_PARTITION variable=QK complete dim=2
-    #pragma HLS ARRAY_PARTITION variable=krow complete
-
-    datapack<data_T, CONFIG_T::head_dim_key> datak_pack, dataq_pack;
-    #pragma HLS DATA_PACK variable=Q
-    #pragma HLS DATA_PACK variable=K
-    #pragma HLS DATA_PACK variable=datak_pack
-    #pragma HLS DATA_PACK variable=dataq_pack
-
-    // int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_key) / float(CONFIG_T::reuse_factor));
-    // CONFIG_T::template product<data_T, data_T>::limit(multiplier_limit);
-
-prep_k:
-    for (int i = 0; i < CONFIG_T::seq_len; ++i) {
-        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-        datak_pack = K.read();
-        for (int j = 0; j < CONFIG_T::head_dim_key; ++j) {
-            #pragma HLS UNROLL
-            krow[i * CONFIG_T::head_dim_key + j] = datak_pack.data[j];
-        }
-    }
-
-row:
-    for (int i = 0; i < CONFIG_T::seq_len; ++i) {
-        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-        dataq_pack = Q.read();
-
-    q:
-        for (int q_i = 0; q_i < CONFIG_T::head_dim_key; ++q_i) {
-            #pragma HLS UNROLL
-            Qi[q_i] = dataq_pack.data[q_i];
-        }
-    col:
-        for (int j = 0; j < CONFIG_T::seq_len; ++j) {
-            QKij = 0;
-        product:
-            for (int k = 0; k < CONFIG_T::head_dim_key; ++k) {
-                QK_1 = CONFIG_T::template product<data_T, data_T>::product(Qi[k], krow[j * CONFIG_T::head_dim_key + k]);
-                QKij += QK_1;
-            }
-            Product[j] = QKij * dk;
-        }
-        softmax<data_T, res_T, typename CONFIG_T::softmax_config1>(Product, qk_smout);
-        for (int n = 0; n < CONFIG_T::seq_len; ++n) {
-            #pragma HLS UNROLL
-            QK[i][n] = qk_smout[n];
-        }
-    }
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void matrixmul(data_T QK[CONFIG_T::seq_len][CONFIG_T::seq_len], hls::stream<datapack<data_T, CONFIG_T::head_dim_value>> &V,
-               hls::stream<res_T> S[CONFIG_T::head_dim_value]) // S: attention score
-{
-    #pragma HLS DATA_PACK variable=V
-    #pragma HLS ARRAY_PARTITION variable=QK complete dim=2
-    #pragma HLS ARRAY_PARTITION variable=S complete dim=1
-
-    datapack<data_T, CONFIG_T::head_dim_value> datav_pack;
-    #pragma HLS DATA_PACK variable=datav_pack
-
-    // int multiplier_limit = ceil(float(CONFIG_T::seq_len * CONFIG_T::head_dim_value) / float(CONFIG_T::reuse_factor));
-    // CONFIG_T::template product<data_T, data_T>::limit(multiplier_limit);
-
-    data_T dataV[CONFIG_T::seq_len * CONFIG_T::head_dim_value];
-    #pragma HLS ARRAY_PARTITION variable = dataV complete dim = 1
-
-    for (int j = 0; j < CONFIG_T::seq_len; ++j) {
-        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-        datav_pack = V.read();
-        for (int i = 0; i < CONFIG_T::head_dim_value; ++i) {
-            #pragma HLS UNROLL
-            dataV[CONFIG_T::seq_len * i + j] = datav_pack.data[i];
-        }
-    }
-
-    data_T Sij, S_1;
-    data_T QKi[CONFIG_T::seq_len];
-#pragma HLS ARRAY_Partition variable=QKi complete
-row:
-    for (int i = 0; i < CONFIG_T::seq_len; ++i) {
-        #pragma HLS PIPELINE II=CONFIG_T::reuse_factor
-    qk:
-        for (int q_i = 0; q_i < CONFIG_T::seq_len; ++q_i) {
-            #pragma HLS UNROLL
-            QKi[q_i] = QK[i][q_i];
-        }
-    col:
-        for (int j = 0; j < CONFIG_T::head_dim_value; ++j) {
-            Sij = 0;
-        product:
-            for (int k = 0; k < CONFIG_T::seq_len; ++k) {
-                S_1 = CONFIG_T::template product<data_T, data_T>::product(QKi[k], dataV[j * CONFIG_T::seq_len + k]);
-                Sij += S_1;
-            }
-            S[j].write(Sij);
-        }
-    }
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void lin_projection(hls::stream<data_T> data_q[CONFIG_T::feature_dim], hls::stream<data_T> data_vk[CONFIG_T::feature_dim],
-                    hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> &k_proj,
-                    hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> &q_proj,
-                    hls::stream<datapack<data_T, CONFIG_T::head_dim_value>> &v_proj,
-                    typename CONFIG_T::weight_t key_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key],
-                    typename CONFIG_T::bias_t key_bias[CONFIG_T::head_dim_key],
-                    typename CONFIG_T::weight_t query_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_key],
-                    typename CONFIG_T::bias_t query_bias[CONFIG_T::head_dim_key],
-                    typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::head_dim_value],
-                    typename CONFIG_T::bias_t value_bias[CONFIG_T::head_dim_value]) {
-    #pragma HLS DATA_PACK variable=k_proj
-    #pragma HLS DATA_PACK variable=q_proj
-    #pragma HLS DATA_PACK variable=v_proj
-
-    #pragma HLS ARRAY_PARTITION variable=data_q complete dim=1
-    #pragma HLS ARRAY_PARTITION variable=data_vk complete dim=1
-
-k_h:
-    for (int j = 0; j < CONFIG_T::seq_len; ++j) {
-        #pragma HLS PIPELINE
-
-        data_T proj_k[CONFIG_T::head_dim_key];
-        data_T proj_q[CONFIG_T::head_dim_key];
-        data_T proj_v[CONFIG_T::head_dim_value];
-        data_T in_q[CONFIG_T::feature_dim];
-        data_T in_v[CONFIG_T::feature_dim];
-        #pragma HLS ARRAY_PARTITION variable=proj_k complete dim=1
-        #pragma HLS ARRAY_PARTITION variable=proj_q complete dim=1
-        #pragma HLS ARRAY_PARTITION variable=proj_v complete dim=1
-        #pragma HLS ARRAY_PARTITION variable=in_q complete dim=1
-        #pragma HLS ARRAY_PARTITION variable=in_v complete dim=1
-
-        datapack<data_T, CONFIG_T::head_dim_key> proj_k_pack;
-        datapack<data_T, CONFIG_T::head_dim_key> proj_q_pack;
-        datapack<data_T, CONFIG_T::head_dim_value> proj_v_pack;
-        #pragma HLS DATA_PACK variable=proj_k_pack
-        #pragma HLS DATA_PACK variable=proj_q_pack
-        #pragma HLS DATA_PACK variable=proj_v_pack
-
-        read_stream_array<data_T, CONFIG_T::feature_dim>(data_q, in_q);
-        read_stream_array<data_T, CONFIG_T::feature_dim>(data_vk, in_v);
-
-        dense<data_T, data_T, typename CONFIG_T::config_mult1>(in_v, proj_k_pack.data, key_weight, key_bias);
-        dense<data_T, data_T, typename CONFIG_T::config_mult1>(in_q, proj_q_pack.data, query_weight, query_bias);
-        dense<data_T, data_T, typename CONFIG_T::config_mult1>(in_v, proj_v_pack.data, value_weight, value_bias);
-
-        k_proj.write(proj_k_pack);
-        q_proj.write(proj_q_pack);
-        v_proj.write(proj_v_pack);
-    }
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void dense_out(hls::stream<data_T> data_in[CONFIG_T::num_heads][CONFIG_T::head_dim_value],
-               res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim],
-               typename CONFIG_T::weight_t
-                   attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value * CONFIG_T::feature_dim],
-               typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim]) {
-    data_T mat_res_con[CONFIG_T::num_heads * CONFIG_T::head_dim_value];
-    res_T dense_out[CONFIG_T::feature_dim];
-#pragma HLS ARRAY_PARTITION variable=mat_res_con complete dim=1
-#pragma HLS ARRAY_PARTITION variable=dense_out complete dim=1
-output_dense:
-    for (int k = 0; k < CONFIG_T::seq_len; ++k) {
-
-        #pragma HLS PIPELINE
-        for (int i = 0; i < CONFIG_T::num_heads; ++i) {
-            #pragma HLS UNROLL
-            for (int j = 0; j < CONFIG_T::head_dim_value; ++j) {
-                #pragma HLS UNROLL
-                mat_res_con[CONFIG_T::head_dim_value * i + j] = data_in[i][j].read();
-            }
-        }
-        dense<data_T, res_T, typename CONFIG_T::config_mult2>(mat_res_con, dense_out, attention_output_weight,
-                                                              attention_output_bias);
-        for (int i = 0; i < CONFIG_T::feature_dim; ++i) {
-            #pragma HLS UNROLL
-            res[CONFIG_T::feature_dim * k + i] = dense_out[i];
-        }
-    }
-}
-
-template <class data_T, typename CONFIG_T>
-void data_prep(data_T data[CONFIG_T::seq_len * CONFIG_T::feature_dim], hls::stream<data_T> d[CONFIG_T::feature_dim]) {
-    #pragma HLS ARRAY_PARTITION variable=d complete dim=1
-    for (int j = 0; j < CONFIG_T::seq_len; ++j) {
-        for (int k = 0; k < CONFIG_T::feature_dim; ++k) {
-            #pragma HLS UNROLL
-            d[k].write(data[j * CONFIG_T::feature_dim + k]);
-        }
-    }
-}
-
-template <class data_T, class res_T, typename CONFIG_T>
-void multiheadattention(
-    data_T data_q[CONFIG_T::seq_len * CONFIG_T::feature_dim], data_T data_vk[CONFIG_T::seq_len * CONFIG_T::feature_dim],
-    res_T res[CONFIG_T::seq_len * CONFIG_T::feature_dim],
-    typename CONFIG_T::weight_t attention_output_weight[CONFIG_T::num_heads * CONFIG_T::head_dim_value *
                                                         CONFIG_T::feature_dim], // num_heads,head_size_v,dim
-    typename CONFIG_T::bias_t attention_output_bias[CONFIG_T::feature_dim],
-    typename CONFIG_T::weight_t
-        key_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], // n_head,dim,head_dim
-    typename CONFIG_T::bias_t key_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key],
-    typename CONFIG_T::weight_t
-        query_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_key], // same shape as key
-    typename CONFIG_T::bias_t query_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_key],
-    typename CONFIG_T::weight_t value_weight[CONFIG_T::feature_dim * CONFIG_T::num_heads * CONFIG_T::head_dim_value],
-    typename CONFIG_T::bias_t value_bias[CONFIG_T::num_heads * CONFIG_T::head_dim_value]) {
-    hls::stream<data_T> d_value[CONFIG_T::num_heads][CONFIG_T::feature_dim];
-    hls::stream<data_T> d_query[CONFIG_T::num_heads][CONFIG_T::feature_dim];
-    hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> q_proj[CONFIG_T::num_heads];
-    hls::stream<datapack<data_T, CONFIG_T::head_dim_key>> k_proj[CONFIG_T::num_heads];
-    hls::stream<datapack<data_T, CONFIG_T::head_dim_value>> v_proj[CONFIG_T::num_heads];
-    res_T qk_mul[CONFIG_T::num_heads][CONFIG_T::seq_len][CONFIG_T::seq_len];
-    hls::stream<res_T> matr_out[CONFIG_T::num_heads][CONFIG_T::head_dim_value];
-
-    #pragma HLS DATAFLOW
-    #pragma HLS ARRAY_PARTITION variable=d_query complete dim=1
-    #pragma HLS ARRAY_PARTITION variable=v_proj complete dim=1
-    #pragma HLS ARRAY_PARTITION variable=q_proj complete dim=1
-    #pragma HLS ARRAY_PARTITION variable=k_proj complete dim=1
-    #pragma HLS ARRAY_PARTITION variable=qk_mul complete dim=1
-    #pragma HLS ARRAY_PARTITION variable=matr_out complete dim=1
-
-prepq:
-    for (int i = 0; i < CONFIG_T::num_heads; ++i) {
-        #pragma HLS UNROLL
-        nnet::data_prep<data_T, CONFIG_T>(data_q, d_query[i]);
-    }
-prepvk:
-    for (int i = 0; i < CONFIG_T::num_heads; ++i) {
-        #pragma HLS UNROLL
-        nnet::data_prep<data_T, CONFIG_T>(data_vk, d_value[i]);
-    }
-
-lin_proj:
-    for (int i = 0; i < CONFIG_T::num_heads; ++i) {
-        #pragma HLS UNROLL
-        nnet::lin_projection<data_T, res_T, CONFIG_T>(
-            d_query[i], d_value[i], k_proj[i], q_proj[i], v_proj[i],
-            key_weight + (CONFIG_T::head_dim_key * CONFIG_T::feature_dim * i), key_bias + (CONFIG_T::head_dim_key * i),
-            query_weight + (CONFIG_T::head_dim_key * CONFIG_T::feature_dim * i), query_bias + (CONFIG_T::head_dim_key * i),
-            value_weight + (CONFIG_T::head_dim_value * CONFIG_T::feature_dim * i),
-            value_bias + (CONFIG_T::head_dim_value * i));
-    }
-
-maxtrixmul1:
-    for (int i = 0; i < CONFIG_T::num_heads; ++i) {
-        #pragma HLS UNROLL
-        nnet::matrixmul_transpose<data_T, res_T, CONFIG_T>(q_proj[i], k_proj[i], qk_mul[i]);
-    }
-
-maxtrixmul2:
-    for (int i = 0; i < CONFIG_T::num_heads; ++i) {
-        #pragma HLS UNROLL
-        nnet::matrixmul<res_T, res_T, CONFIG_T>(qk_mul[i], v_proj[i], matr_out[i]); // stream
-    }
-
-    nnet::dense_out<data_T, res_T, CONFIG_T>(matr_out, res, attention_output_weight, attention_output_bias);
-}
-} // namespace nnet
-
-#endif
diff --git a/test/pytest/test_multiheadattention.py b/test/pytest/test_multiheadattention.py
deleted file mode 100644
index 1052039791..0000000000
--- a/test/pytest/test_multiheadattention.py
+++ /dev/null
@@ -1,52 +0,0 @@
-from pathlib import Path
-
-import numpy as np
-import pytest
-from tensorflow.keras import Model
-from tensorflow.keras.layers import Input, MultiHeadAttention
-
-import hls4ml
-
-test_root_path = Path(__file__).parent
-
-batch_size = 100
-seq_len = 10
-num_heads = 2
-key_dim = 4
-
-atol = 2e-2
-
-
-@pytest.fixture(scope='module')
-def query_data():
-    return np.random.rand(batch_size, seq_len, num_heads * key_dim)
-
-
-@pytest.fixture(scope='module')
-def key_value_data():
-    return np.random.rand(batch_size, seq_len, num_heads * key_dim)
-
-
-@pytest.fixture(scope='module')
-def model():
-    query_input = Input(shape=(seq_len, num_heads * key_dim))
-    key_value_input = Input(shape=(seq_len, num_heads * key_dim))
-    mha_layer = MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)(query_input, key_value_input)
-    model = Model(inputs=[query_input, key_value_input], outputs=mha_layer)
-    model.compile()
-    return model
-
-
-# Currently only Vivado in io_parallel mode is supported
-def test_multiheadattention(model, query_data, key_value_data):
-    config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Vivado')
-    output_dir = str(test_root_path / 'hls4mlprj_multiheadattention_Vivado_io_parallel')
-    hls_model = hls4ml.converters.convert_from_keras_model(
-        model, backend='Vivado', hls_config=config, io_type='io_parallel', output_dir=output_dir
-    )
-    hls_model.compile()
-
-    # Predict
-    y_keras = model.predict([query_data, key_value_data]).flatten()
-    y_hls = hls_model.predict([query_data, key_value_data]).flatten()
-    np.testing.assert_allclose(y_keras, y_hls, rtol=0, atol=atol, verbose=True)
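For orientation, what the deleted kernel and its converters implemented, per head, is standard scaled dot-product attention. A NumPy reference using the (num_heads, feature_dim, head_dim) weight layout the removed parsers produced; the plain softmax here is illustrative only (no LUTs, no fixed point):

    import numpy as np

    def mha_reference(q, kv, Wq, bq, Wk, bk, Wv, bv, Wo, bo):
        # q, kv: (seq_len, feature_dim); Wq/Wk/Wv: (num_heads, feature_dim, head_dim)
        # bq/bk/bv: (num_heads, head_dim); Wo: (num_heads, head_dim, feature_dim)
        heads = []
        for h in range(Wq.shape[0]):
            Q = q @ Wq[h] + bq[h]
            K = kv @ Wk[h] + bk[h]
            V = kv @ Wv[h] + bv[h]
            scores = (Q @ K.T) / np.sqrt(Q.shape[-1])  # dk = 1/sqrt(head_dim_key)
            weights = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)
            heads.append(weights @ V)                  # (seq_len, head_dim)
        concat = np.concatenate(heads, axis=-1)        # (seq_len, num_heads*head_dim)
        return concat @ Wo.reshape(-1, Wo.shape[-1]) + bo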
rtol=0, atol=atol, verbose=True) diff --git a/test/pytest/test_multiheadattention_pytorch.py b/test/pytest/test_multiheadattention_pytorch.py deleted file mode 100644 index 862a0784fc..0000000000 --- a/test/pytest/test_multiheadattention_pytorch.py +++ /dev/null @@ -1,67 +0,0 @@ -from pathlib import Path - -import numpy as np -import pytest -import torch -from torch import nn - -import hls4ml - -test_root_path = Path(__file__).parent - -batch_size = 100 -seq_len = 10 -num_heads = 2 -embed_dim = 8 - -atol = 2e-2 - - -@pytest.fixture(scope='module') -def query_data(): - return np.random.rand(batch_size, seq_len, embed_dim) - - -@pytest.fixture(scope='module') -def key_value_data(): - return np.random.rand(batch_size, seq_len, embed_dim) - - -class MultiHeadAttentionModel(nn.Module): - def __init__(self): - super().__init__() - self.mha = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True) - - def forward(self, query, key, value): - output, _ = self.mha(query, key, value) - return output - - -# Currently only Vivado in io_parallel mode is supported -def test_multiheadattention(query_data, key_value_data): - model = MultiHeadAttentionModel() - model.eval() - - config = hls4ml.utils.config_from_pytorch_model( - model, - [(seq_len, embed_dim), (seq_len, embed_dim), (seq_len, embed_dim)], - granularity='name', - backend='Vivado', - channels_last_conversion='off', - transpose_outputs=False, - ) - output_dir = str(test_root_path / 'hls4mlprj_multiheadattention_pytorch_Vivado_io_parallel') - hls_model = hls4ml.converters.convert_from_pytorch_model( - model, backend='Vivado', hls_config=config, io_type='io_parallel', output_dir=output_dir - ) - hls_model.compile() - - # Predict - y_pytorch = ( - model(torch.Tensor(query_data), torch.Tensor(key_value_data), torch.Tensor(key_value_data)) - .detach() - .numpy() - .flatten() - ) - y_hls = hls_model.predict([query_data, key_value_data, key_value_data]).flatten() - np.testing.assert_allclose(y_pytorch, y_hls, rtol=0, atol=atol, verbose=True) diff --git a/test/pytest/test_precision_parsing.py b/test/pytest/test_precision_parsing.py deleted file mode 100644 index 2796bece57..0000000000 --- a/test/pytest/test_precision_parsing.py +++ /dev/null @@ -1,29 +0,0 @@ -import pytest - -import hls4ml - - -@pytest.mark.parametrize( - 'prec_pair', - [ - ('ap_fixed<3, 2>', True), - ('ap_ufixed<3, 2>', False), - ('ac_fixed<3, 2, true>', True), - ('ac_fixed<3, 2, false>', False), - ('ac_fixed<3, 2, 1>', True), - ('ac_fixed<3, 2, 0>', False), - ('ap_int<3, 2>', True), - ('ap_uint<3>', False), - ('ac_int<3, TRue>', True), - ('ac_int<3, FALse>', False), - ('ac_int<3, 1>', True), - ('ac_int<3, 0>', False), - ], -) -def test_sign_parsing(prec_pair): - '''Test that convert_precision_string determines the signedness correctly''' - strprec = prec_pair[0] - signed = prec_pair[1] - - evalprec = hls4ml.backends.fpga.fpga_backend.FPGABackend.convert_precision_string(strprec) - assert evalprec.signed == signed From b5b82e20be49a3c3f18b09b0091d06cc79b6ee27 Mon Sep 17 00:00:00 2001 From: Rian Brooks Flynn <112733140+rianbrooksflynn@users.noreply.github.com> Date: Mon, 9 Dec 2024 12:03:37 -0500 Subject: [PATCH 52/55] change to uniform LUT and fix precision --- hls4ml/backends/vivado/vivado_backend.py | 4 +- .../vivado/nnet_utils/nnet_layernorm.h | 56 +++++-------------- test/pytest/test_layernorm.py | 2 +- test/pytest/test_layernorm_pytorch.py | 2 +- 4 files changed, 19 insertions(+), 45 deletions(-) diff --git a/hls4ml/backends/vivado/vivado_backend.py 
b/hls4ml/backends/vivado/vivado_backend.py index 452e40ddbb..3b3aebf9f4 100644 --- a/hls4ml/backends/vivado/vivado_backend.py +++ b/hls4ml/backends/vivado/vivado_backend.py @@ -552,10 +552,10 @@ def init_softmax(self, layer): def init_layernormalization(self, layer): if 'table_t' not in layer.attributes: layer.set_attr( - 'table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=30, integer=10)) + 'table_t', NamedType(name=layer.name + '_table_t', precision=FixedPrecisionType(width=16, integer=6)) ) if 'table_size' not in layer.attributes: - layer.set_attr('table_size', 1024) # table size + layer.set_attr('table_size', 4096) # table size if 'table_range' not in layer.attributes: layer.set_attr('table_range', 1.0) # table range if 'mean_t' not in layer.attributes: diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h index f504c8875e..d38f7982f5 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h @@ -28,49 +28,20 @@ struct layernorm_config { template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>; }; -template <typename CONFIG_T, int N_TABLE> -void init_invert_sqr_table(typename CONFIG_T::table_t table_in[N_TABLE], typename CONFIG_T::table_t table_out[N_TABLE]) { +template <typename CONFIG_T, int N_TABLE> +void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) +{ // Inversion function: // result = 1/sqrt(x) - // Use log spacing to get more precision at lower values - float log_min = log(CONFIG_T::epsilon); - float log_max = log(CONFIG_T::table_range); - float log_step = (log_max - log_min) / (float)(N_TABLE - 1); - float log_val = log_min; + float min_val = CONFIG_T::epsilon; + float max_val = CONFIG_T::table_range; + float step = max_val / (float)(N_TABLE); for (int ii = 0; ii < N_TABLE; ii++) { - float in_val = exp(log_val); - table_in[ii] = (typename CONFIG_T::table_t)in_val; - table_out[ii] = (typename CONFIG_T::table_t)(1.0 / sqrt(in_val)); - log_val += log_step; + float in_val = min_val + step * ii; + table_out[ii] = (typename CONFIG_T::table_t)(1.0/sqrt(in_val)); } } -template <typename CONFIG_T> -void lookup_invert_sqr(typename CONFIG_T::mean_t x, typename CONFIG_T::table_t &res, - typename CONFIG_T::table_t table_in[CONFIG_T::table_size], - typename CONFIG_T::table_t table_out[CONFIG_T::table_size]) { - if (x <= table_in[0]) { - res = table_out[0]; - return; - } else if (x >= table_in[CONFIG_T::table_size - 1]) { - res = table_out[CONFIG_T::table_size - 1]; - return; - } - - #pragma HLS PIPELINE -LAYERNORM_LOOKUP: - for (int i = 0; i < CONFIG_T::table_size - 1; i++) { - #pragma HLS UNROLL factor=4 - if (x <= table_in[i + 1] && x >= table_in[i]) { - res = table_out[i]; - return; - } - } - - res = table_out[CONFIG_T::table_size - 1]; - return; -} - template <class data_T, class res_T, typename CONFIG_T> void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CONFIG_T::n_in / CONFIG_T::seq_len], typename CONFIG_T::scale_t scale[CONFIG_T::n_in / CONFIG_T::seq_len], @@ -83,14 +54,12 @@ void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CON #ifdef __HLS_SYN__ bool initialized = false; typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size]; - typename CONFIG_T::table_t index_table[CONFIG_T::table_size]; #else static bool initialized = false; static typename CONFIG_T::table_t invert_sqr_table[CONFIG_T::table_size]; - static typename CONFIG_T::table_t index_table[CONFIG_T::table_size]; #endif if (!initialized) { - init_invert_sqr_table<CONFIG_T, CONFIG_T::table_size>(index_table, invert_sqr_table); + init_invert_sqr_table<CONFIG_T, CONFIG_T::table_size>(invert_sqr_table); initialized = true; } @@ -118,7 +87,12 @@ void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CON sum_cache2 += diff; } var = CONFIG_T::template product<typename CONFIG_T::mean_t, typename CONFIG_T::mean_t>::product(sum_cache2, k_inv); - lookup_invert_sqr<CONFIG_T>(var + var_epsilon, deno_inver, index_table, invert_sqr_table); + + int index = (var) * (CONFIG_T::table_size) * inv_range_inv; + if (CONFIG_T::table_range > 1) index = (var) * (CONFIG_T::table_size) / (int)CONFIG_T::table_range; + if (index < 0) index = 0; + if (index > CONFIG_T::table_size - 1) index = CONFIG_T::table_size - 1; + deno_inver = invert_sqr_table[index]; LAYERNORM_1D_RESULT: for (int i = 0; i < dim; ++i) { diff --git a/test/pytest/test_layernorm.py b/test/pytest/test_layernorm.py index f180c4f318..f3f0a5731b 100644 --- a/test/pytest/test_layernorm.py +++ b/test/pytest/test_layernorm.py @@ -9,7 +9,7 @@ test_root_path = Path(__file__).parent -in_shape = (4, 5) +in_shape = (10, 8) atol = 5e-2 diff --git a/test/pytest/test_layernorm_pytorch.py b/test/pytest/test_layernorm_pytorch.py index c553bb41f1..d61b0c4361 100644 --- a/test/pytest/test_layernorm_pytorch.py +++ b/test/pytest/test_layernorm_pytorch.py @@ -9,7 +9,7 @@ test_root_path = Path(__file__).parent -in_shape = (4, 5) +in_shape = (10, 8) atol = 5e-2 From 0f08e7a1305726a48fd8ebf6b9dd99179bd2c3b4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 17:10:04 +0000 Subject: [PATCH 53/55] [pre-commit.ci] auto fixes from pre-commit hooks --- .../vivado/nnet_utils/nnet_layernorm.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h index d38f7982f5..17b0712342 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_layernorm.h @@ -28,9 +28,7 @@ struct layernorm_config { template <class x_T, class y_T> using product = nnet::product::mult<x_T, y_T>; }; -template <typename CONFIG_T, int N_TABLE> -void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) -{ +template <typename CONFIG_T, int N_TABLE> void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) { // Inversion function: // result = 1/sqrt(x) float min_val = CONFIG_T::epsilon; @@ -38,7 +36,7 @@ void init_invert_sqr_table(typename CONFIG_T::table_t table_out[N_TABLE]) float step = max_val / (float)(N_TABLE); for (int ii = 0; ii < N_TABLE; ii++) { float in_val = min_val + step * ii; - table_out[ii] = (typename CONFIG_T::table_t)(1.0/sqrt(in_val)); + table_out[ii] = (typename CONFIG_T::table_t)(1.0 / sqrt(in_val)); } } @@ -88,10 +86,13 @@ void layernorm_1d(data_T data[CONFIG_T::n_in / CONFIG_T::seq_len], res_T res[CON } var = CONFIG_T::template product<typename CONFIG_T::mean_t, typename CONFIG_T::mean_t>::product(sum_cache2, k_inv); - int index = (var) * (CONFIG_T::table_size) * inv_range_inv; - if (CONFIG_T::table_range > 1) index = (var) * (CONFIG_T::table_size) / (int)CONFIG_T::table_range; - if (index < 0) index = 0; - if (index > CONFIG_T::table_size - 1) index = CONFIG_T::table_size - 1; + int index = (var) * (CONFIG_T::table_size)*inv_range_inv; + if (CONFIG_T::table_range > 1) + index = (var) * (CONFIG_T::table_size) / (int)CONFIG_T::table_range; + if (index < 0) + index = 0; + if (index > CONFIG_T::table_size - 1) + index = CONFIG_T::table_size - 1; deno_inver = invert_sqr_table[index]; LAYERNORM_1D_RESULT: From cbd88bde8dbf6fb42498f48816a509ffcdc7c9e6 Mon Sep 17 00:00:00 2001 From: Rian Brooks Flynn <112733140+rianbrooksflynn@users.noreply.github.com>
Date: Mon, 6 Jan 2025 10:23:37 -0500 Subject: [PATCH 54/55] fix encodings issue with dos2unix --- hls4ml/converters/keras/core.py | 348 ++++---- hls4ml/model/profiling.py | 1426 +++++++++++++++---------------- 2 files changed, 887 insertions(+), 887 deletions(-) diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index 6e4e1ebb10..47148ee9f8 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -1,174 +1,174 @@ -from hls4ml.converters.keras_to_hls import get_weights_data, keras_handler, parse_default_keras_layer -from hls4ml.model.quantizers import BinaryQuantizer, TernaryQuantizer -from hls4ml.model.types import IntegerPrecisionType - - -@keras_handler('InputLayer') -def parse_input_layer(keras_layer, input_names, input_shapes, data_reader): - assert keras_layer['class_name'] == 'InputLayer' - - layer = parse_default_keras_layer(keras_layer, input_names) - - layer['input_shape'] = keras_layer['config']['batch_input_shape'][1:] - - dtype = keras_layer['config']['dtype'] - if dtype.startswith('int') or dtype.startswith('uint'): - layer['type_name'] = 'integer_input_t' - width = int(dtype[dtype.index('int') + 3 :]) - signed = not dtype.startswith('u') - layer['precision'] = IntegerPrecisionType(width=width, signed=signed) - # elif bool, q[u]int, ... - - output_shape = keras_layer['config']['batch_input_shape'] - - return layer, output_shape - - -dense_layers = ['Dense', 'BinaryDense', 'TernaryDense'] - - -@keras_handler(*dense_layers) -def parse_dense_layer(keras_layer, input_names, input_shapes, data_reader): - assert 'Dense' in keras_layer['class_name'] - - layer = parse_default_keras_layer(keras_layer, input_names) - - layer['weight_data'], layer['bias_data'] = get_weights_data(data_reader, layer['name'], ['kernel', 'bias']) - layer['n_in'] = layer['weight_data'].shape[0] - layer['n_out'] = layer['weight_data'].shape[1] - if 'Binary' in layer['class_name']: - layer['weight_quantizer'] = BinaryQuantizer(bits=2) - layer['bias_quantizer'] = BinaryQuantizer(bits=2) - elif 'Ternary' in layer['class_name']: - layer['weight_quantizer'] = TernaryQuantizer() - layer['bias_quantizer'] = TernaryQuantizer() - else: - layer['weight_quantizer'] = None - layer['bias_quantizer'] = None - output_shape = input_shapes[0][:] - output_shape[-1] = layer['n_out'] - - return layer, output_shape - - -activation_layers = ['Activation', 'LeakyReLU', 'ThresholdedReLU', 'ELU', 'PReLU', 'Softmax', 'ReLU'] - - -@keras_handler(*activation_layers) -def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader): - assert keras_layer['class_name'] in activation_layers - - layer = parse_default_keras_layer(keras_layer, input_names) - - if layer['class_name'] != 'Activation': - layer['activation'] = layer['class_name'] - - if layer['activation'] == 'elu': - layer['class_name'] = 'ELU' # always use ELU type for elu, even if passed as activation - - if layer['class_name'] == 'LeakyReLU': - # the name changes for version 3 - layer['activ_param'] = keras_layer['config'].get('negative_slope', keras_layer['config'].get('alpha', 0.3)) - elif layer['class_name'] == 'ThresholdedReLU': - layer['activ_param'] = keras_layer['config'].get('theta', 1.0) - elif layer['class_name'] == 'ELU': - layer['activ_param'] = keras_layer['config'].get('alpha', 1.0) - elif layer['class_name'] == 'ReLU': - layer['class_name'] = 'Activation' - elif layer['class_name'] == 'PReLU': - layer['param_data'] = get_weights_data(data_reader, layer['name'], 'alpha') - - if 
layer['class_name'] == 'Activation' and layer['activation'] == 'softmax': - layer['class_name'] = 'Softmax' - if layer['class_name'] == 'Activation' and layer['activation'] == 'hard_sigmoid': - layer['class_name'] = 'HardActivation' - if layer['class_name'] == 'Softmax': - layer['axis'] = keras_layer['config'].get('axis', -1) - if layer['class_name'] == 'Activation' and layer['activation'] == 'leaky_relu': - layer['class_name'] = 'LeakyReLU' - # The parameter name changes for API v3; the default is different than in LeakyReLU layer - layer['activ_param'] = keras_layer['config'].get('negative_slope', keras_layer['config'].get('alpha', 0.2)) - - return layer, [shape for shape in input_shapes[0]] - - -@keras_handler('BatchNormalization') -def parse_batchnorm_layer(keras_layer, input_names, input_shapes, data_reader): - assert 'BatchNormalization' in keras_layer['class_name'] or 'QConv2DBatchnorm' in keras_layer['class_name'] - - layer = parse_default_keras_layer(keras_layer, input_names) - - in_size = 1 - for dim in input_shapes[0][1:]: - in_size *= dim - layer['n_in'] = in_size - layer['n_out'] = layer['n_in'] - if len(input_shapes[0]) == 2: - layer['n_filt'] = -1 - elif len(input_shapes[0]) == 3: - layer['n_filt'] = input_shapes[0][2] - elif len(input_shapes[0]) == 4: - layer['n_filt'] = input_shapes[0][3] - - layer['use_gamma'] = keras_layer['config']['scale'] - if layer['use_gamma']: - layer['gamma_data'] = get_weights_data(data_reader, layer['name'], 'gamma') - else: - layer['gamma_data'] = 1 - - layer['use_beta'] = keras_layer['config']['center'] - if layer['use_beta']: - layer['beta_data'] = get_weights_data(data_reader, layer['name'], 'beta') - else: - layer['beta_data'] = 0 - - layer['mean_data'], layer['variance_data'] = get_weights_data( - data_reader, layer['name'], ['moving_mean', 'moving_variance'] - ) - - return layer, [shape for shape in input_shapes[0]] - - -@keras_handler('LayerNormalization') -def parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader): - assert 'LayerNormalization' in keras_layer['class_name'] - - layer = parse_default_keras_layer(keras_layer, input_names) - - in_size = 1 - for dim in input_shapes[0][1:]: - in_size *= dim - layer['n_in'] = layer['n_out'] = in_size - - if not ((len(input_shapes[0])) == 3): - raise Exception('input size is not currently supported by hls4ml, only dim3 is supported') - layer['seq_len'] = input_shapes[0][-2] - - if not (keras_layer['config']['axis'][0] == 2): - raise Exception('assigning the axis is not currently supported by hls4ml, only axis 2 is supported') - - layer['gamma_data'] = get_weights_data(data_reader, layer['name'], 'gamma') - layer['beta_data'] = get_weights_data(data_reader, layer['name'], 'beta') - - layer['epsilon'] = keras_layer['config']['epsilon'] - if layer['epsilon'] <= 0: - raise Exception('epsilon must be positive') - - return layer, [shape for shape in input_shapes[0]] - - -@keras_handler('Embedding') -def parse_embedding_layer(keras_layer, input_names, input_shapes, data_reader): - assert 'Embedding' in keras_layer['class_name'] - - layer = parse_default_keras_layer(keras_layer, input_names) - - layer['n_in'] = input_shapes[0][1] - layer['vocab_size'] = keras_layer['config']['input_dim'] - layer['n_out'] = keras_layer['config']['output_dim'] - - layer['embeddings_data'] = get_weights_data(data_reader, layer['name'], 'embeddings') - - output_shape = input_shapes[0] + [layer['n_out']] - - return layer, output_shape +from hls4ml.converters.keras_to_hls import get_weights_data, 
keras_handler, parse_default_keras_layer +from hls4ml.model.quantizers import BinaryQuantizer, TernaryQuantizer +from hls4ml.model.types import IntegerPrecisionType + + +@keras_handler('InputLayer') +def parse_input_layer(keras_layer, input_names, input_shapes, data_reader): + assert keras_layer['class_name'] == 'InputLayer' + + layer = parse_default_keras_layer(keras_layer, input_names) + + layer['input_shape'] = keras_layer['config']['batch_input_shape'][1:] + + dtype = keras_layer['config']['dtype'] + if dtype.startswith('int') or dtype.startswith('uint'): + layer['type_name'] = 'integer_input_t' + width = int(dtype[dtype.index('int') + 3 :]) + signed = not dtype.startswith('u') + layer['precision'] = IntegerPrecisionType(width=width, signed=signed) + # elif bool, q[u]int, ... + + output_shape = keras_layer['config']['batch_input_shape'] + + return layer, output_shape + + +dense_layers = ['Dense', 'BinaryDense', 'TernaryDense'] + + +@keras_handler(*dense_layers) +def parse_dense_layer(keras_layer, input_names, input_shapes, data_reader): + assert 'Dense' in keras_layer['class_name'] + + layer = parse_default_keras_layer(keras_layer, input_names) + + layer['weight_data'], layer['bias_data'] = get_weights_data(data_reader, layer['name'], ['kernel', 'bias']) + layer['n_in'] = layer['weight_data'].shape[0] + layer['n_out'] = layer['weight_data'].shape[1] + if 'Binary' in layer['class_name']: + layer['weight_quantizer'] = BinaryQuantizer(bits=2) + layer['bias_quantizer'] = BinaryQuantizer(bits=2) + elif 'Ternary' in layer['class_name']: + layer['weight_quantizer'] = TernaryQuantizer() + layer['bias_quantizer'] = TernaryQuantizer() + else: + layer['weight_quantizer'] = None + layer['bias_quantizer'] = None + output_shape = input_shapes[0][:] + output_shape[-1] = layer['n_out'] + + return layer, output_shape + + +activation_layers = ['Activation', 'LeakyReLU', 'ThresholdedReLU', 'ELU', 'PReLU', 'Softmax', 'ReLU'] + + +@keras_handler(*activation_layers) +def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader): + assert keras_layer['class_name'] in activation_layers + + layer = parse_default_keras_layer(keras_layer, input_names) + + if layer['class_name'] != 'Activation': + layer['activation'] = layer['class_name'] + + if layer['activation'] == 'elu': + layer['class_name'] = 'ELU' # always use ELU type for elu, even if passed as activation + + if layer['class_name'] == 'LeakyReLU': + # the name changes for version 3 + layer['activ_param'] = keras_layer['config'].get('negative_slope', keras_layer['config'].get('alpha', 0.3)) + elif layer['class_name'] == 'ThresholdedReLU': + layer['activ_param'] = keras_layer['config'].get('theta', 1.0) + elif layer['class_name'] == 'ELU': + layer['activ_param'] = keras_layer['config'].get('alpha', 1.0) + elif layer['class_name'] == 'ReLU': + layer['class_name'] = 'Activation' + elif layer['class_name'] == 'PReLU': + layer['param_data'] = get_weights_data(data_reader, layer['name'], 'alpha') + + if layer['class_name'] == 'Activation' and layer['activation'] == 'softmax': + layer['class_name'] = 'Softmax' + if layer['class_name'] == 'Activation' and layer['activation'] == 'hard_sigmoid': + layer['class_name'] = 'HardActivation' + if layer['class_name'] == 'Softmax': + layer['axis'] = keras_layer['config'].get('axis', -1) + if layer['class_name'] == 'Activation' and layer['activation'] == 'leaky_relu': + layer['class_name'] = 'LeakyReLU' + # The parameter name changes for API v3; the default is different than in LeakyReLU layer + 
layer['activ_param'] = keras_layer['config'].get('negative_slope', keras_layer['config'].get('alpha', 0.2)) + + return layer, [shape for shape in input_shapes[0]] + + +@keras_handler('BatchNormalization') +def parse_batchnorm_layer(keras_layer, input_names, input_shapes, data_reader): + assert 'BatchNormalization' in keras_layer['class_name'] or 'QConv2DBatchnorm' in keras_layer['class_name'] + + layer = parse_default_keras_layer(keras_layer, input_names) + + in_size = 1 + for dim in input_shapes[0][1:]: + in_size *= dim + layer['n_in'] = in_size + layer['n_out'] = layer['n_in'] + if len(input_shapes[0]) == 2: + layer['n_filt'] = -1 + elif len(input_shapes[0]) == 3: + layer['n_filt'] = input_shapes[0][2] + elif len(input_shapes[0]) == 4: + layer['n_filt'] = input_shapes[0][3] + + layer['use_gamma'] = keras_layer['config']['scale'] + if layer['use_gamma']: + layer['gamma_data'] = get_weights_data(data_reader, layer['name'], 'gamma') + else: + layer['gamma_data'] = 1 + + layer['use_beta'] = keras_layer['config']['center'] + if layer['use_beta']: + layer['beta_data'] = get_weights_data(data_reader, layer['name'], 'beta') + else: + layer['beta_data'] = 0 + + layer['mean_data'], layer['variance_data'] = get_weights_data( + data_reader, layer['name'], ['moving_mean', 'moving_variance'] + ) + + return layer, [shape for shape in input_shapes[0]] + + +@keras_handler('LayerNormalization') +def parse_layernorm_layer(keras_layer, input_names, input_shapes, data_reader): + assert 'LayerNormalization' in keras_layer['class_name'] + + layer = parse_default_keras_layer(keras_layer, input_names) + + in_size = 1 + for dim in input_shapes[0][1:]: + in_size *= dim + layer['n_in'] = layer['n_out'] = in_size + + if not ((len(input_shapes[0])) == 3): + raise Exception('input size is not currently supported by hls4ml, only dim3 is supported') + layer['seq_len'] = input_shapes[0][-2] + + if not (keras_layer['config']['axis'][0] == 2): + raise Exception('assigning the axis is not currently supported by hls4ml, only axis 2 is supported') + + layer['gamma_data'] = get_weights_data(data_reader, layer['name'], 'gamma') + layer['beta_data'] = get_weights_data(data_reader, layer['name'], 'beta') + + layer['epsilon'] = keras_layer['config']['epsilon'] + if layer['epsilon'] <= 0: + raise Exception('epsilon must be positive') + + return layer, [shape for shape in input_shapes[0]] + + +@keras_handler('Embedding') +def parse_embedding_layer(keras_layer, input_names, input_shapes, data_reader): + assert 'Embedding' in keras_layer['class_name'] + + layer = parse_default_keras_layer(keras_layer, input_names) + + layer['n_in'] = input_shapes[0][1] + layer['vocab_size'] = keras_layer['config']['input_dim'] + layer['n_out'] = keras_layer['config']['output_dim'] + + layer['embeddings_data'] = get_weights_data(data_reader, layer['name'], 'embeddings') + + output_shape = input_shapes[0] + [layer['n_out']] + + return layer, output_shape diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py index 519e8fabc7..a47c1647f7 100644 --- a/hls4ml/model/profiling.py +++ b/hls4ml/model/profiling.py @@ -1,713 +1,713 @@ -import json -import os -import shutil -import uuid -from collections import defaultdict - -import matplotlib.pyplot as plt -import numpy as np -import pandas -import seaborn as sb - -from hls4ml.model.graph import ModelGraph -from hls4ml.model.layers import GRU, LSTM, SeparableConv1D, SeparableConv2D - -try: - import qkeras - from tensorflow import keras - - __tf_profiling_enabled__ = True -except ImportError: - 
__tf_profiling_enabled__ = False - -try: - import torch - - __torch_profiling_enabled__ = True -except ImportError: - __torch_profiling_enabled__ = False - - -def get_unoptimized_hlsmodel(model): - from hls4ml.converters import convert_from_config - - new_config = model.config.config.copy() - new_config['HLSConfig'] = json.loads(json.dumps(new_config['HLSConfig'])) - - new_output_dir = uuid.uuid4().hex - - while os.path.exists(new_output_dir): - new_output_dir = uuid.uuid4().hex - - if 'SkipOptimizers' in new_config['HLSConfig']: - del new_config['HLSConfig']['SkipOptimizers'] - - new_config['HLSConfig']['Optimizers'] = [] - new_config['OutputDir'] = new_output_dir - - return convert_from_config(new_config), new_output_dir - - -def array_to_summary(x, fmt='boxplot'): - if fmt == 'boxplot': - y = {'med': np.median(x), 'q1': np.percentile(x, 25), 'q3': np.percentile(x, 75), 'whislo': min(x), 'whishi': max(x)} - elif fmt == 'histogram': - # Power of 2 bins covering data range - high = np.ceil(np.log2(max(x))) + 1 - low = np.floor(np.log2(min(x))) - 1 - bits = np.arange(low, high, 1) - bins = 2**bits - h, b = np.histogram(x, bins=bins) - h = h * 1.0 / float(sum(h)) # normalize - y = {'h': h, 'b': np.log2(b)} - return y - - -def boxplot(data, fmt='longform'): - if fmt == 'longform': - f = plt.figure() # figsize=(3, 3)) - hue = 'layer' if 'layer' in data.keys() else None - vp = sb.boxplot(x='x', y='weight', hue=hue, data=data[data['x'] > 0], showfliers=False) - vp.set_yticklabels(vp.get_yticklabels(), rotation=45, ha='right') - if hue is not None: - vp.get_legend().remove() - vp.set_xscale('log', base=2) - return f - elif fmt == 'summary': - from matplotlib.patches import Rectangle - - medianprops = dict(linestyle='-', color='k') - f, ax = plt.subplots(1, 1) - data.reverse() - colors = sb.color_palette("Blues", len(data)) - bp = ax.bxp(data, showfliers=False, vert=False, medianprops=medianprops) - # add colored boxes - for line, color in zip(bp['boxes'], colors): - x = line.get_xdata() - xl, xh = min(x), max(x) - y = line.get_ydata() - yl, yh = min(y), max(y) - rect = Rectangle((xl, yl), (xh - xl), (yh - yl), fill=True, color=color) - ax.add_patch(rect) - ax.set_yticklabels([d['weight'] for d in data]) - ax.set_xscale('log', base=2) - plt.xlabel('x') - return f - else: - return None - - -def histogram(data, fmt='longform'): - f = plt.figure() - from matplotlib.ticker import MaxNLocator - - n = len(data) if fmt == 'summary' else len(data['weight'].unique()) - colors = sb.color_palette("husl", n) - if fmt == 'longform': - for i, weight in enumerate(data['weight'].unique()): - y = array_to_summary(data[data['weight'] == weight]['x'], fmt='histogram') - plt.bar(y['b'][:-1], y['h'], width=1, fill=False, label=weight, edgecolor=colors[i]) - elif fmt == 'summary': - for i, weight in enumerate(data): - plt.bar(weight['b'][:-1], weight['h'], width=1, fill=False, label=weight['weight'], edgecolor=colors[i]) - - plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) - plt.xlabel('log2(x)') - plt.ylabel('frequency') - plt.legend() - return f - - -plots = {'boxplot': boxplot, 'histogram': histogram} - - -def types_boxplot(data, fmt='longform'): - from matplotlib.patches import PathPatch, Rectangle - - ax = plt.gca() - _ = plt.gcf() - # Scale the data - data['low'] = 2.0 ** data['low'] - data['high'] = 2.0 ** data['high'] - - # Plot the custom precisions - ticks = np.array([tick.get_text() for tick in plt.yticks()[1]]) - # Get the coordinates of the boxes to place the markers - if fmt == 'longform': - # 
seaborn adjusts the box positions slightly in groups - boxes = [c.get_extents().inverse_transformed(ax.transData) for c in ax.get_children() if isinstance(c, PathPatch)] - ys = [(box.y0 + box.y1) / 2 for box in boxes] - ys = [(y, y) for y in ys] - elif fmt == 'summary': - ys = [(y, y) for y in plt.yticks()[0]] - for _irow, row in data[data['layer'] != 'model'].iterrows(): - if row['layer'] in ticks: - iy = np.argwhere(ticks == row['layer'])[0][0] # Determine which layer in the plot - rectangle = Rectangle( - (row['low'], ys[iy][0] - 0.4), row['high'] - row['low'], 0.8, fill=True, color='grey', alpha=0.2 - ) - ax.add_patch(rectangle) - - -def types_histogram(data, fmt='longform'): - ax = plt.gca() - layers = np.array(ax.get_legend_handles_labels()[1]) - colors = sb.color_palette("husl", len(layers)) - ylim = ax.get_ylim() - for _irow, row in data[data['layer'] != 'model'].iterrows(): - if row['layer'] in layers: - col = colors[np.argwhere(layers == row['layer'])[0][0]] - plt.plot((row['low'], row['low']), ylim, '--', color=col) - plt.plot((row['high'], row['high']), ylim, '--', color=col) - - -types_plots = {'boxplot': types_boxplot, 'histogram': types_histogram} - - -def ap_fixed_WIFS(dtype): - from hls4ml.backends import VivadoBackend - - dtype = VivadoBackend.convert_precision_string(dtype) - W, I, F, S = dtype.width, dtype.integer, dtype.fractional, dtype.signed - return W, I, F, S - - -def types_hlsmodel(model): - data = {'layer': [], 'low': [], 'high': []} - # Plot the default precision - default_precision = model.config.model_precision['default'] - W, I, F, S = ap_fixed_WIFS(default_precision) - data['layer'].append('model') - data['low'].append(-F) - data['high'].append(I - 1 if S else I) - - for layer in model.get_layers(): - if isinstance(layer, GRU) or isinstance(layer, LSTM): - suffix = ['w', 'rw', 'b', 'rb'] - elif isinstance(layer, SeparableConv1D) or isinstance(layer, SeparableConv2D): - suffix = ['dw', 'pw', 'db', 'pb'] - else: - suffix = ['w', 'b'] - for iw, weight in enumerate(layer.get_weights()): - wname = f'{layer.name}/{suffix[iw]}' - T = weight.type - if T.name != 'model': - W, I, F, S = ap_fixed_WIFS(T.precision) - data['layer'].append(wname) - data['low'].append(-F) - data['high'].append(I - 1 if S else I) - data = pandas.DataFrame(data) - return data - - -def activation_types_hlsmodel(model): - data = {'layer': [], 'low': [], 'high': []} - # Get the default precision - default_precision = model.config.model_precision['default'] - W, I, F, S = ap_fixed_WIFS(default_precision) - data['layer'].append('model') - data['low'].append(-F) - data['high'].append(I - 1 if S else I) - for layer in model.get_layers(): - T = layer.get_output_variable().type.precision - W, I, F, S = ap_fixed_WIFS(T) - data['layer'].append(layer.name) - data['low'].append(-F) - data['high'].append(I - 1 if S else I) - data = pandas.DataFrame(data) - return data - - -def weights_hlsmodel(model, fmt='longform', plot='boxplot'): - if fmt == 'longform': - data = {'x': [], 'layer': [], 'weight': []} - elif fmt == 'summary': - data = [] - - for layer in model.get_layers(): - if isinstance(layer, GRU) or isinstance(layer, LSTM): - suffix = ['w', 'rw', 'b', 'rb'] - elif isinstance(layer, SeparableConv1D) or isinstance(layer, SeparableConv2D): - suffix = ['dw', 'pw', 'db', 'pb'] - else: - suffix = ['w', 'b'] - name = layer.name - for iw, weight in enumerate(layer.get_weights()): - label = f'{name}/{suffix[iw]}' - w = weight.data.flatten() - w = abs(w[w != 0]) - n = len(w) - if n == 0: - print(f'Weights for 
{name} are only zeros, ignoring.') - break - if fmt == 'longform': - data['x'].extend(w.tolist()) - data['layer'].extend([name] * len(w)) - data['weight'].extend([label] * len(w)) - elif fmt == 'summary': - data.append(array_to_summary(w, fmt=plot)) - data[-1]['layer'] = name - data[-1]['weight'] = label - - if fmt == 'longform': - data = pandas.DataFrame(data) - return data - - -def _keras_batchnorm(layer): - weights = layer.get_weights() - epsilon = layer.epsilon - - gamma = weights[0] - beta = weights[1] - mean = weights[2] - var = weights[3] - - scale = gamma / np.sqrt(var + epsilon) - bias = beta - gamma * mean / np.sqrt(var + epsilon) - - return [scale, bias], ['s', 'b'] - - -def _keras_layer(layer): - return layer.get_weights(), ['w', 'b'] - - -def _keras_layernorm(layer): - weights = layer.get_weights() - - gamma = weights[0] - beta = weights[1] - - scale = gamma - bias = beta - - return [scale, bias], ['s', 'b'] - - -def _keras_lstm(layer): - return layer.get_weights(), ['w', 'u', 'b'] - - -keras_process_layer_map = defaultdict( - lambda: _keras_layer, - { - 'BatchNormalization': _keras_batchnorm, - 'QBatchNormalization': _keras_batchnorm, - 'LayerNormalization': _keras_layernorm, - 'LSTM': _keras_lstm, - 'QLSTM': _keras_lstm, - }, -) - - -def activations_hlsmodel(model, X, fmt='summary', plot='boxplot'): - if fmt == 'longform': - raise NotImplementedError - elif fmt == 'summary': - data = [] - - _, trace = model.trace(np.ascontiguousarray(X)) - - if len(trace) == 0: - raise RuntimeError("ModelGraph must have tracing on for at least 1 layer (this can be set in its config)") - - for layer in trace.keys(): - print(f" {layer}") - - if fmt == 'summary': - y = trace[layer].flatten() - y = abs(y[y != 0]) - - if len(y) == 0: - print(f'Activations for {layer} are only zeros, ignoring.') - continue - - data.append(array_to_summary(y, fmt=plot)) - data[-1]['weight'] = layer - - return data - - -def weights_keras(model, fmt='longform', plot='boxplot'): - if fmt == 'longform': - data = {'x': [], 'layer': [], 'weight': []} - elif fmt == 'summary': - data = [] - for layer in model.layers: - name = layer.name - weights, suffix = keras_process_layer_map[type(layer).__name__](layer) - - for i, w in enumerate(weights): - label = f'{name}/{suffix[i]}' - w = w.flatten() - w = abs(w[w != 0]) - n = len(w) - if n == 0: - print(f'Weights for {name} are only zeros, ignoring.') - break - if fmt == 'longform': - data['x'].extend(w.tolist()) - data['layer'].extend([name] * n) - data['weight'].extend([label] * n) - elif fmt == 'summary': - data.append(array_to_summary(w, fmt=plot)) - data[-1]['layer'] = name - data[-1]['weight'] = label - - if fmt == 'longform': - data = pandas.DataFrame(data) - return data - - -def activations_keras(model, X, fmt='longform', plot='boxplot'): - # test layer by layer on data - if fmt == 'longform': - # return long form pandas dataframe for - # seaborn boxplot - data = {'x': [], 'weight': []} - elif fmt == 'summary': - # return summary statistics for matplotlib.axes.Axes.bxp - # or histogram bin edges and heights - data = [] - outputs = _get_outputs( - [layer for layer in model.layers if not isinstance(layer, keras.layers.InputLayer)], X, model.input - ) - outputs = dict(zip([layer.name for layer in model.layers if not isinstance(layer, keras.layers.InputLayer)], outputs)) - for layer_name, y in outputs.items(): - print(f" {layer_name}") - y = y.flatten() - y = abs(y[y != 0]) - if len(y) == 0: - print(f'Activations for {layer_name} are only zeros, ignoring.') - continue - if 
fmt == 'longform': - data['x'].extend(y.tolist()) - data['weight'].extend([layer_name for i in range(len(y))]) - elif fmt == 'summary': - data.append(array_to_summary(y, fmt=plot)) - data[-1]['weight'] = layer_name - - if fmt == 'longform': - data = pandas.DataFrame(data) - return data - - -def weights_torch(model, fmt='longform', plot='boxplot'): - suffix = ['w', 'b'] - if fmt == 'longform': - data = {'x': [], 'layer': [], 'weight': []} - elif fmt == 'summary': - data = [] - for layer in model.children(): - if isinstance(layer, torch.nn.Linear): - name = layer.__class__.__name__ - weights = list(layer.parameters()) - for i, w in enumerate(weights): - label = f'{name}/{suffix[i]}' - w = weights[i].detach().numpy() - w = w.flatten() - w = abs(w[w != 0]) - n = len(w) - if n == 0: - print(f'Weights for {name} are only zeros, ignoring.') - break - if fmt == 'longform': - data['x'].extend(w.tolist()) - data['layer'].extend([name] * n) - data['weight'].extend([label] * n) - elif fmt == 'summary': - data.append(array_to_summary(w, fmt=plot)) - data[-1]['layer'] = name - data[-1]['weight'] = label - - if fmt == 'longform': - data = pandas.DataFrame(data) - return data - - -def activations_torch(model, X, fmt='longform', plot='boxplot'): - X = torch.Tensor(X) - if fmt == 'longform': - data = {'x': [], 'weight': []} - elif fmt == 'summary': - data = [] - - partial_model = torch.nn.Sequential - layers = [] - for layer in model.children(): - lname = layer.__class__.__name__ - layers.append(layer) - pm = partial_model(*layers) - print(f" {lname}") - y = pm(X).flatten().detach().numpy() - y = abs(y[y != 0]) - if len(y) == 0: - print(f'Activations for {lname} are only zeros, ignoring.') - continue - if fmt == 'longform': - data['x'].extend(y.tolist()) - data['weight'].extend([lname for _ in range(len(y))]) - elif fmt == 'summary': - data.append(array_to_summary(y, fmt=plot)) - data[-1]['weight'] = lname - - if fmt == 'longform': - data = pandas.DataFrame(data) - return data - - -def numerical(model=None, hls_model=None, X=None, plot='boxplot'): - """Perform numerical profiling of a model. - - Args: - model (optional): Keras of PyTorch model. Defaults to None. - hls_model (ModelGraph, optional): The ModelGraph to profile. Defaults to None. - X (ndarray, optional): Test data on which to evaluate the model to profile activations. - Must be formatted suitably for the ``model.predict(X)``. Defaults to None. - plot (str, optional): The type of plot to produce. Options are: 'boxplot' (default), 'violinplot', 'histogram', - 'FacetGrid'. Defaults to 'boxplot'. - - Returns: - tuple: The quadruple of produced figures. First weights and biases - for the pre- and post-optimization models respectively, - then activations for the pre- and post-optimization models - respectively. (Optimizations are applied to an ModelGraph by hls4ml, - a post-optimization ModelGraph is a final model). 
- """ - wp, wph, ap, aph = None, None, None, None - - hls_model_present = hls_model is not None and isinstance(hls_model, ModelGraph) - model_present = model is not None - - if hls_model_present: - before = " (before optimization)" - after = " (final / after optimization)" - hls_model_unoptimized, tmp_output_dir = get_unoptimized_hlsmodel(hls_model) - else: - before = "" - after = "" - hls_model_unoptimized, tmp_output_dir = None, None - - print("Profiling weights" + before) - data = None - - if hls_model_present: - data = weights_hlsmodel(hls_model_unoptimized, fmt='summary', plot=plot) - elif model_present: - if __tf_profiling_enabled__ and isinstance(model, keras.Model): - data = weights_keras(model, fmt='summary', plot=plot) - elif __torch_profiling_enabled__ and isinstance(model, torch.nn.Sequential): - data = weights_torch(model, fmt='summary', plot=plot) - - if data is None: - print("Only keras, PyTorch (Sequential) and ModelGraph models " + "can currently be profiled") - - if hls_model_present and os.path.exists(tmp_output_dir): - shutil.rmtree(tmp_output_dir) - - return wp, wph, ap, aph - - wp = plots[plot](data, fmt='summary') # weight plot - - if hls_model_present and plot in types_plots: - t_data = types_hlsmodel(hls_model_unoptimized) - types_plots[plot](t_data, fmt='summary') - - plt.title("Distribution of (non-zero) weights" + before) - plt.tight_layout() - - if hls_model_present: - print("Profiling weights" + after) - - data = weights_hlsmodel(hls_model, fmt='summary', plot=plot) - wph = plots[plot](data, fmt='summary') # weight plot - - if plot in types_plots: - t_data = types_hlsmodel(hls_model) - types_plots[plot](t_data, fmt='summary') - - plt.title("Distribution of (non-zero) weights" + after) - plt.tight_layout() - - if X is not None: - print("Profiling activations" + before) - data = None - if __tf_profiling_enabled__ and isinstance(model, keras.Model): - data = activations_keras(model, X, fmt='summary', plot=plot) - elif __torch_profiling_enabled__ and isinstance(model, torch.nn.Sequential): - data = activations_torch(model, X, fmt='summary', plot=plot) - - if data is not None: - ap = plots[plot](data, fmt='summary') # activation plot - if hls_model_present and plot in types_plots: - t_data = activation_types_hlsmodel(hls_model_unoptimized) - types_plots[plot](t_data, fmt='summary') - plt.title("Distribution of (non-zero) activations" + before) - plt.tight_layout() - - if hls_model_present: - print("Profiling activations" + after) - data = activations_hlsmodel(hls_model, X, fmt='summary', plot=plot) - aph = plots[plot](data, fmt='summary') - - t_data = activation_types_hlsmodel(hls_model) - types_plots[plot](t_data, fmt='summary') - - plt.title("Distribution of (non-zero) activations (final / after optimization)") - plt.tight_layout() - - if hls_model_present and os.path.exists(tmp_output_dir): - shutil.rmtree(tmp_output_dir) - - return wp, wph, ap, aph - - -######### -# COMPARE OUTPUT IMPLEMENTATION -######### -def _is_ignored_layer(layer): - """Some layers need to be ingored during inference""" - if isinstance(layer, (keras.layers.InputLayer, keras.layers.Dropout)): - return True - return False - - -def _get_outputs(layers, X, model_input): - """Get outputs of intermediate layers""" - partial_models = keras.models.Model(inputs=model_input, outputs=[layer.output for layer in layers]) - y = partial_models.predict(X) - return y - - -def get_ymodel_keras(keras_model, X): - """Calculate each layer's ouput and put them into a dictionary. 
- - Args: - keras_model (_type_): A keras Model - X (ndarray): Test data on which to evaluate the model to profile activations. - Must be formatted suitably for the ``model.predict(X)``. - - Returns: - dict: A dictionary in the form {"layer_name": ouput array of layer}. - """ - ymodel = {} - traced_layers = [] - layer_names = [] - for layer in keras_model.layers: - if _is_ignored_layer(layer): - continue - # If the layer has activation integrated then separate them - # Note that if the layer is a standalone activation layer then skip this - name = layer.name - if ( - hasattr(layer, 'activation') - and layer.activation is not None - and not isinstance(layer, (keras.layers.Activation, qkeras.qlayers.QActivation)) - and layer.activation.__name__ != 'linear' - ): - tmp_activation = layer.activation - layer.activation = None - ymodel.update({layer.name: _get_outputs([layer], X, keras_model.input)}) - layer.activation = tmp_activation - name = layer.name + f"_{tmp_activation.__name__}" - traced_layers.append(layer) - layer_names.append(name) - outputs = _get_outputs(traced_layers, X, keras_model.input) - for name, output in zip(layer_names, outputs): - ymodel[name] = output - print("Done taking outputs for Keras model.") - return ymodel - - -def _norm_diff(ymodel, ysim): - """Calculate the square root of the sum of the squares of the differences""" - diff = {} - - for key in list(ysim.keys()): - diff[key] = np.linalg.norm(ysim[key] - ymodel[key]) - - # ---Bar Plot--- - f, ax = plt.subplots() - plt.bar(list(diff.keys()), list(diff.values())) - plt.title("layer-by-layer output differences") - ax.set_ylabel('Norm of difference vector') - plt.xticks(rotation=90) - plt.tight_layout() - return f - - -def _dist_diff(ymodel, ysim): - """ - Calculate the normalized distribution of the differences of the elements - of the output vectors. - If difference >= original value then the normalized difference will be set to 1, - meaning "very difference". - If difference < original value then the normalized difference would be difference/original. - """ - - diff = {} - - for key in list(ysim.keys()): - flattened_ysim = ysim[key].flatten() - flattened_ymodel = np.array(ymodel[key]).flatten() - - diff[key] = np.absolute(flattened_ymodel - flattened_ysim) / np.linalg.norm(flattened_ymodel - flattened_ysim) - diff_vector = np.absolute(flattened_ymodel - flattened_ysim) - abs_ymodel = np.absolute(flattened_ymodel) - - normalized_diff = np.zeros(diff_vector.shape) - normalized_diff[(diff_vector >= abs_ymodel) & (abs_ymodel > 0) & (diff_vector > 0)] = 1 - - # Fill out the rest - index = diff_vector < abs_ymodel - normalized_diff[index] = diff_vector[index] / abs_ymodel[index] - - diff[key] = normalized_diff - - # ---Box Plot--- - f, ax = plt.subplots() - pos = np.array(range(len(list(diff.values())))) + 1 - ax.boxplot(list(diff.values()), sym='k+', positions=pos) - - # --formatting - plt.title("Layer-by-layer distribution of output differences") - ax.set_xticklabels(list(diff.keys())) - ax.set_ylabel('Normalized difference') - ax.set_ylabel('Percent difference.') - plt.xticks(rotation=90) - plt.tight_layout() - - return f - - -def compare(keras_model, hls_model, X, plot_type="dist_diff"): - """Compare each layer's output in keras and hls model. Note that the hls_model should not be compiled before using this. - - Args: - keras_model: Original keras model. - hls_model (ModelGraph): Converted ModelGraph, with "Trace:True" in the configuration file. - X (ndarray): Input tensor for the model. 
- plot_type (str, optional): Different methods to visualize the y_model and y_sim differences. - Possible options include: - - 'norm_diff':: square root of the sum of the squares of the differences between each output vectors. - - 'dist_diff':: The normalized distribution of the differences of the elements between two output vectors. - Defaults to "dist_diff". - - Returns: - matplotlib figure: Plot object of the histogram depicting the difference in each layer's output. - """ - - # Take in output from both models - # Note that each y is a dictionary with structure {"layer_name": flattened ouput array} - ymodel = get_ymodel_keras(keras_model, X) - _, ysim = hls_model.trace(X) - - print("Plotting difference...") - f = plt.figure() - if plot_type == "norm_diff": - f = _norm_diff(ymodel, ysim) - elif plot_type == "dist_diff": - f = _dist_diff(ymodel, ysim) - - return f +import json +import os +import shutil +import uuid +from collections import defaultdict + +import matplotlib.pyplot as plt +import numpy as np +import pandas +import seaborn as sb + +from hls4ml.model.graph import ModelGraph +from hls4ml.model.layers import GRU, LSTM, SeparableConv1D, SeparableConv2D + +try: + import qkeras + from tensorflow import keras + + __tf_profiling_enabled__ = True +except ImportError: + __tf_profiling_enabled__ = False + +try: + import torch + + __torch_profiling_enabled__ = True +except ImportError: + __torch_profiling_enabled__ = False + + +def get_unoptimized_hlsmodel(model): + from hls4ml.converters import convert_from_config + + new_config = model.config.config.copy() + new_config['HLSConfig'] = json.loads(json.dumps(new_config['HLSConfig'])) + + new_output_dir = uuid.uuid4().hex + + while os.path.exists(new_output_dir): + new_output_dir = uuid.uuid4().hex + + if 'SkipOptimizers' in new_config['HLSConfig']: + del new_config['HLSConfig']['SkipOptimizers'] + + new_config['HLSConfig']['Optimizers'] = [] + new_config['OutputDir'] = new_output_dir + + return convert_from_config(new_config), new_output_dir + + +def array_to_summary(x, fmt='boxplot'): + if fmt == 'boxplot': + y = {'med': np.median(x), 'q1': np.percentile(x, 25), 'q3': np.percentile(x, 75), 'whislo': min(x), 'whishi': max(x)} + elif fmt == 'histogram': + # Power of 2 bins covering data range + high = np.ceil(np.log2(max(x))) + 1 + low = np.floor(np.log2(min(x))) - 1 + bits = np.arange(low, high, 1) + bins = 2**bits + h, b = np.histogram(x, bins=bins) + h = h * 1.0 / float(sum(h)) # normalize + y = {'h': h, 'b': np.log2(b)} + return y + + +def boxplot(data, fmt='longform'): + if fmt == 'longform': + f = plt.figure() # figsize=(3, 3)) + hue = 'layer' if 'layer' in data.keys() else None + vp = sb.boxplot(x='x', y='weight', hue=hue, data=data[data['x'] > 0], showfliers=False) + vp.set_yticklabels(vp.get_yticklabels(), rotation=45, ha='right') + if hue is not None: + vp.get_legend().remove() + vp.set_xscale('log', base=2) + return f + elif fmt == 'summary': + from matplotlib.patches import Rectangle + + medianprops = dict(linestyle='-', color='k') + f, ax = plt.subplots(1, 1) + data.reverse() + colors = sb.color_palette("Blues", len(data)) + bp = ax.bxp(data, showfliers=False, vert=False, medianprops=medianprops) + # add colored boxes + for line, color in zip(bp['boxes'], colors): + x = line.get_xdata() + xl, xh = min(x), max(x) + y = line.get_ydata() + yl, yh = min(y), max(y) + rect = Rectangle((xl, yl), (xh - xl), (yh - yl), fill=True, color=color) + ax.add_patch(rect) + ax.set_yticklabels([d['weight'] for d in data]) + ax.set_xscale('log', 
base=2) + plt.xlabel('x') + return f + else: + return None + + +def histogram(data, fmt='longform'): + f = plt.figure() + from matplotlib.ticker import MaxNLocator + + n = len(data) if fmt == 'summary' else len(data['weight'].unique()) + colors = sb.color_palette("husl", n) + if fmt == 'longform': + for i, weight in enumerate(data['weight'].unique()): + y = array_to_summary(data[data['weight'] == weight]['x'], fmt='histogram') + plt.bar(y['b'][:-1], y['h'], width=1, fill=False, label=weight, edgecolor=colors[i]) + elif fmt == 'summary': + for i, weight in enumerate(data): + plt.bar(weight['b'][:-1], weight['h'], width=1, fill=False, label=weight['weight'], edgecolor=colors[i]) + + plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True)) + plt.xlabel('log2(x)') + plt.ylabel('frequency') + plt.legend() + return f + + +plots = {'boxplot': boxplot, 'histogram': histogram} + + +def types_boxplot(data, fmt='longform'): + from matplotlib.patches import PathPatch, Rectangle + + ax = plt.gca() + _ = plt.gcf() + # Scale the data + data['low'] = 2.0 ** data['low'] + data['high'] = 2.0 ** data['high'] + + # Plot the custom precisions + ticks = np.array([tick.get_text() for tick in plt.yticks()[1]]) + # Get the coordinates of the boxes to place the markers + if fmt == 'longform': + # seaborn adjusts the box positions slightly in groups + boxes = [c.get_extents().inverse_transformed(ax.transData) for c in ax.get_children() if isinstance(c, PathPatch)] + ys = [(box.y0 + box.y1) / 2 for box in boxes] + ys = [(y, y) for y in ys] + elif fmt == 'summary': + ys = [(y, y) for y in plt.yticks()[0]] + for _irow, row in data[data['layer'] != 'model'].iterrows(): + if row['layer'] in ticks: + iy = np.argwhere(ticks == row['layer'])[0][0] # Determine which layer in the plot + rectangle = Rectangle( + (row['low'], ys[iy][0] - 0.4), row['high'] - row['low'], 0.8, fill=True, color='grey', alpha=0.2 + ) + ax.add_patch(rectangle) + + +def types_histogram(data, fmt='longform'): + ax = plt.gca() + layers = np.array(ax.get_legend_handles_labels()[1]) + colors = sb.color_palette("husl", len(layers)) + ylim = ax.get_ylim() + for _irow, row in data[data['layer'] != 'model'].iterrows(): + if row['layer'] in layers: + col = colors[np.argwhere(layers == row['layer'])[0][0]] + plt.plot((row['low'], row['low']), ylim, '--', color=col) + plt.plot((row['high'], row['high']), ylim, '--', color=col) + + +types_plots = {'boxplot': types_boxplot, 'histogram': types_histogram} + + +def ap_fixed_WIFS(dtype): + from hls4ml.backends import VivadoBackend + + dtype = VivadoBackend.convert_precision_string(dtype) + W, I, F, S = dtype.width, dtype.integer, dtype.fractional, dtype.signed + return W, I, F, S + + +def types_hlsmodel(model): + data = {'layer': [], 'low': [], 'high': []} + # Plot the default precision + default_precision = model.config.model_precision['default'] + W, I, F, S = ap_fixed_WIFS(default_precision) + data['layer'].append('model') + data['low'].append(-F) + data['high'].append(I - 1 if S else I) + + for layer in model.get_layers(): + if isinstance(layer, GRU) or isinstance(layer, LSTM): + suffix = ['w', 'rw', 'b', 'rb'] + elif isinstance(layer, SeparableConv1D) or isinstance(layer, SeparableConv2D): + suffix = ['dw', 'pw', 'db', 'pb'] + else: + suffix = ['w', 'b'] + for iw, weight in enumerate(layer.get_weights()): + wname = f'{layer.name}/{suffix[iw]}' + T = weight.type + if T.name != 'model': + W, I, F, S = ap_fixed_WIFS(T.precision) + data['layer'].append(wname) + data['low'].append(-F) + data['high'].append(I - 1 
if S else I) + data = pandas.DataFrame(data) + return data + + +def activation_types_hlsmodel(model): + data = {'layer': [], 'low': [], 'high': []} + # Get the default precision + default_precision = model.config.model_precision['default'] + W, I, F, S = ap_fixed_WIFS(default_precision) + data['layer'].append('model') + data['low'].append(-F) + data['high'].append(I - 1 if S else I) + for layer in model.get_layers(): + T = layer.get_output_variable().type.precision + W, I, F, S = ap_fixed_WIFS(T) + data['layer'].append(layer.name) + data['low'].append(-F) + data['high'].append(I - 1 if S else I) + data = pandas.DataFrame(data) + return data + + +def weights_hlsmodel(model, fmt='longform', plot='boxplot'): + if fmt == 'longform': + data = {'x': [], 'layer': [], 'weight': []} + elif fmt == 'summary': + data = [] + + for layer in model.get_layers(): + if isinstance(layer, GRU) or isinstance(layer, LSTM): + suffix = ['w', 'rw', 'b', 'rb'] + elif isinstance(layer, SeparableConv1D) or isinstance(layer, SeparableConv2D): + suffix = ['dw', 'pw', 'db', 'pb'] + else: + suffix = ['w', 'b'] + name = layer.name + for iw, weight in enumerate(layer.get_weights()): + label = f'{name}/{suffix[iw]}' + w = weight.data.flatten() + w = abs(w[w != 0]) + n = len(w) + if n == 0: + print(f'Weights for {name} are only zeros, ignoring.') + break + if fmt == 'longform': + data['x'].extend(w.tolist()) + data['layer'].extend([name] * len(w)) + data['weight'].extend([label] * len(w)) + elif fmt == 'summary': + data.append(array_to_summary(w, fmt=plot)) + data[-1]['layer'] = name + data[-1]['weight'] = label + + if fmt == 'longform': + data = pandas.DataFrame(data) + return data + + +def _keras_batchnorm(layer): + weights = layer.get_weights() + epsilon = layer.epsilon + + gamma = weights[0] + beta = weights[1] + mean = weights[2] + var = weights[3] + + scale = gamma / np.sqrt(var + epsilon) + bias = beta - gamma * mean / np.sqrt(var + epsilon) + + return [scale, bias], ['s', 'b'] + + +def _keras_layer(layer): + return layer.get_weights(), ['w', 'b'] + + +def _keras_layernorm(layer): + weights = layer.get_weights() + + gamma = weights[0] + beta = weights[1] + + scale = gamma + bias = beta + + return [scale, bias], ['s', 'b'] + + +def _keras_lstm(layer): + return layer.get_weights(), ['w', 'u', 'b'] + + +keras_process_layer_map = defaultdict( + lambda: _keras_layer, + { + 'BatchNormalization': _keras_batchnorm, + 'QBatchNormalization': _keras_batchnorm, + 'LayerNormalization': _keras_layernorm, + 'LSTM': _keras_lstm, + 'QLSTM': _keras_lstm, + }, +) + + +def activations_hlsmodel(model, X, fmt='summary', plot='boxplot'): + if fmt == 'longform': + raise NotImplementedError + elif fmt == 'summary': + data = [] + + _, trace = model.trace(np.ascontiguousarray(X)) + + if len(trace) == 0: + raise RuntimeError("ModelGraph must have tracing on for at least 1 layer (this can be set in its config)") + + for layer in trace.keys(): + print(f" {layer}") + + if fmt == 'summary': + y = trace[layer].flatten() + y = abs(y[y != 0]) + + if len(y) == 0: + print(f'Activations for {layer} are only zeros, ignoring.') + continue + + data.append(array_to_summary(y, fmt=plot)) + data[-1]['weight'] = layer + + return data + + +def weights_keras(model, fmt='longform', plot='boxplot'): + if fmt == 'longform': + data = {'x': [], 'layer': [], 'weight': []} + elif fmt == 'summary': + data = [] + for layer in model.layers: + name = layer.name + weights, suffix = keras_process_layer_map[type(layer).__name__](layer) + + for i, w in enumerate(weights): + 
label = f'{name}/{suffix[i]}' + w = w.flatten() + w = abs(w[w != 0]) + n = len(w) + if n == 0: + print(f'Weights for {name} are only zeros, ignoring.') + break + if fmt == 'longform': + data['x'].extend(w.tolist()) + data['layer'].extend([name] * n) + data['weight'].extend([label] * n) + elif fmt == 'summary': + data.append(array_to_summary(w, fmt=plot)) + data[-1]['layer'] = name + data[-1]['weight'] = label + + if fmt == 'longform': + data = pandas.DataFrame(data) + return data + + +def activations_keras(model, X, fmt='longform', plot='boxplot'): + # test layer by layer on data + if fmt == 'longform': + # return long form pandas dataframe for + # seaborn boxplot + data = {'x': [], 'weight': []} + elif fmt == 'summary': + # return summary statistics for matplotlib.axes.Axes.bxp + # or histogram bin edges and heights + data = [] + outputs = _get_outputs( + [layer for layer in model.layers if not isinstance(layer, keras.layers.InputLayer)], X, model.input + ) + outputs = dict(zip([layer.name for layer in model.layers if not isinstance(layer, keras.layers.InputLayer)], outputs)) + for layer_name, y in outputs.items(): + print(f" {layer_name}") + y = y.flatten() + y = abs(y[y != 0]) + if len(y) == 0: + print(f'Activations for {layer_name} are only zeros, ignoring.') + continue + if fmt == 'longform': + data['x'].extend(y.tolist()) + data['weight'].extend([layer_name for i in range(len(y))]) + elif fmt == 'summary': + data.append(array_to_summary(y, fmt=plot)) + data[-1]['weight'] = layer_name + + if fmt == 'longform': + data = pandas.DataFrame(data) + return data + + +def weights_torch(model, fmt='longform', plot='boxplot'): + suffix = ['w', 'b'] + if fmt == 'longform': + data = {'x': [], 'layer': [], 'weight': []} + elif fmt == 'summary': + data = [] + for layer in model.children(): + if isinstance(layer, torch.nn.Linear): + name = layer.__class__.__name__ + weights = list(layer.parameters()) + for i, w in enumerate(weights): + label = f'{name}/{suffix[i]}' + w = weights[i].detach().numpy() + w = w.flatten() + w = abs(w[w != 0]) + n = len(w) + if n == 0: + print(f'Weights for {name} are only zeros, ignoring.') + break + if fmt == 'longform': + data['x'].extend(w.tolist()) + data['layer'].extend([name] * n) + data['weight'].extend([label] * n) + elif fmt == 'summary': + data.append(array_to_summary(w, fmt=plot)) + data[-1]['layer'] = name + data[-1]['weight'] = label + + if fmt == 'longform': + data = pandas.DataFrame(data) + return data + + +def activations_torch(model, X, fmt='longform', plot='boxplot'): + X = torch.Tensor(X) + if fmt == 'longform': + data = {'x': [], 'weight': []} + elif fmt == 'summary': + data = [] + + partial_model = torch.nn.Sequential + layers = [] + for layer in model.children(): + lname = layer.__class__.__name__ + layers.append(layer) + pm = partial_model(*layers) + print(f" {lname}") + y = pm(X).flatten().detach().numpy() + y = abs(y[y != 0]) + if len(y) == 0: + print(f'Activations for {lname} are only zeros, ignoring.') + continue + if fmt == 'longform': + data['x'].extend(y.tolist()) + data['weight'].extend([lname for _ in range(len(y))]) + elif fmt == 'summary': + data.append(array_to_summary(y, fmt=plot)) + data[-1]['weight'] = lname + + if fmt == 'longform': + data = pandas.DataFrame(data) + return data + + +def numerical(model=None, hls_model=None, X=None, plot='boxplot'): + """Perform numerical profiling of a model. + + Args: + model (optional): Keras of PyTorch model. Defaults to None. + hls_model (ModelGraph, optional): The ModelGraph to profile. 
+            Defaults to None.
+        X (ndarray, optional): Test data on which to evaluate the model to profile activations.
+            Must be formatted suitably for ``model.predict(X)``. Defaults to None.
+        plot (str, optional): The type of plot to produce. Options are: 'boxplot' (default), 'violinplot', 'histogram',
+            'FacetGrid'. Defaults to 'boxplot'.
+
+    Returns:
+        tuple: The quadruple of produced figures. First weights and biases
+        for the pre- and post-optimization models respectively,
+        then activations for the pre- and post-optimization models
+        respectively. (Optimizations are applied to a ModelGraph by hls4ml;
+        the post-optimization ModelGraph is the final model.)
+    """
+    wp, wph, ap, aph = None, None, None, None
+
+    hls_model_present = hls_model is not None and isinstance(hls_model, ModelGraph)
+    model_present = model is not None
+
+    if hls_model_present:
+        before = " (before optimization)"
+        after = " (final / after optimization)"
+        hls_model_unoptimized, tmp_output_dir = get_unoptimized_hlsmodel(hls_model)
+    else:
+        before = ""
+        after = ""
+        hls_model_unoptimized, tmp_output_dir = None, None
+
+    print("Profiling weights" + before)
+    data = None
+
+    if hls_model_present:
+        data = weights_hlsmodel(hls_model_unoptimized, fmt='summary', plot=plot)
+    elif model_present:
+        if __tf_profiling_enabled__ and isinstance(model, keras.Model):
+            data = weights_keras(model, fmt='summary', plot=plot)
+        elif __torch_profiling_enabled__ and isinstance(model, torch.nn.Sequential):
+            data = weights_torch(model, fmt='summary', plot=plot)
+
+    if data is None:
+        print("Only Keras, PyTorch (Sequential) and ModelGraph models can currently be profiled")
+
+        if hls_model_present and os.path.exists(tmp_output_dir):
+            shutil.rmtree(tmp_output_dir)
+
+        return wp, wph, ap, aph
+
+    wp = plots[plot](data, fmt='summary')  # weight plot
+
+    if hls_model_present and plot in types_plots:
+        t_data = types_hlsmodel(hls_model_unoptimized)
+        types_plots[plot](t_data, fmt='summary')
+
+    plt.title("Distribution of (non-zero) weights" + before)
+    plt.tight_layout()
+
+    if hls_model_present:
+        print("Profiling weights" + after)
+
+        data = weights_hlsmodel(hls_model, fmt='summary', plot=plot)
+        wph = plots[plot](data, fmt='summary')  # weight plot
+
+        if plot in types_plots:
+            t_data = types_hlsmodel(hls_model)
+            types_plots[plot](t_data, fmt='summary')
+
+        plt.title("Distribution of (non-zero) weights" + after)
+        plt.tight_layout()
+
+    if X is not None:
+        print("Profiling activations" + before)
+        data = None
+        if __tf_profiling_enabled__ and isinstance(model, keras.Model):
+            data = activations_keras(model, X, fmt='summary', plot=plot)
+        elif __torch_profiling_enabled__ and isinstance(model, torch.nn.Sequential):
+            data = activations_torch(model, X, fmt='summary', plot=plot)
+
+        if data is not None:
+            ap = plots[plot](data, fmt='summary')  # activation plot
+            if hls_model_present and plot in types_plots:
+                t_data = activation_types_hlsmodel(hls_model_unoptimized)
+                types_plots[plot](t_data, fmt='summary')
+            plt.title("Distribution of (non-zero) activations" + before)
+            plt.tight_layout()
+
+        if hls_model_present:
+            print("Profiling activations" + after)
+            data = activations_hlsmodel(hls_model, X, fmt='summary', plot=plot)
+            aph = plots[plot](data, fmt='summary')
+
+            t_data = activation_types_hlsmodel(hls_model)
+            types_plots[plot](t_data, fmt='summary')
+
+            plt.title("Distribution of (non-zero) activations (final / after optimization)")
+            plt.tight_layout()
+
+    if hls_model_present and os.path.exists(tmp_output_dir):
+        shutil.rmtree(tmp_output_dir)
+
+    return wp, wph, ap, aph
+
+
+#########
+# COMPARE OUTPUT IMPLEMENTATION
+#########
+def _is_ignored_layer(layer):
+    """Some layers need to be ignored during inference"""
+    if isinstance(layer, (keras.layers.InputLayer, keras.layers.Dropout)):
+        return True
+    return False
+
+
+def _get_outputs(layers, X, model_input):
+    """Get outputs of intermediate layers"""
+    partial_models = keras.models.Model(inputs=model_input, outputs=[layer.output for layer in layers])
+    y = partial_models.predict(X)
+    return y
+
+
+def get_ymodel_keras(keras_model, X):
+    """Calculate each layer's output and put them into a dictionary.
+
+    Args:
+        keras_model (keras.Model): A Keras model.
+        X (ndarray): Test data on which to evaluate the model to profile activations.
+            Must be formatted suitably for ``model.predict(X)``.
+
+    Returns:
+        dict: A dictionary in the form {"layer_name": output array of layer}.
+    """
+    ymodel = {}
+    traced_layers = []
+    layer_names = []
+    for layer in keras_model.layers:
+        if _is_ignored_layer(layer):
+            continue
+        # If the layer has activation integrated then separate them
+        # Note that if the layer is a standalone activation layer then skip this
+        name = layer.name
+        if (
+            hasattr(layer, 'activation')
+            and layer.activation is not None
+            and not isinstance(layer, (keras.layers.Activation, qkeras.qlayers.QActivation))
+            and layer.activation.__name__ != 'linear'
+        ):
+            tmp_activation = layer.activation
+            layer.activation = None
+            ymodel.update({layer.name: _get_outputs([layer], X, keras_model.input)})
+            layer.activation = tmp_activation
+            name = layer.name + f"_{tmp_activation.__name__}"
+        traced_layers.append(layer)
+        layer_names.append(name)
+    outputs = _get_outputs(traced_layers, X, keras_model.input)
+    for name, output in zip(layer_names, outputs):
+        ymodel[name] = output
+    print("Done taking outputs for Keras model.")
+    return ymodel
+
+
+def _norm_diff(ymodel, ysim):
+    """Calculate the square root of the sum of the squares of the differences"""
+    diff = {}
+
+    for key in list(ysim.keys()):
+        diff[key] = np.linalg.norm(ysim[key] - ymodel[key])
+
+    # ---Bar Plot---
+    f, ax = plt.subplots()
+    plt.bar(list(diff.keys()), list(diff.values()))
+    plt.title("Layer-by-layer output differences")
+    ax.set_ylabel('Norm of difference vector')
+    plt.xticks(rotation=90)
+    plt.tight_layout()
+    return f
+
+
+def _dist_diff(ymodel, ysim):
+    """
+    Calculate the normalized distribution of the differences of the elements
+    of the output vectors.
+    If difference >= original value then the normalized difference will be set to 1,
+    meaning "very different".
+    If difference < original value then the normalized difference is difference/original.
+    """
+
+    diff = {}
+
+    for key in list(ysim.keys()):
+        flattened_ysim = ysim[key].flatten()
+        flattened_ymodel = np.array(ymodel[key]).flatten()
+
+        diff_vector = np.absolute(flattened_ymodel - flattened_ysim)
+        abs_ymodel = np.absolute(flattened_ymodel)
+
+        # Saturate at 1 where the difference is at least as large as the reference value
+        normalized_diff = np.zeros(diff_vector.shape)
+        normalized_diff[(diff_vector >= abs_ymodel) & (abs_ymodel > 0) & (diff_vector > 0)] = 1
+
+        # Fill out the rest with difference / original
+        index = diff_vector < abs_ymodel
+        normalized_diff[index] = diff_vector[index] / abs_ymodel[index]
+
+        diff[key] = normalized_diff
+
+    # ---Box Plot---
+    f, ax = plt.subplots()
+    pos = np.array(range(len(list(diff.values())))) + 1
+    ax.boxplot(list(diff.values()), sym='k+', positions=pos)
+
+    # --formatting
+    plt.title("Layer-by-layer distribution of output differences")
+    ax.set_xticklabels(list(diff.keys()))
+    ax.set_ylabel('Normalized difference')
+    plt.xticks(rotation=90)
+    plt.tight_layout()
+
+    return f
+
+
+def compare(keras_model, hls_model, X, plot_type="dist_diff"):
+    """Compare each layer's output in the Keras and hls4ml models.
+    Note that the hls_model should not be compiled before using this.
+
+    Args:
+        keras_model: Original Keras model.
+        hls_model (ModelGraph): Converted ModelGraph, with "Trace: True" in the configuration file.
+        X (ndarray): Input tensor for the model.
+        plot_type (str, optional): Different methods to visualize the y_model and y_sim differences.
+            Possible options include:
+            - 'norm_diff': square root of the sum of the squares of the differences between each pair of output vectors.
+            - 'dist_diff': the normalized distribution of the differences of the elements between the two output vectors.
+            Defaults to "dist_diff".
+
+    Returns:
+        matplotlib figure: Plot object of the histogram depicting the difference in each layer's output.
+    """
+
+    # Take in output from both models
+    # Note that each y is a dictionary with structure {"layer_name": flattened output array}
+    ymodel = get_ymodel_keras(keras_model, X)
+    _, ysim = hls_model.trace(X)
+
+    print("Plotting difference...")
+    if plot_type == "norm_diff":
+        f = _norm_diff(ymodel, ysim)
+    elif plot_type == "dist_diff":
+        f = _dist_diff(ymodel, ysim)
+    else:
+        raise ValueError(f"Unknown plot_type: {plot_type}")
+
+    return f

From 0d96cb0e3df2cb5d6e64fe8df99e9372590e0aad Mon Sep 17 00:00:00 2001
From: Rian Brooks Flynn <112733140+rianbrooksflynn@users.noreply.github.com>
Date: Mon, 13 Jan 2025 14:44:07 -0500
Subject: [PATCH 55/55] add Vitis as another tested backend

---
 test/pytest/test_layernorm.py         | 11 ++++++-----
 test/pytest/test_layernorm_pytorch.py | 11 ++++++-----
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/test/pytest/test_layernorm.py b/test/pytest/test_layernorm.py
index f3f0a5731b..bc9290b164 100644
--- a/test/pytest/test_layernorm.py
+++ b/test/pytest/test_layernorm.py
@@ -27,12 +27,13 @@ def model():
     return model
 
 
-# Currently only Vivado in io_parallel mode is supported
-def test_layernorm(model, data):
-    config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend='Vivado')
-    output_dir = str(test_root_path / 'hls4mlprj_layernorm_Vivado_io_parallel')
+# Currently only Vivado/Vitis in io_parallel mode is supported
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis'])
+def test_layernorm(model, data, backend):
+    config = hls4ml.utils.config_from_keras_model(model, granularity='name', backend=backend)
+    output_dir = str(test_root_path / f'hls4mlprj_layernorm_{backend}_io_parallel')
     hls_model = hls4ml.converters.convert_from_keras_model(
-        model, backend='Vivado', hls_config=config, io_type='io_parallel', output_dir=output_dir
+        model, backend=backend, hls_config=config, io_type='io_parallel', output_dir=output_dir
     )
     hls_model.compile()
 
diff --git a/test/pytest/test_layernorm_pytorch.py b/test/pytest/test_layernorm_pytorch.py
index d61b0c4361..ca2c9d68a7 100644
--- a/test/pytest/test_layernorm_pytorch.py
+++ b/test/pytest/test_layernorm_pytorch.py
@@ -26,12 +26,13 @@ def model():
     return model
 
 
-# Currently only Vivado in io_parallel mode is supported
-def test_layernorm(model, data):
-    config = hls4ml.utils.config_from_pytorch_model(model, in_shape, granularity='name', backend='Vivado')
-    output_dir = str(test_root_path / 'hls4mlprj_layernorm_pytorch_Vivado_io_parallel')
+# Currently only Vivado/Vitis in io_parallel mode is supported
+@pytest.mark.parametrize('backend', ['Vivado', 'Vitis'])
+def test_layernorm(model, data, backend):
+    config = hls4ml.utils.config_from_pytorch_model(model, in_shape, granularity='name', backend=backend)
+    output_dir = str(test_root_path / f'hls4mlprj_layernorm_pytorch_{backend}_io_parallel')
     hls_model = hls4ml.converters.convert_from_pytorch_model(
-        model, backend='Vivado', hls_config=config, io_type='io_parallel', output_dir=output_dir
+        model, backend=backend, hls_config=config, io_type='io_parallel', output_dir=output_dir
     )
     hls_model.compile()
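---

Usage sketch for the profiling API touched by this series (not part of any patch; the toy model, shapes, and output directory below are illustrative assumptions — the key point is that per-layer tracing must be enabled for activation profiling and compare()):

    import numpy as np
    from tensorflow import keras

    import hls4ml
    from hls4ml.model.profiling import compare, numerical

    # A tiny stand-in model; layer names and shapes are arbitrary
    keras_model = keras.Sequential(
        [
            keras.layers.Dense(8, activation='relu', input_shape=(16,), name='fc1'),
            keras.layers.Dense(4, activation='softmax', name='fc2'),
        ]
    )
    X = np.random.rand(100, 16).astype(np.float32)

    # Enable tracing on every layer so hls_model.trace(X) returns per-layer outputs
    config = hls4ml.utils.config_from_keras_model(keras_model, granularity='name')
    for layer_cfg in config['LayerName'].values():
        layer_cfg['Trace'] = True

    hls_model = hls4ml.converters.convert_from_keras_model(
        keras_model, hls_config=config, output_dir='hls4mlprj_profiling_demo'
    )

    # Weight/activation distributions before and after hls4ml optimizations
    wp, wph, ap, aph = numerical(model=keras_model, hls_model=hls_model, X=X)

    # Layer-by-layer comparison of Keras outputs vs. HLS simulation outputs
    fig = compare(keras_model, hls_model, X, plot_type='dist_diff')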